+
+# Quickstart
With pip (Python>=3.11):
diff --git a/browser_use/__init__.py b/browser_use/__init__.py
index bac3988eb..e67fe8f4e 100644
--- a/browser_use/__init__.py
+++ b/browser_use/__init__.py
@@ -51,6 +51,7 @@ if TYPE_CHECKING:
from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.browser import BrowserSession as Browser
from browser_use.dom.service import DomService
+ from browser_use.llm import models
from browser_use.llm.anthropic.chat import ChatAnthropic
from browser_use.llm.azure.chat import ChatAzureOpenAI
from browser_use.llm.google.chat import ChatGoogle
@@ -85,6 +86,8 @@ _LAZY_IMPORTS = {
'ChatGroq': ('browser_use.llm.groq.chat', 'ChatGroq'),
'ChatAzureOpenAI': ('browser_use.llm.azure.chat', 'ChatAzureOpenAI'),
'ChatOllama': ('browser_use.llm.ollama.chat', 'ChatOllama'),
+ # LLM models module
+ 'models': ('browser_use.llm.models', None),
}
@@ -96,7 +99,11 @@ def __getattr__(name: str):
from importlib import import_module
module = import_module(module_path)
- attr = getattr(module, attr_name)
+ if attr_name is None:
+ # For modules like 'models', return the module itself
+ attr = module
+ else:
+ attr = getattr(module, attr_name)
# Cache the imported attribute in the module's globals
globals()[name] = attr
return attr
@@ -126,4 +133,6 @@ __all__ = [
'ChatOllama',
'Tools',
'Controller',
+ # LLM models module
+ 'models',
]
diff --git a/browser_use/agent/gif.py b/browser_use/agent/gif.py
index d0fc5bae5..6bbf0b86f 100644
--- a/browser_use/agent/gif.py
+++ b/browser_use/agent/gif.py
@@ -87,6 +87,8 @@ def create_history_gif(
# Try different font options in order of preference
# ArialUni is a font that comes with Office and can render most non-alphabet characters
font_options = [
+ 'PingFang',
+ 'STHeiti Medium',
'Microsoft YaHei', # 微软雅黑
'SimHei', # 黑体
'SimSun', # 宋体
diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py
index e4455186a..5a7f83136 100644
--- a/browser_use/agent/message_manager/service.py
+++ b/browser_use/agent/message_manager/service.py
@@ -17,6 +17,7 @@ from browser_use.browser.views import BrowserStateSummary
from browser_use.filesystem.file_system import FileSystem
from browser_use.llm.messages import (
BaseMessage,
+ ContentPartImageParam,
ContentPartTextParam,
SystemMessage,
)
@@ -108,6 +109,7 @@ class MessageManager:
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
include_tool_call_examples: bool = False,
include_recent_events: bool = False,
+ sample_images: list[ContentPartTextParam | ContentPartImageParam] | None = None,
):
self.task = task
self.state = state
@@ -119,6 +121,7 @@ class MessageManager:
self.vision_detail_level = vision_detail_level
self.include_tool_call_examples = include_tool_call_examples
self.include_recent_events = include_recent_events
+ self.sample_images = sample_images
assert max_history_items is None or max_history_items > 5, 'max_history_items must be None or greater than 5'
@@ -190,10 +193,10 @@ class MessageManager:
logger.debug(f'Added extracted_content to read_state_description: {action_result.extracted_content}')
if action_result.long_term_memory:
- action_results += f'Action {idx + 1}/{result_len}: {action_result.long_term_memory}\n'
+ action_results += f'{action_result.long_term_memory}\n'
logger.debug(f'Added long_term_memory to action_results: {action_result.long_term_memory}')
elif action_result.extracted_content and not action_result.include_extracted_content_only_once:
- action_results += f'Action {idx + 1}/{result_len}: {action_result.extracted_content}\n'
+ action_results += f'{action_result.extracted_content}\n'
logger.debug(f'Added extracted_content to action_results: {action_result.extracted_content}')
if action_result.error:
@@ -201,13 +204,13 @@ class MessageManager:
error_text = action_result.error[:100] + '......' + action_result.error[-100:]
else:
error_text = action_result.error
- action_results += f'Action {idx + 1}/{result_len}: {error_text}\n'
+ action_results += f'{error_text}\n'
logger.debug(f'Added error to action_results: {error_text}')
self.state.read_state_description = self.state.read_state_description.strip('\n')
if action_results:
- action_results = f'Action Results:\n{action_results}'
+ action_results = f'Result:\n{action_results}'
action_results = action_results.strip('\n') if action_results else None
# Build the history item
@@ -306,6 +309,7 @@ class MessageManager:
screenshots=screenshots,
vision_detail_level=self.vision_detail_level,
include_recent_events=self.include_recent_events,
+ sample_images=self.sample_images,
).get_user_message(use_vision)
# Set the state message with caching enabled
diff --git a/browser_use/agent/message_manager/views.py b/browser_use/agent/message_manager/views.py
index 00926abd5..2f601fd07 100644
--- a/browser_use/agent/message_manager/views.py
+++ b/browser_use/agent/message_manager/views.py
@@ -32,30 +32,28 @@ class HistoryItem(BaseModel):
def to_string(self) -> str:
"""Get string representation of the history item"""
- step_str = f'step_{self.step_number}' if self.step_number is not None else 'step_unknown'
+ step_str = 'step' if self.step_number is not None else 'step_unknown'
if self.error:
return f"""<{step_str}>
{self.error}
</{step_str}>"""
elif self.system_message:
- return f"""
-{self.system_message}
-"""
+ return '' # empty string
else:
content_parts = []
# Only include evaluation_previous_goal if it's not None/empty
if self.evaluation_previous_goal:
- content_parts.append(f'Evaluation of Previous Step: {self.evaluation_previous_goal}')
+ content_parts.append(f'{self.evaluation_previous_goal}')
# Always include memory
if self.memory:
- content_parts.append(f'Memory: {self.memory}')
+ content_parts.append(f'{self.memory}')
# Only include next_goal if it's not None/empty
if self.next_goal:
- content_parts.append(f'Next Goal: {self.next_goal}')
+ content_parts.append(f'{self.next_goal}')
if self.action_results:
content_parts.append(self.action_results)
diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py
index d1e859206..33a545fb2 100644
--- a/browser_use/agent/prompts.py
+++ b/browser_use/agent/prompts.py
@@ -93,6 +93,7 @@ class AgentMessagePrompt:
screenshots: list[str] | None = None,
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
include_recent_events: bool = False,
+ sample_images: list[ContentPartTextParam | ContentPartImageParam] | None = None,
):
self.browser_state: 'BrowserStateSummary' = browser_state_summary
self.file_system: 'FileSystem | None' = file_system
@@ -108,6 +109,7 @@ class AgentMessagePrompt:
self.screenshots = screenshots or []
self.vision_detail_level = vision_detail_level
self.include_recent_events = include_recent_events
+ self.sample_images = sample_images or []
assert self.browser_state
@observe_debug(ignore_input=True, ignore_output=True, name='_get_browser_state_description')
@@ -132,8 +134,13 @@ class AgentMessagePrompt:
pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
total_pages = pi.page_height / pi.viewport_height if pi.viewport_height > 0 else 0
current_page_position = pi.scroll_y / max(pi.page_height - pi.viewport_height, 1)
- page_info_text = f'Page info: {pi.viewport_width}x{pi.viewport_height}px viewport, {pi.page_width}x{pi.page_height}px total page size, {pages_above:.1f} pages above, {pages_below:.1f} pages below, {total_pages:.1f} total pages, at {current_page_position:.0%} of page'
-
+ page_info_text = ''
+ page_info_text += f'Viewport size: {pi.viewport_width}x{pi.viewport_height}px, Total page size: {pi.page_width}x{pi.page_height}px, '
+ page_info_text += f'{pages_above:.1f} pages above, '
+ page_info_text += f'{pages_below:.1f} pages below, '
+ page_info_text += f'{total_pages:.1f} total pages'
+ page_info_text += '\n'
+ # , at {current_page_position:.0%} of page
if elements_text != '':
if has_content_above:
if self.browser_state.page_info:
@@ -187,19 +194,23 @@ class AgentMessagePrompt:
Available tabs:
{tabs_text}
{page_info_text}
-{recent_events_text}{pdf_message}Interactive elements from top layer of the current page inside the viewport{truncated_text}:
+{recent_events_text}{pdf_message}Elements you can interact with inside the viewport{truncated_text}:
{elements_text}
"""
return browser_state
def _get_agent_state_description(self) -> str:
if self.step_info:
- step_info_description = f'Step {self.step_info.step_number + 1} of {self.step_info.max_steps} max possible steps\n'
+ step_info_description = f'Step {self.step_info.step_number + 1}. Maximum steps: {self.step_info.max_steps}\n'
else:
step_info_description = ''
+
time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
step_info_description += f'Current date and time: {time_str}'
+ time_str = datetime.now().strftime('%Y-%m-%d')
+ step_info_description += f'Current date: {time_str}'
+
_todo_contents = self.file_system.get_todo_contents() if self.file_system else ''
if not len(_todo_contents):
_todo_contents = '[Current todo.md is empty, fill it with your plan when applicable]'
@@ -240,7 +251,7 @@ Available tabs:
state_description = (
'\n'
+ (self.agent_history_description.strip('\n') if self.agent_history_description else '')
- + '\n\n'
+ + '\n\n\n'
)
state_description += '\n' + self._get_agent_state_description().strip('\n') + '\n\n'
state_description += '\n' + self._get_browser_state_description().strip('\n') + '\n\n'
@@ -258,6 +269,9 @@ Available tabs:
# Start with text description
content_parts: list[ContentPartTextParam | ContentPartImageParam] = [ContentPartTextParam(text=state_description)]
+ # Add sample images
+ content_parts.extend(self.sample_images)
+
# Add screenshots with labels
for i, screenshot in enumerate(self.screenshots):
if i == len(self.screenshots) - 1:
diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py
index 2f90a0377..67271cb5c 100644
--- a/browser_use/agent/service.py
+++ b/browser_use/agent/service.py
@@ -4,7 +4,6 @@ import inspect
import json
import logging
import re
-import sys
import tempfile
import time
from collections.abc import Awaitable, Callable
@@ -24,14 +23,14 @@ from browser_use.agent.cloud_events import (
)
from browser_use.agent.message_manager.utils import save_conversation
from browser_use.llm.base import BaseChatModel
-from browser_use.llm.messages import BaseMessage, UserMessage
+from browser_use.llm.messages import BaseMessage, ContentPartImageParam, ContentPartTextParam, UserMessage
from browser_use.llm.openai.chat import ChatOpenAI
from browser_use.tokens.service import TokenCost
load_dotenv()
from bubus import EventBus
-from pydantic import ValidationError
+from pydantic import BaseModel, ValidationError
from uuid_extensions import uuid7str
from browser_use import Browser, BrowserProfile, BrowserSession
@@ -67,6 +66,7 @@ from browser_use.telemetry.views import AgentTelemetryEvent
from browser_use.tools.registry.views import ActionModel
from browser_use.tools.service import Tools
from browser_use.utils import (
+ URL_PATTERN,
_log_pretty_path,
get_browser_use_version,
get_git_info,
@@ -128,7 +128,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
def __init__(
self,
task: str,
- llm: BaseChatModel = ChatOpenAI(model='gpt-4.1-mini'),
+ llm: BaseChatModel | None = None,
# Optional parameters
browser_profile: BrowserProfile | None = None,
browser_session: BrowserSession | None = None,
@@ -179,8 +179,28 @@ class Agent(Generic[Context, AgentStructuredOutput]):
step_timeout: int = 120,
directly_open_url: bool = True,
include_recent_events: bool = False,
+ sample_images: list[ContentPartTextParam | ContentPartImageParam] | None = None,
+ final_response_after_failure: bool = True,
+ _url_shortening_limit: int = 25,
**kwargs,
):
+ if llm is None:
+ default_llm_name = CONFIG.DEFAULT_LLM
+ if default_llm_name:
+ try:
+ from browser_use.llm.models import get_llm_by_name
+
+ llm = get_llm_by_name(default_llm_name)
+ except (ImportError, ValueError) as e:
+ # Use the logger that's already imported at the top of the module
+ logger.warning(
+ f'Failed to create default LLM "{default_llm_name}": {e}. Falling back to ChatOpenAI(model="gpt-4.1-mini")'
+ )
+ llm = ChatOpenAI(model='gpt-4.1-mini')
+ else:
+ # No default LLM specified, use the original default
+ llm = ChatOpenAI(model='gpt-4.1-mini')
+
if page_extraction_llm is None:
page_extraction_llm = llm
if available_file_paths is None:
@@ -210,6 +230,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.llm = llm
self.directly_open_url = directly_open_url
self.include_recent_events = include_recent_events
+ self._url_shortening_limit = _url_shortening_limit
if tools is not None:
self.tools = tools
elif controller is not None:
@@ -224,6 +245,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.sensitive_data = sensitive_data
+ self.sample_images = sample_images
+
self.settings = AgentSettings(
use_vision=use_vision,
vision_detail_level=vision_detail_level,
@@ -243,6 +266,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
include_tool_call_examples=include_tool_call_examples,
llm_timeout=llm_timeout,
step_timeout=step_timeout,
+ final_response_after_failure=final_response_after_failure,
)
# Token cost service
@@ -297,7 +321,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.warning('⚠️ XAI models do not support use_vision=True yet. Setting use_vision=False for now...')
self.settings.use_vision = False
- self.logger.info(f'🧠 Starting a browser-use version {self.version} with model={self.llm.model}')
logger.debug(
f'{" +vision" if self.settings.use_vision else ""}'
f' extraction_model={self.settings.page_extraction_llm.model if self.settings.page_extraction_llm else "Unknown"}'
@@ -330,6 +353,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
vision_detail_level=self.settings.vision_detail_level,
include_tool_call_examples=self.settings.include_tool_call_examples,
include_recent_events=self.include_recent_events,
+ sample_images=self.sample_images,
)
if self.sensitive_data:
@@ -339,23 +363,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# If no allowed_domains are configured, show a security warning
if not self.browser_profile.allowed_domains:
self.logger.error(
- '⚠️⚠️⚠️ Agent(sensitive_data=••••••••) was provided but BrowserSession(allowed_domains=[...]) is not locked down! ⚠️⚠️⚠️\n'
+ '⚠️ Agent(sensitive_data=••••••••) was provided but Browser(allowed_domains=[...]) is not locked down! ⚠️\n'
' ☠️ If the agent visits a malicious website and encounters a prompt-injection attack, your sensitive_data may be exposed!\n\n'
- ' https://docs.browser-use.com/customize/browser-settings#restrict-urls\n'
- 'Waiting 10 seconds before continuing... Press [Ctrl+C] to abort.'
- )
- if sys.stdin.isatty():
- try:
- time.sleep(10)
- except KeyboardInterrupt:
- print(
- '\n\n 🛑 Exiting now... set BrowserSession(allowed_domains=["example.com", "example.org"]) to only domains you trust to see your sensitive_data.'
- )
- sys.exit(0)
- else:
- pass # no point waiting if we're not in an interactive shell
- self.logger.warning(
- '‼️ Continuing with insecure settings for now... but this will become a hard error in the future!'
+ ' \n'
)
# If we're using domain-specific credentials, validate domain patterns
@@ -426,6 +436,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self._last_known_downloads: list[str] = []
self.logger.debug('📁 Initialized download tracking for agent')
+ # Event-based pause control (kept out of AgentState for serialization)
self._external_pause_event = asyncio.Event()
self._external_pause_event.set()
@@ -606,8 +617,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if await self.register_external_agent_status_raise_error_callback():
raise InterruptedError
- if self.state.stopped or self.state.paused:
- # self.logger.debug('Agent paused after getting state')
+ if self.state.stopped:
+ raise InterruptedError
+
+ if self.state.paused:
raise InterruptedError
@observe(name='agent.step', ignore_output=True, ignore_input=True)
@@ -615,6 +628,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
async def step(self, step_info: AgentStepInfo | None = None) -> None:
"""Execute one step of the task"""
# Initialize timing first, before any exceptions can occur
+
self.step_start_time = time.time()
browser_state_summary = None
@@ -682,7 +696,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
available_file_paths=self.available_file_paths, # Always pass current available_file_paths
)
- await self._handle_final_step(step_info)
+ await self._force_done_after_last_step(step_info)
+ await self._force_done_after_failure()
return browser_state_summary
@observe_debug(ignore_input=True, name='get_next_action')
@@ -768,7 +783,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Handle all other exceptions
include_trace = self.logger.isEnabledFor(logging.DEBUG)
error_msg = AgentError.format_error(error, include_trace=include_trace)
- prefix = f'❌ Result failed {self.state.consecutive_failures + 1}/{self.settings.max_failures} times:\n '
+ prefix = f'❌ Result failed {self.state.consecutive_failures + 1}/{self.settings.max_failures + int(self.settings.final_response_after_failure)} times:\n '
self.state.consecutive_failures += 1
# Handle InterruptedError specially
@@ -833,7 +848,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Increment step counter after step is fully completed
self.state.n_steps += 1
- async def _handle_final_step(self, step_info: AgentStepInfo | None = None) -> None:
+ async def _force_done_after_last_step(self, step_info: AgentStepInfo | None = None) -> None:
"""Handle special processing for the last step"""
if step_info and step_info.is_last_step():
# Add last step warning if needed
@@ -845,6 +860,19 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self._message_manager._add_context_message(UserMessage(content=msg))
self.AgentOutput = self.DoneAgentOutput
+ async def _force_done_after_failure(self) -> None:
+ """Force done after failure"""
+ # Create recovery message
+ if self.state.consecutive_failures >= self.settings.max_failures and self.settings.final_response_after_failure:
+ msg = f'You have failed {self.settings.max_failures} consecutive times. This is your final step to complete the task or provide what you found. '
+ msg += 'Use only the "done" action now. No other actions - so here your action sequence must have length 1.'
+ msg += '\nIf the task could not be completed due to the failures, set success in "done" to false!'
+ msg += '\nInclude everything you found out for the task in the done text.'
+
+ self.logger.debug('Force done action, because we reached max_failures.')
+ self._message_manager._add_context_message(UserMessage(content=msg))
+ self.AgentOutput = self.DoneAgentOutput
+
async def _get_model_output_with_retry(self, input_messages: list[BaseMessage]) -> AgentOutput:
"""Get model output with retry logic for empty actions"""
model_output = await self.get_model_output(input_messages)
@@ -965,15 +993,172 @@ class Agent(Generic[Context, AgentStructuredOutput]):
text = re.sub(STRAY_CLOSE_TAG, '', text)
return text.strip()
+ # region - URL replacement
+ def _replace_urls_in_text(self, text: str) -> tuple[str, dict[str, str]]:
+ """Replace URLs in a text string"""
+
+ replaced_urls: dict[str, str] = {}
+
+ def replace_url(match: re.Match) -> str:
+			"""Shorten this URL match when its query/fragment tail exceeds the shortening limit; a URL has at most one query and one fragment."""
+ import hashlib
+
+ original_url = match.group(0)
+
+ # Find where the query/fragment starts
+ query_start = original_url.find('?')
+ fragment_start = original_url.find('#')
+
+ # Find the earliest position of query or fragment
+ after_path_start = len(original_url) # Default: no query/fragment
+ if query_start != -1:
+ after_path_start = min(after_path_start, query_start)
+ if fragment_start != -1:
+ after_path_start = min(after_path_start, fragment_start)
+
+ # Split URL into base (up to path) and after_path (query + fragment)
+ base_url = original_url[:after_path_start]
+ after_path = original_url[after_path_start:]
+
+ # If after_path is within the limit, don't shorten
+ if len(after_path) <= self._url_shortening_limit:
+ return original_url
+
+ # If after_path is too long, truncate and add hash
+ if after_path:
+ truncated_after_path = after_path[: self._url_shortening_limit]
+ # Create a short hash of the full after_path content
+ hash_obj = hashlib.md5(after_path.encode('utf-8'))
+ short_hash = hash_obj.hexdigest()[:7]
+ # Create shortened URL
+ shortened = f'{base_url}{truncated_after_path}...{short_hash}'
+ # Only use shortened URL if it's actually shorter than the original
+ if len(shortened) < len(original_url):
+ replaced_urls[shortened] = original_url
+ return shortened
+
+ return original_url
+
+ return URL_PATTERN.sub(replace_url, text), replaced_urls
+
+ def _process_messsages_and_replace_long_urls_shorter_ones(self, input_messages: list[BaseMessage]) -> dict[str, str]:
+ """Replace long URLs with shorter ones
+		NOTE: edits input_messages in place.
+
+		returns:
+			dict of urls we replaced {shorter_url: original_url}
+ """
+ from browser_use.llm.messages import AssistantMessage, UserMessage
+
+ urls_replaced: dict[str, str] = {}
+
+ # Process each message, in place
+ for message in input_messages:
+ # no need to process SystemMessage, we have control over that anyway
+ if isinstance(message, (UserMessage, AssistantMessage)):
+ if isinstance(message.content, str):
+ # Simple string content
+ message.content, replaced_urls = self._replace_urls_in_text(message.content)
+ urls_replaced.update(replaced_urls)
+
+ elif isinstance(message.content, list):
+ # List of content parts
+ for part in message.content:
+ if isinstance(part, ContentPartTextParam):
+ part.text, replaced_urls = self._replace_urls_in_text(part.text)
+ urls_replaced.update(replaced_urls)
+
+ return urls_replaced
+
+ @staticmethod
+ def _recursive_process_all_strings_inside_pydantic_model(model: BaseModel, url_replacements: dict[str, str]) -> None:
+ """Recursively process all strings inside a Pydantic model, replacing shortened URLs with originals in place."""
+ for field_name, field_value in model.__dict__.items():
+ if isinstance(field_value, str):
+ # Replace shortened URLs with original URLs in string
+ processed_string = Agent._replace_shortened_urls_in_string(field_value, url_replacements)
+ setattr(model, field_name, processed_string)
+ elif isinstance(field_value, BaseModel):
+ # Recursively process nested Pydantic models
+ Agent._recursive_process_all_strings_inside_pydantic_model(field_value, url_replacements)
+ elif isinstance(field_value, dict):
+ # Process dictionary values in place
+ Agent._recursive_process_dict(field_value, url_replacements)
+ elif isinstance(field_value, (list, tuple)):
+ processed_value = Agent._recursive_process_list_or_tuple(field_value, url_replacements)
+ setattr(model, field_name, processed_value)
+
+ @staticmethod
+ def _recursive_process_dict(dictionary: dict, url_replacements: dict[str, str]) -> None:
+ """Helper method to process dictionaries."""
+ for k, v in dictionary.items():
+ if isinstance(v, str):
+ dictionary[k] = Agent._replace_shortened_urls_in_string(v, url_replacements)
+ elif isinstance(v, BaseModel):
+ Agent._recursive_process_all_strings_inside_pydantic_model(v, url_replacements)
+ elif isinstance(v, dict):
+ Agent._recursive_process_dict(v, url_replacements)
+ elif isinstance(v, (list, tuple)):
+ dictionary[k] = Agent._recursive_process_list_or_tuple(v, url_replacements)
+
+ @staticmethod
+ def _recursive_process_list_or_tuple(container: list | tuple, url_replacements: dict[str, str]) -> list | tuple:
+ """Helper method to process lists and tuples."""
+ if isinstance(container, tuple):
+ # For tuples, create a new tuple with processed items
+ processed_items = []
+ for item in container:
+ if isinstance(item, str):
+ processed_items.append(Agent._replace_shortened_urls_in_string(item, url_replacements))
+ elif isinstance(item, BaseModel):
+ Agent._recursive_process_all_strings_inside_pydantic_model(item, url_replacements)
+ processed_items.append(item)
+ elif isinstance(item, dict):
+ Agent._recursive_process_dict(item, url_replacements)
+ processed_items.append(item)
+ elif isinstance(item, (list, tuple)):
+ processed_items.append(Agent._recursive_process_list_or_tuple(item, url_replacements))
+ else:
+ processed_items.append(item)
+ return tuple(processed_items)
+ else:
+ # For lists, modify in place
+ for i, item in enumerate(container):
+ if isinstance(item, str):
+ container[i] = Agent._replace_shortened_urls_in_string(item, url_replacements)
+ elif isinstance(item, BaseModel):
+ Agent._recursive_process_all_strings_inside_pydantic_model(item, url_replacements)
+ elif isinstance(item, dict):
+ Agent._recursive_process_dict(item, url_replacements)
+ elif isinstance(item, (list, tuple)):
+ container[i] = Agent._recursive_process_list_or_tuple(item, url_replacements)
+ return container
+
+ @staticmethod
+ def _replace_shortened_urls_in_string(text: str, url_replacements: dict[str, str]) -> str:
+ """Replace all shortened URLs in a string with their original URLs."""
+ result = text
+ for shortened_url, original_url in url_replacements.items():
+ result = result.replace(shortened_url, original_url)
+ return result
+
+ # endregion - URL replacement
+
@time_execution_async('--get_next_action')
@observe_debug(ignore_input=True, ignore_output=True, name='get_model_output')
async def get_model_output(self, input_messages: list[BaseMessage]) -> AgentOutput:
"""Get next action from LLM based on current state"""
+ urls_replaced = self._process_messsages_and_replace_long_urls_shorter_ones(input_messages)
+
try:
response = await self.llm.ainvoke(input_messages, output_format=self.AgentOutput)
parsed = response.completion
+ # Replace any shortened URLs in the LLM response back to original URLs
+ if urls_replaced:
+ self._recursive_process_all_strings_inside_pydantic_model(parsed, urls_replaced)
+
# cut the number of actions to max_actions_per_step if needed
if len(parsed.action) > self.settings.max_actions_per_step:
parsed.action = parsed.action[: self.settings.max_actions_per_step]
@@ -994,6 +1179,11 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.debug(f'🤖 Browser-Use Library Version {self.version} ({self.source})')
+ def _log_first_step_startup(self) -> None:
+ """Log startup message only on the first step"""
+ if len(self.history.history) == 0:
+ self.logger.info(f'🧠 Starting a browser-use version {self.version} with model={self.llm.model}')
+
def _log_step_context(self, browser_state_summary: BrowserStateSummary) -> None:
"""Log step context information"""
url = browser_state_summary.url if browser_state_summary else ''
@@ -1122,6 +1312,11 @@ class Agent(Generic[Context, AgentStructuredOutput]):
Returns:
Tuple[bool, bool]: (is_done, is_valid)
"""
+ if len(self.history.history) == 0:
+ # First step
+ self._log_first_step_startup()
+ await self._execute_initial_actions()
+
await self.step(step_info)
if self.history.is_done():
@@ -1250,17 +1445,21 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.warning('⚠️ No browser focus established, may cause navigation issues')
await self._execute_initial_actions()
+ # Log startup message on first step (only if we haven't already done steps)
+ self._log_first_step_startup()
self.logger.debug(f'🔄 Starting main execution loop with max {max_steps} steps...')
for step in range(max_steps):
- # Replace the polling with clean pause-wait
+ # Use the consolidated pause state management
if self.state.paused:
self.logger.debug(f'⏸️ Step {step}: Agent paused, waiting to resume...')
- await self.wait_until_resumed()
+ await self._external_pause_event.wait()
signal_handler.reset()
- # Check if we should stop due to too many failures
- if self.state.consecutive_failures >= self.settings.max_failures:
+			# Check whether to stop due to too many consecutive failures; when final_response_after_failure is True, allow one extra step so the agent can produce a final "done" response
+ if (self.state.consecutive_failures) >= self.settings.max_failures + int(
+ self.settings.final_response_after_failure
+ ):
self.logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
agent_run_error = f'Stopped due to {self.settings.max_failures} consecutive failures'
break
@@ -1271,12 +1470,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
agent_run_error = 'Agent stopped programmatically'
break
- while self.state.paused:
- await asyncio.sleep(0.5) # Small delay to prevent CPU spinning
- if self.state.stopped: # Allow stopping while paused
- agent_run_error = 'Agent stopped programmatically while paused'
- break
-
if on_step_start is not None:
await on_step_start(self)
@@ -1476,7 +1669,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if orig_target_hash != new_target_hash:
# Get names of remaining actions that won't be executed
remaining_actions_str = get_remaining_actions_str(actions, i)
- msg = f'Page changed after action {i} / {total_actions}: actions {remaining_actions_str} were not executed'
+ msg = f'Page changed after action: actions {remaining_actions_str} are not yet executed'
logger.info(msg)
results.append(
ActionResult(
@@ -1716,39 +1909,28 @@ class Agent(Generic[Context, AgentStructuredOutput]):
file_path = 'AgentHistory.json'
self.history.save_to_file(file_path)
- async def wait_until_resumed(self):
- await self._external_pause_event.wait()
-
def pause(self) -> None:
"""Pause the agent before the next step"""
- print(
- '\n\n⏸️ Got [Ctrl+C], paused the agent and left the browser open.\n\tPress [Enter] to resume or [Ctrl+C] again to quit.'
- )
+ print('\n\n⏸️ Paused the agent and left the browser open.\n\tPress [Enter] to resume or [Ctrl+C] again to quit.')
self.state.paused = True
self._external_pause_event.clear()
- # Task paused
-
- # The signal handler will handle the asyncio pause logic for us
- # No need to duplicate the code here
-
def resume(self) -> None:
"""Resume the agent"""
+		# TODO: investigate — in local runs the browser was observed to be closed by the time resume() was called
print('----------------------------------------------------------------------')
- print('▶️ Got Enter, resuming agent execution where it left off...\n')
+ print('▶️ Resuming agent execution where it left off...\n')
self.state.paused = False
self._external_pause_event.set()
- # Task resumed
-
- # The signal handler should have already reset the flags
- # through its reset() method when called from run()
-
def stop(self) -> None:
"""Stop the agent"""
self.logger.info('⏹️ Agent stopping')
self.state.stopped = True
+ # Signal pause event to unblock any waiting code so it can check the stopped state
+ self._external_pause_event.set()
+
# Task stopped
def _convert_initial_actions(self, actions: list[dict[str, dict[str, Any]]]) -> list[ActionModel]:
diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
index eee392c3d..1b59ac61c 100644
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -61,7 +61,7 @@ Examples:
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
+- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list.
- Pure text elements without [] are not interactive.
diff --git a/browser_use/agent/system_prompt_flash.md b/browser_use/agent/system_prompt_flash.md
index aaf190953..ae1e5eaf6 100644
--- a/browser_use/agent/system_prompt_flash.md
+++ b/browser_use/agent/system_prompt_flash.md
@@ -59,7 +59,7 @@ Examples:
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
+- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list.
- Pure text elements without [] are not interactive.
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md
index cd15a06c4..a2ae0c556 100644
--- a/browser_use/agent/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompt_no_thinking.md
@@ -61,7 +61,7 @@ Examples:
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
+- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list.
- Pure text elements without [] are not interactive.
diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py
index 9efe9528a..89d89cb2d 100644
--- a/browser_use/agent/views.py
+++ b/browser_use/agent/views.py
@@ -39,7 +39,7 @@ class AgentSettings(BaseModel):
override_system_message: str | None = None
extend_system_message: str | None = None
include_attributes: list[str] | None = DEFAULT_INCLUDE_ATTRIBUTES
- max_actions_per_step: int = 10
+ max_actions_per_step: int = 4
use_thinking: bool = True
flash_mode: bool = False # If enabled, disables evaluation_previous_goal and next_goal, and sets use_thinking = False
max_history_items: int | None = None
@@ -49,17 +49,22 @@ class AgentSettings(BaseModel):
include_tool_call_examples: bool = False
llm_timeout: int = 60 # Timeout in seconds for LLM calls
step_timeout: int = 180 # Timeout in seconds for each step
+ final_response_after_failure: bool = True # If True, attempt one final recovery call after max_failures
class AgentState(BaseModel):
"""Holds all state information for an Agent"""
+ model_config = ConfigDict(arbitrary_types_allowed=True)
+
agent_id: str = Field(default_factory=uuid7str)
n_steps: int = 1
consecutive_failures: int = 0
last_result: list[ActionResult] | None = None
last_plan: str | None = None
last_model_output: AgentOutput | None = None
+
+ # Pause/resume state (kept serialisable for checkpointing)
paused: bool = False
stopped: bool = False
session_initialized: bool = False # Track if session events have been dispatched
@@ -68,9 +73,6 @@ class AgentState(BaseModel):
message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState)
file_system_state: FileSystemState | None = None
- # class Config:
- # arbitrary_types_allowed = True
-
@dataclass
class AgentStepInfo:
diff --git a/browser_use/browser/events.py b/browser_use/browser/events.py
index 24c80aee5..f870770f1 100644
--- a/browser_use/browser/events.py
+++ b/browser_use/browser/events.py
@@ -1,6 +1,7 @@
"""Event definitions for browser communication."""
import inspect
+import os
from typing import Any, Literal
from bubus import BaseEvent
@@ -11,6 +12,37 @@ from pydantic import BaseModel, Field, field_validator
from browser_use.browser.views import BrowserStateSummary
from browser_use.dom.views import EnhancedDOMTreeNode
+
+def _get_timeout(env_var: str, default: float) -> float | None:
+ """
+ Safely parse environment variable timeout values with robust error handling.
+
+ Args:
+ env_var: Environment variable name (e.g. 'TIMEOUT_NavigateToUrlEvent')
+ default: Default timeout value as float (e.g. 15.0)
+
+ Returns:
+ Parsed float value or the default if parsing fails
+
+	Note:
+		Never raises: invalid or negative values fall back to the provided default.
+ """
+ # Try environment variable first
+ env_value = os.getenv(env_var)
+ if env_value:
+ try:
+ parsed = float(env_value)
+ if parsed < 0:
+ print(f'Warning: {env_var}={env_value} is negative, using default {default}')
+ return default
+ return parsed
+ except (ValueError, TypeError):
+ print(f'Warning: {env_var}={env_value} is not a valid number, using default {default}')
+
+ # Fall back to default
+ return default
+
+
# ============================================================================
# Agent/Tools -> BrowserSession Events (High-level browser actions)
# ============================================================================
@@ -88,7 +120,7 @@ class NavigateToUrlEvent(BaseEvent[None]):
# existing_tab: PageHandle | None = None # TODO
# time limits enforced by bubus, not exposed to LLM:
- event_timeout: float | None = 15.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_NavigateToUrlEvent', 15.0) # seconds
class ClickElementEvent(ElementSelectedEvent[dict[str, Any] | None]):
@@ -103,7 +135,7 @@ class ClickElementEvent(ElementSelectedEvent[dict[str, Any] | None]):
# click_count: int = 1 # TODO
# expect_download: bool = False # moved to downloads_watchdog.py
- event_timeout: float | None = 15.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_ClickElementEvent', 15.0) # seconds
class TypeTextEvent(ElementSelectedEvent[dict | None]):
@@ -113,7 +145,7 @@ class TypeTextEvent(ElementSelectedEvent[dict | None]):
text: str
clear_existing: bool = True
- event_timeout: float | None = 15.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_TypeTextEvent', 15.0) # seconds
class ScrollEvent(ElementSelectedEvent[None]):
@@ -123,7 +155,7 @@ class ScrollEvent(ElementSelectedEvent[None]):
amount: int # pixels
node: 'EnhancedDOMTreeNode | None' = None # None means scroll page
- event_timeout: float | None = 8.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_ScrollEvent', 8.0) # seconds
class SwitchTabEvent(BaseEvent[TargetID]):
@@ -131,7 +163,7 @@ class SwitchTabEvent(BaseEvent[TargetID]):
target_id: TargetID | None = Field(default=None, description='None means switch to the most recently opened tab')
- event_timeout: float | None = 10.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_SwitchTabEvent', 10.0) # seconds
class CloseTabEvent(BaseEvent[None]):
@@ -139,7 +171,7 @@ class CloseTabEvent(BaseEvent[None]):
target_id: TargetID
- event_timeout: float | None = 10.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_CloseTabEvent', 10.0) # seconds
class ScreenshotEvent(BaseEvent[str]):
@@ -148,7 +180,7 @@ class ScreenshotEvent(BaseEvent[str]):
full_page: bool = False
clip: dict[str, float] | None = None # {x, y, width, height}
- event_timeout: float | None = 8.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_ScreenshotEvent', 8.0) # seconds
class BrowserStateRequestEvent(BaseEvent[BrowserStateSummary]):
@@ -159,7 +191,7 @@ class BrowserStateRequestEvent(BaseEvent[BrowserStateSummary]):
cache_clickable_elements_hashes: bool = True
include_recent_events: bool = False
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_BrowserStateRequestEvent', 30.0) # seconds
# class WaitForConditionEvent(BaseEvent):
@@ -174,19 +206,19 @@ class BrowserStateRequestEvent(BaseEvent[BrowserStateSummary]):
class GoBackEvent(BaseEvent[None]):
"""Navigate back in browser history."""
- event_timeout: float | None = 15.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_GoBackEvent', 15.0) # seconds
class GoForwardEvent(BaseEvent[None]):
"""Navigate forward in browser history."""
- event_timeout: float | None = 15.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_GoForwardEvent', 15.0) # seconds
class RefreshEvent(BaseEvent[None]):
"""Refresh/reload the current page."""
- event_timeout: float | None = 15.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_RefreshEvent', 15.0) # seconds
class WaitEvent(BaseEvent[None]):
@@ -195,7 +227,7 @@ class WaitEvent(BaseEvent[None]):
seconds: float = 3.0
max_seconds: float = 10.0 # Safety cap
- event_timeout: float | None = 60.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_WaitEvent', 60.0) # seconds
class SendKeysEvent(BaseEvent[None]):
@@ -203,7 +235,7 @@ class SendKeysEvent(BaseEvent[None]):
keys: str # e.g., "ctrl+a", "cmd+c", "Enter"
- event_timeout: float | None = 15.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_SendKeysEvent', 15.0) # seconds
class UploadFileEvent(ElementSelectedEvent[None]):
@@ -212,7 +244,7 @@ class UploadFileEvent(ElementSelectedEvent[None]):
node: 'EnhancedDOMTreeNode'
file_path: str
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_UploadFileEvent', 30.0) # seconds
class GetDropdownOptionsEvent(ElementSelectedEvent[dict[str, str]]):
@@ -222,9 +254,10 @@ class GetDropdownOptionsEvent(ElementSelectedEvent[dict[str, str]]):
node: 'EnhancedDOMTreeNode'
- event_timeout: float | None = (
- 15.0 # some dropdowns lazy-load the list of options on first interaction, so we need to wait for them to load (e.g. table filter lists can have thousands of options)
- )
+ event_timeout: float | None = _get_timeout(
+ 'TIMEOUT_GetDropdownOptionsEvent',
+ 15.0,
+ ) # some dropdowns lazy-load the list of options on first interaction, so we need to wait for them to load (e.g. table filter lists can have thousands of options)
class SelectDropdownOptionEvent(ElementSelectedEvent[dict[str, str]]):
@@ -235,7 +268,7 @@ class SelectDropdownOptionEvent(ElementSelectedEvent[dict[str, str]]):
node: 'EnhancedDOMTreeNode'
text: str # The option text to select
- event_timeout: float | None = 8.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_SelectDropdownOptionEvent', 8.0) # seconds
class ScrollToTextEvent(BaseEvent[None]):
@@ -244,7 +277,7 @@ class ScrollToTextEvent(BaseEvent[None]):
text: str
direction: Literal['up', 'down'] = 'down'
- event_timeout: float | None = 15.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_ScrollToTextEvent', 15.0) # seconds
# ============================================================================
@@ -256,7 +289,7 @@ class BrowserStartEvent(BaseEvent):
cdp_url: str | None = None
launch_options: dict[str, Any] = Field(default_factory=dict)
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_BrowserStartEvent', 30.0) # seconds
class BrowserStopEvent(BaseEvent):
@@ -264,7 +297,7 @@ class BrowserStopEvent(BaseEvent):
force: bool = False
- event_timeout: float | None = 45.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_BrowserStopEvent', 45.0) # seconds
class BrowserLaunchResult(BaseModel):
@@ -279,13 +312,13 @@ class BrowserLaunchEvent(BaseEvent[BrowserLaunchResult]):
# TODO: add executable_path, proxy settings, preferences, extra launch args, etc.
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_BrowserLaunchEvent', 30.0) # seconds
class BrowserKillEvent(BaseEvent):
"""Kill local browser subprocess."""
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_BrowserKillEvent', 30.0) # seconds
# TODO: replace all Runtime.evaluate() calls with this event
@@ -338,7 +371,7 @@ class BrowserConnectedEvent(BaseEvent):
cdp_url: str
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_BrowserConnectedEvent', 30.0) # seconds
class BrowserStoppedEvent(BaseEvent):
@@ -346,7 +379,7 @@ class BrowserStoppedEvent(BaseEvent):
reason: str | None = None
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_BrowserStoppedEvent', 30.0) # seconds
class TabCreatedEvent(BaseEvent):
@@ -355,7 +388,7 @@ class TabCreatedEvent(BaseEvent):
target_id: TargetID
url: str
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_TabCreatedEvent', 30.0) # seconds
class TabClosedEvent(BaseEvent):
@@ -367,7 +400,7 @@ class TabClosedEvent(BaseEvent):
# new_focus_target_id: int | None = None
# new_focus_url: str | None = None
- event_timeout: float | None = 10.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_TabClosedEvent', 10.0) # seconds
# TODO: emit this when DOM changes significantly, inner frame navigates, form submits, history.pushState(), etc.
@@ -384,7 +417,7 @@ class AgentFocusChangedEvent(BaseEvent):
target_id: TargetID
url: str
- event_timeout: float | None = 10.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_AgentFocusChangedEvent', 10.0) # seconds
class TargetCrashedEvent(BaseEvent):
@@ -393,7 +426,7 @@ class TargetCrashedEvent(BaseEvent):
target_id: TargetID
error: str
- event_timeout: float | None = 10.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_TargetCrashedEvent', 10.0) # seconds
class NavigationStartedEvent(BaseEvent):
@@ -402,7 +435,7 @@ class NavigationStartedEvent(BaseEvent):
target_id: TargetID
url: str
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_NavigationStartedEvent', 30.0) # seconds
class NavigationCompleteEvent(BaseEvent):
@@ -414,7 +447,7 @@ class NavigationCompleteEvent(BaseEvent):
error_message: str | None = None # Error/timeout message if navigation had issues
loading_status: str | None = None # Detailed loading status (e.g., network timeout info)
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_NavigationCompleteEvent', 30.0) # seconds
# ============================================================================
@@ -429,7 +462,7 @@ class BrowserErrorEvent(BaseEvent):
message: str
details: dict[str, Any] = Field(default_factory=dict)
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_BrowserErrorEvent', 30.0) # seconds
# ============================================================================
@@ -442,7 +475,7 @@ class SaveStorageStateEvent(BaseEvent):
path: str | None = None # Optional path, uses profile default if not provided
- event_timeout: float | None = 45.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_SaveStorageStateEvent', 45.0) # seconds
class StorageStateSavedEvent(BaseEvent):
@@ -452,7 +485,7 @@ class StorageStateSavedEvent(BaseEvent):
cookies_count: int
origins_count: int
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_StorageStateSavedEvent', 30.0) # seconds
class LoadStorageStateEvent(BaseEvent):
@@ -460,7 +493,7 @@ class LoadStorageStateEvent(BaseEvent):
path: str | None = None # Optional path, uses profile default if not provided
- event_timeout: float | None = 45.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_LoadStorageStateEvent', 45.0) # seconds
# TODO: refactor this to:
@@ -474,7 +507,7 @@ class StorageStateLoadedEvent(BaseEvent):
cookies_count: int
origins_count: int
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_StorageStateLoadedEvent', 30.0) # seconds
# ============================================================================
@@ -494,7 +527,7 @@ class FileDownloadedEvent(BaseEvent):
from_cache: bool = False
auto_download: bool = False # Whether this was an automatic download (e.g., PDF auto-download)
- event_timeout: float | None = 30.0 # seconds
+ event_timeout: float | None = _get_timeout('TIMEOUT_FileDownloadedEvent', 30.0) # seconds
class AboutBlankDVDScreensaverShownEvent(BaseEvent):
@@ -510,7 +543,7 @@ class DialogOpenedEvent(BaseEvent):
dialog_type: str # 'alert', 'confirm', 'prompt', or 'beforeunload'
message: str
url: str
- frame_id: str
+ frame_id: str | None = None # Can be None when frameId is not provided by CDP
# target_id: TargetID # TODO: add this to avoid needing target_id_from_frame() later
diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py
index 10fffe122..7507d7203 100644
--- a/browser_use/browser/profile.py
+++ b/browser_use/browser/profile.py
@@ -10,7 +10,6 @@ from urllib.parse import urlparse
from pydantic import AfterValidator, AliasChoices, BaseModel, ConfigDict, Field, field_validator, model_validator
from browser_use.config import CONFIG
-from browser_use.observability import observe_debug
from browser_use.utils import _log_pretty_path, logger
CHROME_DEBUG_PORT = 9242 # use a non-default port to avoid conflicts with other tools / devs using 9222
@@ -616,6 +615,18 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
# save_har_path: alias of record_har_path
# trace_path: alias of traces_dir
+ # these shadow the old playwright args on BrowserContextArgs, but it's ok
+ # because we handle them ourselves in a watchdog and we no longer use playwright, so they should live in the scope for our own config in BrowserProfile long-term
+ record_video_dir: Path | None = Field(
+ default=None,
+ description='Directory to save video recordings. If set, a video of the session will be recorded.',
+ validation_alias=AliasChoices('save_recording_path', 'record_video_dir'),
+ )
+ record_video_size: ViewportSize | None = Field(
+ default=None, description='Video frame size. If not set, it will use the viewport size.'
+ )
+ record_video_framerate: int = Field(default=30, description='The framerate to use for the video recording.')
+
# TODO: finish implementing extension support in extensions.py
# extension_ids_to_preinstall: list[str] = Field(
# default_factory=list, description='List of Chrome extension IDs to preinstall.'
@@ -747,6 +758,10 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
if proxy_bypass:
pre_conversion_args.append(f'--proxy-bypass-list={proxy_bypass}')
+ # User agent flag
+ if self.user_agent:
+ pre_conversion_args.append(f'--user-agent={self.user_agent}')
+
# Special handling for --disable-features to merge values instead of overwriting
# This prevents disable_security=True from breaking extensions by ensuring
# both default features (including extension-related) and security features are preserved
@@ -776,6 +791,7 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
# convert to dict and back to dedupe and merge other duplicate args
final_args_list = BrowserLaunchArgs.args_as_list(BrowserLaunchArgs.args_as_dict(non_disable_features_args))
+
return final_args_list
def _get_extension_args(self) -> list[str]:
@@ -1016,7 +1032,6 @@ async function initialize(checkInitialized, magic) {{
os.unlink(temp_zip.name)
- @observe_debug(ignore_input=True, ignore_output=True, name='detect_display_configuration')
def detect_display_configuration(self) -> None:
"""
Detect the system display size and initialize the display-related config defaults:
@@ -1031,36 +1046,43 @@ async function initialize(checkInitialized, magic) {{
if self.headless is None:
self.headless = not has_screen_available
- # set up window size and position if headful
+ # Determine viewport behavior based on mode and user preferences
+ user_provided_viewport = self.viewport is not None
+
if self.headless:
- # headless mode: no window available, use viewport instead to constrain content size
+ # Headless mode: always use viewport for content size control
self.viewport = self.viewport or self.window_size or self.screen
- self.window_position = None # no windows to position in headless mode
+ self.window_position = None
self.window_size = None
- self.no_viewport = False # viewport is always enabled in headless mode
+ self.no_viewport = False
else:
- # headful mode: use window, disable viewport by default, content fits to size of window
+ # Headful mode: respect user's viewport preference
self.window_size = self.window_size or self.screen
- self.no_viewport = True if self.no_viewport is None else self.no_viewport
- self.viewport = None if self.no_viewport else self.viewport
- # automatically setup viewport if any config requires it
- use_viewport = self.headless or self.viewport or self.device_scale_factor
- self.no_viewport = not use_viewport if self.no_viewport is None else self.no_viewport
- use_viewport = not self.no_viewport
+ if user_provided_viewport:
+ # User explicitly set viewport - enable viewport mode
+ self.no_viewport = False
+ else:
+ # Default headful: content fits to window (no viewport)
+ self.no_viewport = True if self.no_viewport is None else self.no_viewport
- if use_viewport:
- # if we are using viewport, make device_scale_factor and screen are set to real values to avoid easy fingerprinting
+ # Handle special requirements (device_scale_factor forces viewport mode)
+ if self.device_scale_factor and self.no_viewport is None:
+ self.no_viewport = False
+
+ # Finalize configuration
+ if self.no_viewport:
+ # No viewport mode: content adapts to window
+ self.viewport = None
+ self.device_scale_factor = None
+ self.screen = None
+ assert self.viewport is None
+ assert self.no_viewport is True
+ else:
+ # Viewport mode: ensure viewport is set
self.viewport = self.viewport or self.screen
self.device_scale_factor = self.device_scale_factor or 1.0
assert self.viewport is not None
assert self.no_viewport is False
- else:
- # device_scale_factor and screen are not supported non-viewport mode, the system monitor determines these
- self.viewport = None
- self.device_scale_factor = None # only supported in viewport mode
- self.screen = None # only supported in viewport mode
- assert self.viewport is None
- assert self.no_viewport is True
assert not (self.headless and self.no_viewport), 'headless=True and no_viewport=True cannot both be set at the same time'
diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py
index 69de393c9..175d5085a 100644
--- a/browser_use/browser/python_highlights.py
+++ b/browser_use/browser/python_highlights.py
@@ -18,6 +18,57 @@ from browser_use.utils import time_execution_async
logger = logging.getLogger(__name__)
+# Font cache to prevent repeated font loading and reduce memory usage
+_FONT_CACHE: dict[tuple[str, int], ImageFont.FreeTypeFont | None] = {}
+
+# Cross-platform font paths
+_FONT_PATHS = [
+ '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', # Linux (Debian/Ubuntu)
+ '/usr/share/fonts/TTF/DejaVuSans-Bold.ttf', # Linux (Arch/Fedora)
+ '/System/Library/Fonts/Arial.ttf', # macOS
+ 'C:\\Windows\\Fonts\\arial.ttf', # Windows
+ 'arial.ttf', # Windows (system path)
+ 'Arial Bold.ttf', # macOS alternative
+ '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf', # Linux alternative
+]
+
+
+def get_cross_platform_font(font_size: int) -> ImageFont.FreeTypeFont | None:
+ """Get a cross-platform compatible font with caching to prevent memory leaks.
+
+ Args:
+ font_size: Size of the font to load
+
+ Returns:
+ ImageFont object or None if no system fonts are available
+ """
+ # Use cache key based on font size
+ cache_key = ('system_font', font_size)
+
+ # Return cached font if available
+ if cache_key in _FONT_CACHE:
+ return _FONT_CACHE[cache_key]
+
+ # Try to load a system font
+ font = None
+ for font_path in _FONT_PATHS:
+ try:
+ font = ImageFont.truetype(font_path, font_size)
+ break
+ except OSError:
+ continue
+
+ # Cache the result (even if None) to avoid repeated attempts
+ _FONT_CACHE[cache_key] = font
+ return font
+
+
+def cleanup_font_cache() -> None:
+ """Clean up the font cache to prevent memory leaks in long-running applications."""
+ global _FONT_CACHE
+ _FONT_CACHE.clear()
+
+
# Color scheme for different element types
ELEMENT_COLORS = {
'button': '#FF6B6B', # Red for buttons
@@ -102,18 +153,10 @@ def draw_enhanced_bounding_box_with_text(
css_width = img_width # / device_pixel_ratio
# Much smaller scaling - 1% of CSS viewport width, max 16px to prevent huge highlights
base_font_size = max(10, min(20, int(css_width * 0.01)))
- big_font = None
- try:
- big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', base_font_size)
- except OSError:
- try:
- big_font = ImageFont.truetype('arial.ttf', base_font_size)
- except OSError:
- # Try system fonts on different platforms
- try:
- big_font = ImageFont.truetype('Arial Bold.ttf', base_font_size)
- except OSError:
- big_font = font # Fallback to original font
+ # Use shared font loading function with caching
+ big_font = get_cross_platform_font(base_font_size)
+ if big_font is None:
+ big_font = font # Fallback to original font if no system fonts found
# Get text size with bigger font
if big_font:
@@ -391,15 +434,9 @@ async def create_highlighted_screenshot(
# Create drawing context
draw = ImageDraw.Draw(image)
- # Try to load a font, fall back to default if not available
- font = None
- try:
- font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 12)
- except OSError:
- try:
- font = ImageFont.truetype('arial.ttf', 12)
- except OSError:
- font = None # Use default font
+ # Load font using shared function with caching
+ font = get_cross_platform_font(12)
+ # If no system fonts found, font remains None and will use default font
# Process elements sequentially to avoid ImageDraw thread safety issues
# PIL ImageDraw is not thread-safe, so we process elements one by one
@@ -408,16 +445,24 @@ async def create_highlighted_screenshot(
# Convert back to base64
output_buffer = io.BytesIO()
- image.save(output_buffer, format='PNG')
- output_buffer.seek(0)
+ try:
+ image.save(output_buffer, format='PNG')
+ output_buffer.seek(0)
+ highlighted_b64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8')
- highlighted_b64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8')
-
- logger.debug(f'Successfully created highlighted screenshot with {len(selector_map)} elements')
- return highlighted_b64
+ logger.debug(f'Successfully created highlighted screenshot with {len(selector_map)} elements')
+ return highlighted_b64
+ finally:
+ # Explicit cleanup to prevent memory leaks
+ output_buffer.close()
+ if 'image' in locals():
+ image.close()
except Exception as e:
logger.error(f'Failed to create highlighted screenshot: {e}')
+ # Clean up on error as well
+ if 'image' in locals():
+ image.close()
# Return original screenshot on error
return screenshot_b64
@@ -463,6 +508,7 @@ async def create_highlighted_screenshot_async(
screenshot_b64: Base64 encoded screenshot
selector_map: Map of interactive elements
cdp_session: CDP session for getting viewport info
+ filter_highlight_ids: Whether to filter element IDs based on meaningful text
Returns:
Base64 encoded highlighted screenshot
@@ -496,3 +542,7 @@ async def create_highlighted_screenshot_async(
await asyncio.to_thread(_write_screenshot)
return final_screenshot
+
+
+# Export the cleanup function for external use in long-running applications
+__all__ = ['create_highlighted_screenshot', 'create_highlighted_screenshot_async', 'cleanup_font_cache']
diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py
index 5f4c430be..70c7235e7 100644
--- a/browser_use/browser/session.py
+++ b/browser_use/browser/session.py
@@ -44,9 +44,6 @@ from browser_use.utils import _log_pretty_url, is_new_tab_page
DEFAULT_BROWSER_PROFILE = BrowserProfile()
-MAX_SCREENSHOT_HEIGHT = 2000
-MAX_SCREENSHOT_WIDTH = 1920
-
_LOGGED_UNIQUE_SESSION_IDS = set() # track unique session IDs that have been logged to make sure we always assign a unique enough id to new sessions and avoid ambiguity in logs
red = '\033[91m'
reset = '\033[0m'
@@ -247,6 +244,8 @@ class BrowserSession(BaseModel):
record_har_mode: str | None = None,
record_har_path: str | Path | None = None,
record_video_dir: str | Path | None = None,
+ record_video_framerate: int | None = None,
+ record_video_size: dict | None = None,
# From BrowserLaunchPersistentContextArgs
user_data_dir: str | Path | None = None,
# From BrowserNewContextArgs
@@ -338,6 +337,7 @@ class BrowserSession(BaseModel):
_dom_watchdog: Any | None = PrivateAttr(default=None)
_screenshot_watchdog: Any | None = PrivateAttr(default=None)
_permissions_watchdog: Any | None = PrivateAttr(default=None)
+ _recording_watchdog: Any | None = PrivateAttr(default=None)
_logger: Any = PrivateAttr(default=None)
@@ -404,6 +404,7 @@ class BrowserSession(BaseModel):
self._dom_watchdog = None
self._screenshot_watchdog = None
self._permissions_watchdog = None
+ self._recording_watchdog = None
def model_post_init(self, __context) -> None:
"""Register event handlers after model initialization."""
@@ -425,6 +426,7 @@ class BrowserSession(BaseModel):
BaseWatchdog.attach_handler_to_session(self, BrowserStopEvent, self.on_BrowserStopEvent)
BaseWatchdog.attach_handler_to_session(self, NavigateToUrlEvent, self.on_NavigateToUrlEvent)
BaseWatchdog.attach_handler_to_session(self, SwitchTabEvent, self.on_SwitchTabEvent)
+ BaseWatchdog.attach_handler_to_session(self, TabCreatedEvent, self.on_TabCreatedEvent)
BaseWatchdog.attach_handler_to_session(self, TabClosedEvent, self.on_TabClosedEvent)
BaseWatchdog.attach_handler_to_session(self, AgentFocusChangedEvent, self.on_AgentFocusChangedEvent)
BaseWatchdog.attach_handler_to_session(self, FileDownloadedEvent, self.on_FileDownloadedEvent)
@@ -707,6 +709,22 @@ class BrowserSession(BaseModel):
await self.event_bus.dispatch(TabClosedEvent(target_id=event.target_id))
await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': event.target_id})
+ async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:
+ """Handle tab creation - apply viewport settings to new tab."""
+ # Apply viewport settings if configured
+ if self.browser_profile.viewport and not self.browser_profile.no_viewport:
+ try:
+ viewport_width = self.browser_profile.viewport.width
+ viewport_height = self.browser_profile.viewport.height
+ device_scale_factor = self.browser_profile.device_scale_factor or 1.0
+
+ # Use the helper method with the new tab's target_id
+ await self._cdp_set_viewport(viewport_width, viewport_height, device_scale_factor, target_id=event.target_id)
+
+ self.logger.debug(f'Applied viewport {viewport_width}x{viewport_height} to tab {event.target_id[-8:]}')
+ except Exception as e:
+ self.logger.warning(f'Failed to set viewport for new tab {event.target_id[-8:]}: {e}')
+
async def on_TabClosedEvent(self, event: TabClosedEvent) -> None:
"""Handle tab closure - update focus if needed."""
if not self.agent_focus:
@@ -955,9 +973,10 @@ class BrowserSession(BaseModel):
from browser_use.browser.watchdogs.local_browser_watchdog import LocalBrowserWatchdog
from browser_use.browser.watchdogs.permissions_watchdog import PermissionsWatchdog
from browser_use.browser.watchdogs.popups_watchdog import PopupsWatchdog
+ from browser_use.browser.watchdogs.recording_watchdog import RecordingWatchdog
from browser_use.browser.watchdogs.screenshot_watchdog import ScreenshotWatchdog
from browser_use.browser.watchdogs.security_watchdog import SecurityWatchdog
- # from browser_use.browser.storage_state_watchdog import StorageStateWatchdog
+ from browser_use.browser.watchdogs.storage_state_watchdog import StorageStateWatchdog
# Initialize CrashWatchdog
# CrashWatchdog.model_rebuild()
@@ -978,14 +997,27 @@ class BrowserSession(BaseModel):
if self.browser_profile.auto_download_pdfs:
self.logger.debug('📄 PDF auto-download enabled for this session')
- # # Initialize StorageStateWatchdog
- # StorageStateWatchdog.model_rebuild()
- # self._storage_state_watchdog = StorageStateWatchdog(event_bus=self.event_bus, browser_session=self)
- # # self.event_bus.on(BrowserConnectedEvent, self._storage_state_watchdog.on_BrowserConnectedEvent)
- # # self.event_bus.on(BrowserStopEvent, self._storage_state_watchdog.on_BrowserStopEvent)
- # # self.event_bus.on(SaveStorageStateEvent, self._storage_state_watchdog.on_SaveStorageStateEvent)
- # # self.event_bus.on(LoadStorageStateEvent, self._storage_state_watchdog.on_LoadStorageStateEvent)
- # self._storage_state_watchdog.attach_to_session()
+ # Initialize StorageStateWatchdog conditionally
+ # Enable when user provides either storage_state or user_data_dir (indicating they want persistence)
+ should_enable_storage_state = (
+ self.browser_profile.storage_state is not None or self.browser_profile.user_data_dir is not None
+ )
+
+ if should_enable_storage_state:
+ StorageStateWatchdog.model_rebuild()
+ self._storage_state_watchdog = StorageStateWatchdog(
+ event_bus=self.event_bus,
+ browser_session=self,
+ # More conservative defaults when auto-enabled
+ auto_save_interval=60.0, # 1 minute instead of 30 seconds
+ save_on_change=False, # Only save on shutdown by default
+ )
+ self._storage_state_watchdog.attach_to_session()
+ self.logger.debug(
+ f'🍪 StorageStateWatchdog enabled (storage_state: {bool(self.browser_profile.storage_state)}, user_data_dir: {bool(self.browser_profile.user_data_dir)})'
+ )
+ else:
+ self.logger.debug('🍪 StorageStateWatchdog disabled (no storage_state or user_data_dir configured)')
# Initialize LocalBrowserWatchdog
LocalBrowserWatchdog.model_rebuild()
@@ -1054,6 +1086,11 @@ class BrowserSession(BaseModel):
# self.event_bus.on(BrowserStateRequestEvent, self._dom_watchdog.on_BrowserStateRequestEvent)
self._dom_watchdog.attach_to_session()
+ # Initialize RecordingWatchdog (handles video recording)
+ RecordingWatchdog.model_rebuild()
+ self._recording_watchdog = RecordingWatchdog(event_bus=self.event_bus, browser_session=self)
+ self._recording_watchdog.attach_to_session()
+
# Mark watchdogs as attached to prevent duplicate attachment
self._watchdogs_attached = True
@@ -1631,7 +1668,7 @@ class BrowserSession(BaseModel):
"""Get list of files downloaded during this browser session.
Returns:
- list[str]: List of absolute file paths to downloaded files in this session
+ list[str]: List of absolute file paths to downloaded files in this session
"""
return self._downloaded_files.copy()
@@ -1758,22 +1795,119 @@ class BrowserSession(BaseModel):
params={'identifier': identifier}, session_id=cdp_session.session_id
)
- async def _cdp_set_viewport(self, width: int, height: int, device_scale_factor: float = 1.0, mobile: bool = False) -> None:
- """Set viewport using CDP Emulation.setDeviceMetricsOverride."""
- await self.cdp_client.send.Emulation.setDeviceMetricsOverride(
- params={'width': width, 'height': height, 'deviceScaleFactor': device_scale_factor, 'mobile': mobile}
+ async def _cdp_set_viewport(
+ self, width: int, height: int, device_scale_factor: float = 1.0, mobile: bool = False, target_id: str | None = None
+ ) -> None:
+ """Set viewport using CDP Emulation.setDeviceMetricsOverride.
+
+ Args:
+ width: Viewport width
+ height: Viewport height
+ device_scale_factor: Device scale factor (default 1.0)
+ mobile: Whether to emulate mobile device (default False)
+ target_id: Optional target ID to set viewport for. If not provided, uses agent_focus.
+ """
+ if target_id:
+ # Set viewport for specific target
+ cdp_session = await self.get_or_create_cdp_session(target_id, focus=False, new_socket=False)
+ elif self.agent_focus:
+ # Use current focus
+ cdp_session = self.agent_focus
+ else:
+ self.logger.warning('Cannot set viewport: no target_id provided and agent_focus not initialized')
+ return
+
+ await cdp_session.cdp_client.send.Emulation.setDeviceMetricsOverride(
+ params={'width': width, 'height': height, 'deviceScaleFactor': device_scale_factor, 'mobile': mobile},
+ session_id=cdp_session.session_id,
)
+ async def _cdp_get_origins(self) -> list[dict[str, Any]]:
+ """Get origins with localStorage and sessionStorage using CDP."""
+ origins = []
+ cdp_session = await self.get_or_create_cdp_session(target_id=None, new_socket=False)
+
+ try:
+ # Enable DOMStorage domain to track storage
+ await cdp_session.cdp_client.send.DOMStorage.enable(session_id=cdp_session.session_id)
+
+ try:
+ # Get all frames to find unique origins
+ frames_result = await cdp_session.cdp_client.send.Page.getFrameTree(session_id=cdp_session.session_id)
+
+ # Extract unique origins from frames
+ unique_origins = set()
+
+ def _extract_origins(frame_tree):
+ """Recursively extract origins from frame tree."""
+ frame = frame_tree.get('frame', {})
+ origin = frame.get('securityOrigin')
+ if origin and origin != 'null':
+ unique_origins.add(origin)
+
+ # Process child frames
+ for child in frame_tree.get('childFrames', []):
+ _extract_origins(child)
+
+ async def _get_storage_items(origin: str, is_local_storage: bool) -> list[dict[str, str]] | None:
+ """Helper to get storage items for an origin."""
+ storage_type = 'localStorage' if is_local_storage else 'sessionStorage'
+ try:
+ result = await cdp_session.cdp_client.send.DOMStorage.getDOMStorageItems(
+ params={'storageId': {'securityOrigin': origin, 'isLocalStorage': is_local_storage}},
+ session_id=cdp_session.session_id,
+ )
+
+ items = []
+ for item in result.get('entries', []):
+ if len(item) == 2: # Each item is [key, value]
+ items.append({'name': item[0], 'value': item[1]})
+
+ return items if items else None
+ except Exception as e:
+ self.logger.debug(f'Failed to get {storage_type} for {origin}: {e}')
+ return None
+
+ _extract_origins(frames_result.get('frameTree', {}))
+
+ # For each unique origin, get localStorage and sessionStorage
+ for origin in unique_origins:
+ origin_data = {'origin': origin}
+
+ # Get localStorage
+ local_storage = await _get_storage_items(origin, is_local_storage=True)
+ if local_storage:
+ origin_data['localStorage'] = local_storage
+
+ # Get sessionStorage
+ session_storage = await _get_storage_items(origin, is_local_storage=False)
+ if session_storage:
+ origin_data['sessionStorage'] = session_storage
+
+ # Only add origin if it has storage data
+ if 'localStorage' in origin_data or 'sessionStorage' in origin_data:
+ origins.append(origin_data)
+
+ finally:
+ # Always disable DOMStorage tracking when done
+ await cdp_session.cdp_client.send.DOMStorage.disable(session_id=cdp_session.session_id)
+
+ except Exception as e:
+ self.logger.warning(f'Failed to get origins: {e}')
+
+ return origins
+
async def _cdp_get_storage_state(self) -> dict:
"""Get storage state (cookies, localStorage, sessionStorage) using CDP."""
# Use the _cdp_get_cookies helper which handles session attachment
cookies = await self._cdp_get_cookies()
- # Get localStorage and sessionStorage would require evaluating JavaScript
- # on each origin, which is more complex. For now, return cookies only.
+ # Get origins with localStorage/sessionStorage
+ origins = await self._cdp_get_origins()
+
return {
'cookies': cookies,
- 'origins': [], # Would need to iterate through origins for localStorage/sessionStorage
+ 'origins': origins,
}
async def _cdp_navigate(self, url: str, target_id: TargetID | None = None) -> None:
diff --git a/browser_use/browser/video_recorder.py b/browser_use/browser/video_recorder.py
new file mode 100644
index 000000000..4bd5b0af7
--- /dev/null
+++ b/browser_use/browser/video_recorder.py
@@ -0,0 +1,162 @@
+"""Video Recording Service for Browser Use Sessions."""
+
+import base64
+import logging
+import math
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from browser_use.browser.profile import ViewportSize
+
+try:
+ import imageio.v2 as iio
+ import imageio_ffmpeg
+ import numpy as np
+ from imageio.core.format import Format
+
+ IMAGEIO_AVAILABLE = True
+except ImportError:
+ IMAGEIO_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+
+def _get_padded_size(size: ViewportSize, macro_block_size: int = 16) -> ViewportSize:
+ """Calculates the dimensions padded to the nearest multiple of macro_block_size."""
+ width = int(math.ceil(size['width'] / macro_block_size)) * macro_block_size
+ height = int(math.ceil(size['height'] / macro_block_size)) * macro_block_size
+ return ViewportSize(width=width, height=height)
+
+
+class VideoRecorderService:
+ """
+ Handles the video encoding process for a browser session using imageio.
+
+ This service captures individual frames from the CDP screencast, decodes them,
+ and appends them to a video file using a pip-installable ffmpeg backend.
+ It automatically resizes frames to match the target video dimensions.
+ """
+
+ def __init__(self, output_path: Path, size: ViewportSize, framerate: int):
+ """
+ Initializes the video recorder.
+
+ Args:
+ output_path: The full path where the video will be saved.
+ size: A ViewportSize object specifying the width and height of the video.
+ framerate: The desired framerate for the output video.
+ """
+ self.output_path = output_path
+ self.size = size
+ self.framerate = framerate
+ self._writer: Optional['Format.Writer'] = None
+ self._is_active = False
+ self.padded_size = _get_padded_size(self.size)
+
+ def start(self) -> None:
+ """
+ Prepares and starts the video writer.
+
+ If the required optional dependencies are not installed, this method will
+ log an error and do nothing.
+ """
+ if not IMAGEIO_AVAILABLE:
+ logger.error(
+ 'MP4 recording requires optional dependencies. Please install them with: pip install "browser-use[video]"'
+ )
+ return
+
+ try:
+ self.output_path.parent.mkdir(parents=True, exist_ok=True)
+ # The macro_block_size is set to None because we handle padding ourselves
+ self._writer = iio.get_writer(
+ str(self.output_path),
+ fps=self.framerate,
+ codec='libx264',
+ quality=8, # A good balance of quality and file size (0-10 scale; 10 is highest)
+ pixelformat='yuv420p', # Ensures compatibility with most players
+ macro_block_size=None,
+ )
+ self._is_active = True
+ logger.debug(f'Video recorder started. Output will be saved to {self.output_path}')
+ except Exception as e:
+ logger.error(f'Failed to initialize video writer: {e}')
+ self._is_active = False
+
+ def add_frame(self, frame_data_b64: str) -> None:
+ """
+ Decodes a base64-encoded PNG frame, resizes it, pads it to be codec-compatible,
+ and appends it to the video.
+
+ Args:
+ frame_data_b64: A base64-encoded string of the PNG frame data.
+ """
+ if not self._is_active or not self._writer:
+ return
+
+ try:
+ frame_bytes = base64.b64decode(frame_data_b64)
+
+ # Build a filter chain for ffmpeg:
+ # 1. scale: Resizes the frame to the user-specified dimensions.
+ # 2. pad: Adds black bars to meet codec's macro-block requirements,
+ # centering the original content.
+ vf_chain = (
+ f'scale={self.size["width"]}:{self.size["height"]},'
+ f'pad={self.padded_size["width"]}:{self.padded_size["height"]}:(ow-iw)/2:(oh-ih)/2:color=black'
+ )
+
+ output_pix_fmt = 'rgb24'
+ command = [
+ imageio_ffmpeg.get_ffmpeg_exe(),
+ '-f',
+ 'image2pipe', # Input format from a pipe
+ '-c:v',
+ 'png', # Specify input codec is PNG
+ '-i',
+ '-', # Input from stdin
+ '-vf',
+ vf_chain, # Video filter for resizing and padding
+ '-f',
+ 'rawvideo', # Output format is raw video
+ '-pix_fmt',
+ output_pix_fmt, # Output pixel format
+ '-', # Output to stdout
+ ]
+
+ # Execute ffmpeg as a subprocess
+ proc = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, err = proc.communicate(input=frame_bytes)
+
+ if proc.returncode != 0:
+ err_msg = err.decode(errors='ignore').strip()
+ if 'deprecated pixel format used' not in err_msg.lower():
+ raise OSError(f'ffmpeg error during resizing/padding: {err_msg}')
+ else:
+ logger.debug(f'ffmpeg warning during resizing/padding: {err_msg}')
+
+ # Convert the raw output bytes to a numpy array with the padded dimensions
+ img_array = np.frombuffer(out, dtype=np.uint8).reshape((self.padded_size['height'], self.padded_size['width'], 3))
+
+ self._writer.append_data(img_array)
+ except Exception as e:
+ logger.warning(f'Could not process and add video frame: {e}')
+
+ def stop_and_save(self) -> None:
+ """
+ Finalizes the video file by closing the writer.
+
+ This method should be called when the recording session is complete.
+ """
+ if not self._is_active or not self._writer:
+ return
+
+ try:
+ self._writer.close()
+ logger.info(f'📹 Video recording saved successfully to: {self.output_path}')
+ except Exception as e:
+ logger.error(f'Failed to finalize and save video: {e}')
+ finally:
+ self._is_active = False
+ self._writer = None
diff --git a/browser_use/browser/watchdogs/popups_watchdog.py b/browser_use/browser/watchdogs/popups_watchdog.py
index f517838a2..32dc6419d 100644
--- a/browser_use/browser/watchdogs/popups_watchdog.py
+++ b/browser_use/browser/watchdogs/popups_watchdog.py
@@ -6,16 +6,16 @@ from typing import ClassVar
from bubus import BaseEvent
from pydantic import PrivateAttr
-from browser_use.browser.events import DialogOpenedEvent, TabCreatedEvent
+from browser_use.browser.events import TabCreatedEvent
from browser_use.browser.watchdog_base import BaseWatchdog
class PopupsWatchdog(BaseWatchdog):
- """Handles JavaScript dialogs (alert, confirm, prompt) by automatically accepting them."""
+ """Handles JavaScript dialogs (alert, confirm, prompt) by automatically accepting them immediately."""
# Events this watchdog listens to and emits
- LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [TabCreatedEvent, DialogOpenedEvent]
- EMITS: ClassVar[list[type[BaseEvent]]] = [DialogOpenedEvent]
+ LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [TabCreatedEvent]
+ EMITS: ClassVar[list[type[BaseEvent]]] = []
# Track which targets have dialog handlers registered
_dialog_listeners_registered: set[str] = PrivateAttr(default_factory=set)
@@ -36,107 +36,77 @@ class PopupsWatchdog(BaseWatchdog):
self.logger.debug(f'📌 Starting dialog handler setup for target {target_id}')
try:
+ # Get all CDP sessions for this target and any child frames
cdp_session = await self.browser_session.get_or_create_cdp_session(
target_id, focus=False
) # don't auto-focus new tabs! sometimes we need to open tabs in background
- # Set up async handler for JavaScript dialogs - now we can handle them immediately!
+ # Also register for the root CDP client to catch dialogs from any frame
+ if self.browser_session._cdp_client_root:
+ self.logger.debug('📌 Also registering handler on root CDP client')
+
+ # Set up async handler for JavaScript dialogs - accept immediately without event dispatch
async def handle_dialog(event_data, session_id: str | None = None):
- """Handle JavaScript dialog events - accept immediately and dispatch event."""
- self.logger.debug(f'🚨 DIALOG EVENT RECEIVED: {event_data}, session_id={session_id}')
-
- dialog_type = event_data.get('type', 'alert')
- message = event_data.get('message', '')
- url = event_data.get('url')
- frame_id = event_data.get('frameId')
-
- self.logger.debug(f"🔔 JavaScript {dialog_type} dialog detected: '{message[:50]}...' - accepting immediately")
-
- # Dispatch the event first so tests can observe it
- event = self.browser_session.event_bus.dispatch(
- DialogOpenedEvent(
- frame_id=frame_id,
- dialog_type=dialog_type,
- message=message,
- url=url,
- )
- )
- await event.event_result(raise_if_none=False, raise_if_any=True, timeout=5.0)
-
- # Accept the dialog immediately to unblock the browser
+ """Handle JavaScript dialog events - accept immediately."""
try:
- if self.browser_session._cdp_client_root and session_id:
- self.logger.debug('🔄 Sending handleJavaScriptDialog command')
- await self.browser_session._cdp_client_root.send.Page.handleJavaScriptDialog(
- params={'accept': True},
- session_id=session_id,
- )
- self.logger.info('✅ Dialog accepted successfully')
- else:
- self.logger.error('Cannot accept dialog - CDP client or session not available')
- except Exception as e:
- self.logger.error(f'Failed to accept dialog: {e}')
+ dialog_type = event_data.get('type', 'alert')
+ message = event_data.get('message', '')
+ self.logger.info(f"🔔 JavaScript {dialog_type} dialog: '{message[:100]}' - attempting to accept...")
+
+ self.logger.debug('Trying all approaches to accept dialog...')
+
+ # Approach 1: Use the session that detected the dialog
+ if self.browser_session._cdp_client_root and session_id:
+ try:
+ self.logger.debug(f'🔄 Approach 1: Using session {session_id}')
+ await asyncio.wait_for(
+ self.browser_session._cdp_client_root.send.Page.handleJavaScriptDialog(
+ params={'accept': True},
+ session_id=session_id,
+ ),
+ timeout=0.25,
+ )
+ except (TimeoutError, Exception) as e:
+ pass
+
+ # Approach 2: Try with current agent focus session
+ if self.browser_session._cdp_client_root and self.browser_session.agent_focus:
+ try:
+ self.logger.debug(
+ f'🔄 Approach 2: Using agent focus session {self.browser_session.agent_focus.session_id}'
+ )
+ await asyncio.wait_for(
+ self.browser_session._cdp_client_root.send.Page.handleJavaScriptDialog(
+ params={'accept': True},
+ session_id=self.browser_session.agent_focus.session_id,
+ ),
+ timeout=0.25,
+ )
+ except (TimeoutError, Exception) as e:
+ pass
+
+ except Exception as e:
+ self.logger.error(f'❌ Critical error in dialog handler: {type(e).__name__}: {e}')
+
+ # Register handler on the specific session
cdp_session.cdp_client.register.Page.javascriptDialogOpening(handle_dialog) # type: ignore[arg-type]
self.logger.debug(
f'Successfully registered Page.javascriptDialogOpening handler for session {cdp_session.session_id}'
)
+ # Also register on root CDP client to catch dialogs from any frame
+ if hasattr(self.browser_session._cdp_client_root, 'register'):
+ try:
+ self.browser_session._cdp_client_root.register.Page.javascriptDialogOpening(handle_dialog) # type: ignore[arg-type]
+ self.logger.debug('Successfully registered dialog handler on root CDP client for all frames')
+ except Exception as root_error:
+ self.logger.warning(f'Failed to register on root CDP client: {root_error}')
+
# Mark this target as having dialog handling set up
self._dialog_listeners_registered.add(target_id)
self.logger.debug(f'Set up JavaScript dialog handling for tab {target_id}')
except Exception as e:
- self.logger.warning(f'Failed to set up dialog handling for tab {target_id}: {e}')
-
- async def on_DialogOpenedEvent(self, event: DialogOpenedEvent) -> None:
- """Handle the async closing of JavaScript dialogs."""
- self.logger.debug(
- f'📋 on_DialogOpenedEvent called with frame_id={event.frame_id} url={event.url} message={event.message}'
- )
-
- assert self.browser_session.agent_focus is not None, 'Agent focus not set when handling DialogOpenedEvent'
-
- current_focus_url = self.browser_session.agent_focus.url
- current_focus_target_id = self.browser_session.agent_focus.target_id
-
- cdp_session = await asyncio.wait_for(self.browser_session.cdp_client_for_frame(event.frame_id), timeout=5.0)
- try:
- # delay to look more human before auto-closing, some popular antibot fingerprint tests check for modals closing too fast
- await asyncio.sleep(0.25)
- assert self.browser_session._cdp_client_root
- # self.browser_session._cdp_client_root.register.Page.javascriptDialogClosed(lambda *args: None)
- await asyncio.wait_for(
- self.browser_session._cdp_client_root.send.Page.handleJavaScriptDialog(
- params={'accept': True},
- session_id=cdp_session.session_id,
- ),
- timeout=5.0,
- )
- # CRITICAL: you must re-focus (Target.activateTarget()) after handling the dialog, otherwise the browser will crash ~5 seconds later
- await self.browser_session.get_or_create_cdp_session(target_id=current_focus_target_id, focus=True)
- self.logger.info('✅ JS dialog popup handled successfully')
-
- # graveyard of past attempts:
- # # new_target = await self.browser_session._cdp_client_root.send.Target.createTarget(params={'url': current_focus_url})
- # # self.browser_session.agent_focus = await self.browser_session.get_or_create_cdp_session(target_id=new_target.get('targetId'), new_socket=True, focus=True)
- # # raise NotImplementedError('TODO: figure out why this requires a hard refresh and new socket to avoid crashing the entire browser on JS dialogs')
- # await asyncio.sleep(0.2)
- # await asyncio.wait_for(
- # self.browser_session._cdp_client_root.send.Runtime.evaluate(
- # params={'expression': '1'},
- # session_id=cdp_session.session_id,
- # ),
- # timeout=5.0,
- # )
- # # self.browser_session.agent_focus = await self.browser_session.get_or_create_cdp_session(current_focus.target_id, focus=True, new_socket=True)
- # # assert await self.browser_session.agent_focus.cdp_client.send.Page.getFrameTree(session_id=self.browser_session.agent_focus.session_id) is not None, "Agent focus not set after handling dialog"
- except Exception as e:
- self.logger.error(f'Failed to handle JavaScript dialog gracefully: {e}')
- # raise
- # finally:
- # self.event_bus.dispatch(AgentFocusChangedEvent(
- # target_id=current_focus_target_id,
- # url=self.browser_session.agent_focus.url,
- # ))
+ self.logger.warning(f'Failed to set up popup handling for tab {target_id}: {e}')
diff --git a/browser_use/browser/watchdogs/recording_watchdog.py b/browser_use/browser/watchdogs/recording_watchdog.py
new file mode 100644
index 000000000..02af46977
--- /dev/null
+++ b/browser_use/browser/watchdogs/recording_watchdog.py
@@ -0,0 +1,126 @@
+"""Recording Watchdog for Browser Use Sessions."""
+
+import asyncio
+from pathlib import Path
+from typing import ClassVar
+
+from bubus import BaseEvent
+from cdp_use.cdp.page.events import ScreencastFrameEvent
+from uuid_extensions import uuid7str
+
+from browser_use.browser.events import BrowserConnectedEvent, BrowserStopEvent
+from browser_use.browser.profile import ViewportSize
+from browser_use.browser.video_recorder import VideoRecorderService
+from browser_use.browser.watchdog_base import BaseWatchdog
+
+
+class RecordingWatchdog(BaseWatchdog):
+ """
+ Manages video recording of a browser session using CDP screencasting.
+ """
+
+ LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [BrowserConnectedEvent, BrowserStopEvent]
+ EMITS: ClassVar[list[type[BaseEvent]]] = []
+
+ _recorder: VideoRecorderService | None = None
+
+ async def on_BrowserConnectedEvent(self, event: BrowserConnectedEvent) -> None:
+ """
+ Starts video recording if it is configured in the browser profile.
+ """
+ profile = self.browser_session.browser_profile
+ if not profile.record_video_dir:
+ return
+
+ # Dynamically determine video size
+ size = profile.record_video_size
+ if not size:
+ self.logger.debug('record_video_size not specified, detecting viewport size...')
+ size = await self._get_current_viewport_size()
+
+ if not size:
+ self.logger.warning('Cannot start video recording: viewport size could not be determined.')
+ return
+
+ video_format = getattr(profile, 'record_video_format', 'mp4').strip('.')
+ output_path = Path(profile.record_video_dir) / f'{uuid7str()}.{video_format}'
+
+ self.logger.debug(f'Initializing video recorder for format: {video_format}')
+ self._recorder = VideoRecorderService(output_path=output_path, size=size, framerate=profile.record_video_framerate)
+ self._recorder.start()
+
+ if not self._recorder._is_active:
+ self._recorder = None
+ return
+
+ self.browser_session.cdp_client.register.Page.screencastFrame(self.on_screencastFrame)
+
+ try:
+ cdp_session = await self.browser_session.get_or_create_cdp_session()
+ await cdp_session.cdp_client.send.Page.startScreencast(
+ params={
+ 'format': 'png',
+ 'quality': 90,
+ 'maxWidth': size['width'],
+ 'maxHeight': size['height'],
+ 'everyNthFrame': 1,
+ },
+ session_id=cdp_session.session_id,
+ )
+ self.logger.info(f'📹 Started video recording to {output_path}')
+ except Exception as e:
+ self.logger.error(f'Failed to start screencast via CDP: {e}')
+ if self._recorder:
+ self._recorder.stop_and_save()
+ self._recorder = None
+
+ async def _get_current_viewport_size(self) -> ViewportSize | None:
+ """Gets the current viewport size directly from the browser via CDP."""
+ try:
+ cdp_session = await self.browser_session.get_or_create_cdp_session()
+ metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)
+
+ # Use cssVisualViewport for the most accurate representation of the visible area
+ viewport = metrics.get('cssVisualViewport', {})
+ width = viewport.get('clientWidth')
+ height = viewport.get('clientHeight')
+
+ if width and height:
+ self.logger.debug(f'Detected viewport size: {width}x{height}')
+ return ViewportSize(width=int(width), height=int(height))
+ except Exception as e:
+ self.logger.warning(f'Failed to get viewport size from browser: {e}')
+
+ return None
+
+ def on_screencastFrame(self, event: ScreencastFrameEvent, session_id: str | None) -> None:
+ """
+ Synchronous handler for incoming screencast frames. NOTE(review): add_frame runs a blocking ffmpeg subprocess on the event-loop thread — consider offloading to an executor.
+ """
+ if not self._recorder:
+ return
+ self._recorder.add_frame(event['data'])
+ asyncio.create_task(self._ack_screencast_frame(event, session_id))
+
+ async def _ack_screencast_frame(self, event: ScreencastFrameEvent, session_id: str | None) -> None:
+ """
+ Asynchronously acknowledges a screencast frame.
+ """
+ try:
+ await self.browser_session.cdp_client.send.Page.screencastFrameAck(
+ params={'sessionId': event['sessionId']}, session_id=session_id
+ )
+ except Exception as e:
+ self.logger.debug(f'Failed to acknowledge screencast frame: {e}')
+
+ async def on_BrowserStopEvent(self, event: BrowserStopEvent) -> None:
+ """
+ Stops the video recording and finalizes the video file.
+ """
+ if self._recorder:
+ recorder = self._recorder
+ self._recorder = None
+
+ self.logger.debug('Stopping video recording and saving file...')
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(None, recorder.stop_and_save)
diff --git a/browser_use/browser/watchdogs/security_watchdog.py b/browser_use/browser/watchdogs/security_watchdog.py
index 8e5096867..68941783f 100644
--- a/browser_use/browser/watchdogs/security_watchdog.py
+++ b/browser_use/browser/watchdogs/security_watchdog.py
@@ -156,11 +156,14 @@ class SecurityWatchdog(BaseWatchdog):
return True
else:
# Use fnmatch for other glob patterns
- if fnmatch.fnmatch(host, pattern):
+ if fnmatch.fnmatch(
+ full_url_pattern if '://' in pattern else host,
+ pattern,
+ ):
return True
else:
# Exact match
- if pattern.startswith(('http://', 'https://', 'chrome://', 'brave://', 'file://')):
+ if '://' in pattern:
# Full URL pattern
if url.startswith(pattern):
return True
diff --git a/browser_use/browser/watchdogs/storage_state_watchdog.py b/browser_use/browser/watchdogs/storage_state_watchdog.py
index 326520721..0b38e1283 100644
--- a/browser_use/browser/watchdogs/storage_state_watchdog.py
+++ b/browser_use/browser/watchdogs/storage_state_watchdog.py
@@ -12,6 +12,7 @@ from pydantic import Field, PrivateAttr
from browser_use.browser.events import (
BrowserConnectedEvent,
+ BrowserStopEvent,
LoadStorageStateEvent,
SaveStorageStateEvent,
StorageStateLoadedEvent,
@@ -26,6 +27,7 @@ class StorageStateWatchdog(BaseWatchdog):
# Event contracts
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [
BrowserConnectedEvent,
+ BrowserStopEvent,
SaveStorageStateEvent,
LoadStorageStateEvent,
]
@@ -51,7 +53,12 @@ class StorageStateWatchdog(BaseWatchdog):
await self._start_monitoring()
# Automatically load storage state after browser start
- self.event_bus.dispatch(LoadStorageStateEvent())
+ await self.event_bus.dispatch(LoadStorageStateEvent())
+
+ async def on_BrowserStopEvent(self, event: BrowserStopEvent) -> None:
+ """Stop monitoring when browser stops."""
+ self.logger.debug('[StorageStateWatchdog] Stopping storage_state monitoring')
+ await self._stop_monitoring()
async def on_SaveStorageStateEvent(self, event: SaveStorageStateEvent) -> None:
"""Handle storage state save request."""
diff --git a/browser_use/config.py b/browser_use/config.py
index 4114ab93b..e2a3b4194 100644
--- a/browser_use/config.py
+++ b/browser_use/config.py
@@ -159,6 +159,10 @@ class OldConfig:
def SKIP_LLM_API_KEY_VERIFICATION(self) -> bool:
return os.getenv('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[:1] in 'ty1'
+ @property
+ def DEFAULT_LLM(self) -> str:
+ return os.getenv('DEFAULT_LLM', '')
+
# Runtime hints
@property
def IN_DOCKER(self) -> bool:
@@ -203,6 +207,7 @@ class FlatEnvConfig(BaseSettings):
AZURE_OPENAI_ENDPOINT: str = Field(default='')
AZURE_OPENAI_KEY: str = Field(default='')
SKIP_LLM_API_KEY_VERIFICATION: bool = Field(default=False)
+ DEFAULT_LLM: str = Field(default='')
# Runtime hints
IN_DOCKER: bool | None = Field(default=None)
diff --git a/browser_use/dom/enhanced_snapshot.py b/browser_use/dom/enhanced_snapshot.py
index 889ed992f..6c08c637a 100644
--- a/browser_use/dom/enhanced_snapshot.py
+++ b/browser_use/dom/enhanced_snapshot.py
@@ -16,32 +16,16 @@ from browser_use.dom.views import DOMRect, EnhancedSnapshotNode
# Only the ESSENTIAL computed styles for interactivity and visibility detection
REQUIRED_COMPUTED_STYLES = [
- # Essential for visibility
- 'display',
- 'visibility',
- 'opacity',
- 'position',
- 'z-index',
- 'pointer-events',
- 'cursor',
- 'overflow',
- 'overflow-x',
- 'overflow-y',
- 'width',
- 'height',
- 'top',
- 'left',
- 'right',
- 'bottom',
- 'transform',
- 'clip',
- 'clip-path',
- 'user-select',
- 'background-color',
- 'color',
- 'border',
- 'margin',
- 'padding',
+ # Only styles actually accessed in the codebase (prevents Chrome crashes on heavy sites)
+ 'display', # Used in service.py visibility detection
+ 'visibility', # Used in service.py visibility detection
+ 'opacity', # Used in service.py visibility detection
+ 'overflow', # Used in views.py scrollability detection
+ 'overflow-x', # Used in views.py scrollability detection
+ 'overflow-y', # Used in views.py scrollability detection
+ 'cursor', # Used in enhanced_snapshot.py cursor extraction
+ 'pointer-events', # Used for clickability logic
+ 'position', # Used for visibility logic
]
@@ -81,6 +65,14 @@ def build_snapshot_lookup(
for i, backend_node_id in enumerate(nodes['backendNodeId']):
backend_node_to_snapshot_index[backend_node_id] = i
+ # PERFORMANCE: Pre-build layout index map to eliminate O(n²) double lookups
+ # Preserve original behavior: use FIRST occurrence for duplicates
+ layout_index_map = {}
+ if layout and 'nodeIndex' in layout:
+ for layout_idx, node_index in enumerate(layout['nodeIndex']):
+ if node_index not in layout_index_map: # Only store first occurrence
+ layout_index_map[node_index] = layout_idx
+
# Build snapshot lookup for each backend node id
for backend_node_id, snapshot_index in backend_node_to_snapshot_index.items():
is_clickable = None
@@ -98,8 +90,9 @@ def build_snapshot_lookup(
client_rects = None
scroll_rects = None
stacking_contexts = None
- for layout_idx, node_index in enumerate(layout.get('nodeIndex', [])):
- if node_index == snapshot_index and layout_idx < len(layout.get('bounds', [])):
+ if snapshot_index in layout_index_map:
+ layout_idx = layout_index_map[snapshot_index]
+ if layout_idx < len(layout.get('bounds', [])):
# Parse bounding box
bounds = layout['bounds'][layout_idx]
if len(bounds) >= 4:
@@ -153,8 +146,6 @@ def build_snapshot_lookup(
if layout_idx < len(layout.get('stackingContexts', [])):
stacking_contexts = layout.get('stackingContexts', {}).get('index', [])[layout_idx]
- break
-
snapshot_lookup[backend_node_id] = EnhancedSnapshotNode(
is_clickable=is_clickable,
cursor_style=cursor_style,
diff --git a/browser_use/filesystem/file_system.py b/browser_use/filesystem/file_system.py
index 5fc194783..c0cb7eaa7 100644
--- a/browser_use/filesystem/file_system.py
+++ b/browser_use/filesystem/file_system.py
@@ -6,8 +6,10 @@ from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any
-from markdown_pdf import MarkdownPdf, Section
from pydantic import BaseModel, Field
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer
INVALID_FILENAME_ERROR_MESSAGE = 'Error: Invalid filename format. Must be alphanumeric with supported extension.'
DEFAULT_FILE_SYSTEM_PATH = 'browseruse_agent_data'
@@ -120,9 +122,32 @@ class PdfFile(BaseFile):
def sync_to_disk_sync(self, path: Path) -> None:
file_path = path / self.full_name
try:
- md_pdf = MarkdownPdf()
- md_pdf.add_section(Section(self.content))
- md_pdf.save(file_path)
+ # Create PDF document
+ doc = SimpleDocTemplate(str(file_path), pagesize=letter)
+ styles = getSampleStyleSheet()
+ story = []
+
+ # Convert markdown content to simple text and add to PDF
+ # For basic implementation, we'll treat content as plain text
+ # This avoids the AGPL license issue while maintaining functionality
+ content_lines = self.content.split('\n')
+
+ for line in content_lines:
+ if line.strip():
+ # Handle basic markdown headers
+ if line.startswith('# '):
+ para = Paragraph(line[2:], styles['Title'])
+ elif line.startswith('## '):
+ para = Paragraph(line[3:], styles['Heading1'])
+ elif line.startswith('### '):
+ para = Paragraph(line[4:], styles['Heading2'])
+ else:
+ para = Paragraph(line, styles['Normal'])
+ story.append(para)
+ else:
+ story.append(Spacer(1, 6))
+
+ doc.build(story)
except Exception as e:
raise FileSystemError(f"Error: Could not write to file '{self.full_name}'. {str(e)}")
diff --git a/browser_use/llm/__init__.py b/browser_use/llm/__init__.py
index f409f1839..badaef2eb 100644
--- a/browser_use/llm/__init__.py
+++ b/browser_use/llm/__init__.py
@@ -37,6 +37,41 @@ if TYPE_CHECKING:
from browser_use.llm.openai.chat import ChatOpenAI
from browser_use.llm.openrouter.chat import ChatOpenRouter
+ # Type stubs for model instances - enables IDE autocomplete
+ openai_gpt_4o: ChatOpenAI
+ openai_gpt_4o_mini: ChatOpenAI
+ openai_gpt_4_1_mini: ChatOpenAI
+ openai_o1: ChatOpenAI
+ openai_o1_mini: ChatOpenAI
+ openai_o1_pro: ChatOpenAI
+ openai_o3: ChatOpenAI
+ openai_o3_mini: ChatOpenAI
+ openai_o3_pro: ChatOpenAI
+ openai_o4_mini: ChatOpenAI
+ openai_gpt_5: ChatOpenAI
+ openai_gpt_5_mini: ChatOpenAI
+ openai_gpt_5_nano: ChatOpenAI
+
+ azure_gpt_4o: ChatAzureOpenAI
+ azure_gpt_4o_mini: ChatAzureOpenAI
+ azure_gpt_4_1_mini: ChatAzureOpenAI
+ azure_o1: ChatAzureOpenAI
+ azure_o1_mini: ChatAzureOpenAI
+ azure_o1_pro: ChatAzureOpenAI
+ azure_o3: ChatAzureOpenAI
+ azure_o3_mini: ChatAzureOpenAI
+ azure_o3_pro: ChatAzureOpenAI
+ azure_gpt_5: ChatAzureOpenAI
+ azure_gpt_5_mini: ChatAzureOpenAI
+
+ google_gemini_2_0_flash: ChatGoogle
+ google_gemini_2_0_pro: ChatGoogle
+ google_gemini_2_5_pro: ChatGoogle
+ google_gemini_2_5_flash: ChatGoogle
+ google_gemini_2_5_flash_lite: ChatGoogle
+
+# Models are imported on-demand via __getattr__
+
# Lazy imports mapping for heavy chat models
_LAZY_IMPORTS = {
'ChatAnthropic': ('browser_use.llm.anthropic.chat', 'ChatAnthropic'),
@@ -51,9 +86,12 @@ _LAZY_IMPORTS = {
'ChatOpenRouter': ('browser_use.llm.openrouter.chat', 'ChatOpenRouter'),
}
+# Cache for model instances - only created when accessed
+_model_cache: dict[str, 'BaseChatModel'] = {}
+
def __getattr__(name: str):
- """Lazy import mechanism for heavy chat model imports."""
+ """Lazy import mechanism for heavy chat model imports and model instances."""
if name in _LAZY_IMPORTS:
module_path, attr_name = _LAZY_IMPORTS[name]
try:
@@ -61,12 +99,25 @@ def __getattr__(name: str):
module = import_module(module_path)
attr = getattr(module, attr_name)
- # Cache the imported attribute in the module's globals
- globals()[name] = attr
return attr
except ImportError as e:
raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e
+ # Check cache first for model instances
+ if name in _model_cache:
+ return _model_cache[name]
+
+ # Try to get model instances from models module on-demand
+ try:
+ from browser_use.llm.models import __getattr__ as models_getattr
+
+ attr = models_getattr(name)
+ # Cache in our clean cache dict
+ _model_cache[name] = attr
+ return attr
+ except (AttributeError, ImportError):
+ pass
+
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
diff --git a/browser_use/llm/google/chat.py b/browser_use/llm/google/chat.py
index 66ba74b90..fcc8f7624 100644
--- a/browser_use/llm/google/chat.py
+++ b/browser_use/llm/google/chat.py
@@ -73,10 +73,11 @@ class ChatGoogle(BaseChatModel):
# Model configuration
model: VerifiedGeminiModels | str
- temperature: float | None = None
+ temperature: float | None = 0.2
top_p: float | None = None
seed: int | None = None
thinking_budget: int | None = None
+ max_output_tokens: int | None = 4096
config: types.GenerateContentConfigDict | None = None
# Client initialization parameters
@@ -193,6 +194,9 @@ class ChatGoogle(BaseChatModel):
thinking_config_dict: types.ThinkingConfigDict = {'thinking_budget': self.thinking_budget}
config['thinking_config'] = thinking_config_dict
+ if self.max_output_tokens is not None:
+ config['max_output_tokens'] = self.max_output_tokens
+
async def _make_api_call():
if output_format is None:
# Return string response
@@ -389,6 +393,10 @@ class ChatGoogle(BaseChatModel):
):
cleaned['properties'] = {'_placeholder': {'type': 'string'}}
+ # Also remove 'title' from the required list if it exists
+ if 'required' in cleaned and isinstance(cleaned.get('required'), list):
+ cleaned['required'] = [p for p in cleaned['required'] if p != 'title']
+
return cleaned
elif isinstance(obj, list):
return [clean_schema(item) for item in obj]
diff --git a/browser_use/llm/models.py b/browser_use/llm/models.py
new file mode 100644
index 000000000..d09cd4c36
--- /dev/null
+++ b/browser_use/llm/models.py
@@ -0,0 +1,171 @@
+"""
+Convenient access to LLM models.
+
+Usage:
+ from browser_use import llm
+
+ # Simple model access
+ model = llm.azure_gpt_4_1_mini
+ model = llm.openai_gpt_4o
+ model = llm.google_gemini_2_5_pro
+"""
+
+import os
+from typing import TYPE_CHECKING
+
+from browser_use.llm.azure.chat import ChatAzureOpenAI
+from browser_use.llm.google.chat import ChatGoogle
+from browser_use.llm.openai.chat import ChatOpenAI
+
+if TYPE_CHECKING:
+ from browser_use.llm.base import BaseChatModel
+
+# Type stubs for IDE autocomplete
+openai_gpt_4o: 'BaseChatModel'
+openai_gpt_4o_mini: 'BaseChatModel'
+openai_gpt_4_1_mini: 'BaseChatModel'
+openai_o1: 'BaseChatModel'
+openai_o1_mini: 'BaseChatModel'
+openai_o1_pro: 'BaseChatModel'
+openai_o3: 'BaseChatModel'
+openai_o3_mini: 'BaseChatModel'
+openai_o3_pro: 'BaseChatModel'
+openai_o4_mini: 'BaseChatModel'
+openai_gpt_5: 'BaseChatModel'
+openai_gpt_5_mini: 'BaseChatModel'
+openai_gpt_5_nano: 'BaseChatModel'
+
+azure_gpt_4o: 'BaseChatModel'
+azure_gpt_4o_mini: 'BaseChatModel'
+azure_gpt_4_1_mini: 'BaseChatModel'
+azure_o1: 'BaseChatModel'
+azure_o1_mini: 'BaseChatModel'
+azure_o1_pro: 'BaseChatModel'
+azure_o3: 'BaseChatModel'
+azure_o3_mini: 'BaseChatModel'
+azure_o3_pro: 'BaseChatModel'
+azure_gpt_5: 'BaseChatModel'
+azure_gpt_5_mini: 'BaseChatModel'
+
+google_gemini_2_0_flash: 'BaseChatModel'
+google_gemini_2_0_pro: 'BaseChatModel'
+google_gemini_2_5_pro: 'BaseChatModel'
+google_gemini_2_5_flash: 'BaseChatModel'
+google_gemini_2_5_flash_lite: 'BaseChatModel'
+
+
+def get_llm_by_name(model_name: str):
+ """
+ Factory function to create LLM instances from string names with API keys from environment.
+
+ Args:
+ model_name: String name like 'azure_gpt_4_1_mini', 'openai_gpt_4o', etc.
+
+ Returns:
+ LLM instance with API keys from environment variables
+
+ Raises:
+ ValueError: If model_name is not recognized
+ """
+ if not model_name:
+ raise ValueError('Model name cannot be empty')
+
+ # Parse model name
+ parts = model_name.split('_', 1)
+ if len(parts) < 2:
+ raise ValueError(f"Invalid model name format: '{model_name}'. Expected format: 'provider_model_name'")
+
+ provider = parts[0]
+ model_part = parts[1]
+
+ # Convert underscores back to dots/dashes for actual model names
+ if 'gpt_4_1_mini' in model_part:
+ model = model_part.replace('gpt_4_1_mini', 'gpt-4.1-mini')
+ elif 'gpt_4o_mini' in model_part:
+ model = model_part.replace('gpt_4o_mini', 'gpt-4o-mini')
+ elif 'gpt_4o' in model_part:
+ model = model_part.replace('gpt_4o', 'gpt-4o')
+ elif 'gemini_2_0' in model_part:
+ model = model_part.replace('gemini_2_0', 'gemini-2.0').replace('_', '-')
+ elif 'gemini_2_5' in model_part:
+ model = model_part.replace('gemini_2_5', 'gemini-2.5').replace('_', '-')
+ else:
+ model = model_part.replace('_', '-')
+
+ # OpenAI Models
+ if provider == 'openai':
+ api_key = os.getenv('OPENAI_API_KEY')
+ return ChatOpenAI(model=model, api_key=api_key)
+
+ # Azure OpenAI Models
+ elif provider == 'azure':
+ api_key = os.getenv('AZURE_OPENAI_KEY') or os.getenv('AZURE_OPENAI_API_KEY')
+ azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
+ return ChatAzureOpenAI(model=model, api_key=api_key, azure_endpoint=azure_endpoint)
+
+ # Google Models
+ elif provider == 'google':
+ api_key = os.getenv('GOOGLE_API_KEY')
+ return ChatGoogle(model=model, api_key=api_key)
+
+ else:
+ available_providers = ['openai', 'azure', 'google']
+ raise ValueError(f"Unknown provider: '{provider}'. Available providers: {', '.join(available_providers)}")
+
+
+# Pre-configured model instances (lazy loaded via __getattr__)
+def __getattr__(name: str) -> 'BaseChatModel':
+ """Create model instances on demand with API keys from environment."""
+ # Handle chat classes first
+ if name == 'ChatOpenAI':
+ return ChatOpenAI # type: ignore
+ elif name == 'ChatAzureOpenAI':
+ return ChatAzureOpenAI # type: ignore
+ elif name == 'ChatGoogle':
+ return ChatGoogle # type: ignore
+
+ # Handle model instances - these are the main use case
+ try:
+ return get_llm_by_name(name)
+ except ValueError:
+ raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+
+
+__all__ = [
+ 'ChatOpenAI',
+ 'ChatAzureOpenAI',
+ 'ChatGoogle',
+ 'get_llm_by_name',
+ # OpenAI instances - created on demand
+ 'openai_gpt_4o',
+ 'openai_gpt_4o_mini',
+ 'openai_gpt_4_1_mini',
+ 'openai_o1',
+ 'openai_o1_mini',
+ 'openai_o1_pro',
+ 'openai_o3',
+ 'openai_o3_mini',
+ 'openai_o3_pro',
+ 'openai_o4_mini',
+ 'openai_gpt_5',
+ 'openai_gpt_5_mini',
+ 'openai_gpt_5_nano',
+ # Azure instances - created on demand
+ 'azure_gpt_4o',
+ 'azure_gpt_4o_mini',
+ 'azure_gpt_4_1_mini',
+ 'azure_o1',
+ 'azure_o1_mini',
+ 'azure_o1_pro',
+ 'azure_o3',
+ 'azure_o3_mini',
+ 'azure_o3_pro',
+ 'azure_gpt_5',
+ 'azure_gpt_5_mini',
+ # Google instances - created on demand
+ 'google_gemini_2_0_flash',
+ 'google_gemini_2_0_pro',
+ 'google_gemini_2_5_pro',
+ 'google_gemini_2_5_flash',
+ 'google_gemini_2_5_flash_lite',
+]
diff --git a/browser_use/llm/ollama/chat.py b/browser_use/llm/ollama/chat.py
index cf6d86eef..99049b18a 100644
--- a/browser_use/llm/ollama/chat.py
+++ b/browser_use/llm/ollama/chat.py
@@ -1,8 +1,10 @@
+from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any, TypeVar, overload
import httpx
from ollama import AsyncClient as OllamaAsyncClient
+from ollama import Options
from pydantic import BaseModel
from browser_use.llm.base import BaseChatModel
@@ -30,6 +32,7 @@ class ChatOllama(BaseChatModel):
host: str | None = None
timeout: float | httpx.Timeout | None = None
client_params: dict[str, Any] | None = None
+ ollama_options: Mapping[str, Any] | Options | None = None
# Static
@property
@@ -70,6 +73,7 @@ class ChatOllama(BaseChatModel):
response = await self.get_client().chat(
model=self.model,
messages=ollama_messages,
+ options=self.ollama_options,
)
return ChatInvokeCompletion(completion=response.message.content or '', usage=None)
@@ -80,6 +84,7 @@ class ChatOllama(BaseChatModel):
model=self.model,
messages=ollama_messages,
format=schema,
+ options=self.ollama_options,
)
completion = response.message.content or ''
diff --git a/browser_use/llm/tests/test_gemini_image.py b/browser_use/llm/tests/test_gemini_image.py
index f8cfbd630..75c3e6e73 100644
--- a/browser_use/llm/tests/test_gemini_image.py
+++ b/browser_use/llm/tests/test_gemini_image.py
@@ -3,7 +3,6 @@ import base64
import io
import random
-from lmnr import Laminar
from PIL import Image, ImageDraw, ImageFont
from browser_use.llm.google.chat import ChatGoogle
@@ -17,8 +16,6 @@ from browser_use.llm.messages import (
UserMessage,
)
-Laminar.initialize()
-
def create_random_text_image(text: str = 'hello world', width: int = 4000, height: int = 4000) -> str:
# Create image with random background color
diff --git a/browser_use/logging_config.py b/browser_use/logging_config.py
index 0ebfc65e0..863ce6f17 100644
--- a/browser_use/logging_config.py
+++ b/browser_use/logging_config.py
@@ -138,7 +138,7 @@ def setup_logging(stream=None, log_level=None, force_setup=False, debug_log_file
# Create debug log file handler
if debug_log_file:
- debug_handler = logging.FileHandler(debug_log_file)
+ debug_handler = logging.FileHandler(debug_log_file, encoding='utf-8')
debug_handler.setLevel(logging.DEBUG)
debug_handler.setFormatter(BrowserUseFormatter('%(asctime)s - %(levelname)-8s [%(name)s] %(message)s', logging.DEBUG))
file_handlers.append(debug_handler)
@@ -146,7 +146,7 @@ def setup_logging(stream=None, log_level=None, force_setup=False, debug_log_file
# Create info log file handler
if info_log_file:
- info_handler = logging.FileHandler(info_log_file)
+ info_handler = logging.FileHandler(info_log_file, encoding='utf-8')
info_handler.setLevel(logging.INFO)
info_handler.setFormatter(BrowserUseFormatter('%(asctime)s - %(levelname)-8s [%(name)s] %(message)s', logging.INFO))
file_handlers.append(info_handler)
diff --git a/browser_use/tools/registry/service.py b/browser_use/tools/registry/service.py
index 3a148a6d8..418862c73 100644
--- a/browser_use/tools/registry/service.py
+++ b/browser_use/tools/registry/service.py
@@ -8,6 +8,7 @@ from inspect import Parameter, iscoroutinefunction, signature
from types import UnionType
from typing import Any, Generic, Optional, TypeVar, Union, get_args, get_origin
+import pyotp
from pydantic import BaseModel, Field, RootModel, create_model
from browser_use.browser import BrowserSession
@@ -433,10 +434,17 @@ class Registry(Generic[Context]):
def recursively_replace_secrets(value: str | dict | list) -> str | dict | list:
if isinstance(value, str):
matches = secret_pattern.findall(value)
-
+ # Check if a placeholder key (e.g. x_password) appears in the LLM output parameters and replace it with the sensitive data
for placeholder in matches:
if placeholder in applicable_secrets:
- value = value.replace(f'{placeholder}', applicable_secrets[placeholder])
+ # generate a totp code if secret is a 2fa secret
+ if 'bu_2fa_code' in placeholder:
+ totp = pyotp.TOTP(applicable_secrets[placeholder], digits=6)
+ replacement_value = totp.now()
+ else:
+ replacement_value = applicable_secrets[placeholder]
+
+ value = value.replace(f'{placeholder}', replacement_value)
replaced_placeholders.add(placeholder)
else:
# Keep track of missing placeholders
diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py
index 4ea335dd2..4c857fbfd 100644
--- a/browser_use/tools/service.py
+++ b/browser_use/tools/service.py
@@ -236,17 +236,17 @@ class Tools(Generic[Context]):
return ActionResult(error=error_msg)
@self.registry.action(
- 'Wait for x seconds default 3 (max 10 seconds). This can be used to wait until the page is fully loaded.'
+ 'Wait for x seconds (default 3) (max 30 seconds). This can be used to wait until the page is fully loaded.'
)
async def wait(seconds: int = 3):
- # Cap wait time at maximum 10 seconds
+ # Cap wait time at maximum 30 seconds
# Reduce the wait time by 3 seconds to account for the llm call which takes at least 3 seconds
# So if the model decides to wait for 5 seconds, the llm call took at least 3 seconds, so we only need to wait for 2 seconds
# Note by Mert: the above doesnt make sense because we do the LLM call right after this or this could be followed by another action after which we would like to wait
# so I revert this.
- actual_seconds = min(max(seconds, 0), 10)
- memory = f'Waited for {actual_seconds} seconds'
- logger.info(f'🕒 {memory}')
+ actual_seconds = min(max(seconds - 3, 0), 30)
+ memory = f'Waited for {seconds} seconds'
+ logger.info(f'🕒 waited for {actual_seconds} seconds + 3 seconds for LLM call')
await asyncio.sleep(actual_seconds)
return ActionResult(extracted_content=memory, long_term_memory=memory)
@@ -266,7 +266,7 @@ class Tools(Generic[Context]):
# Look up the node from the selector map
node = await browser_session.get_element_by_index(params.index)
if node is None:
- raise ValueError(f'Element index {params.index} not found in DOM')
+ raise ValueError(f'Element index {params.index} not found in browser state')
event = browser_session.event_bus.dispatch(
ClickElementEvent(node=node, while_holding_ctrl=params.while_holding_ctrl or False)
@@ -315,7 +315,7 @@ class Tools(Generic[Context]):
# Look up the node from the selector map
node = await browser_session.get_element_by_index(params.index)
if node is None:
- raise ValueError(f'Element index {params.index} not found in DOM')
+ raise ValueError(f'Element index {params.index} not found in browser state')
# Dispatch type text event with node
try:
@@ -325,7 +325,7 @@ class Tools(Generic[Context]):
await event
input_metadata = await event.event_result(raise_if_any=True, raise_if_none=False)
msg = f"Input '{params.text}' into element {params.index}."
- logger.info(msg)
+ logger.debug(msg)
# Include input coordinates in metadata if available
return ActionResult(
@@ -669,7 +669,9 @@ You will be given a query and the markdown of a webpage that has been filtered t
raise RuntimeError(str(e))
@self.registry.action(
- """Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 10.0 for ten pages, etc.). Optional index parameter to scroll within a specific element or its scroll container (works well for dropdowns and custom UI components). If you want to scroll the entire page, don't use index.
+ """Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 10.0 for ten pages, etc.).
+ Default behavior is to scroll the entire page. This is enough for most cases.
+ Optionally, if there are multiple scroll containers, use the frame_element_index parameter with an element inside the container you want to scroll. For that you must use indices that exist in your browser_state (works well for dropdowns and custom UI components).
Instead of scrolling step after step, use a high number of pages at once like 10 to get to the bottom of the page.
If you know where you want to scroll to, use scroll_to_text instead of this tool.
""",
@@ -681,18 +683,15 @@ You will be given a query and the markdown of a webpage that has been filtered t
# Special case: index 0 means scroll the whole page (root/body element)
node = None
if params.frame_element_index is not None and params.frame_element_index != 0:
- try:
- node = await browser_session.get_element_by_index(params.frame_element_index)
- if node is None:
- # Element not found - return error
- raise ValueError(f'Element index {params.frame_element_index} not found in DOM')
- except Exception as e:
- # Error getting element - return error
- raise ValueError(f'Failed to get element {params.frame_element_index}: {e}') from e
+ node = await browser_session.get_element_by_index(params.frame_element_index)
+ if node is None:
+ # Element does not exist
+ msg = f'Element index {params.frame_element_index} not found in browser state'
+ return ActionResult(error=msg)
# Dispatch scroll event with node - the complex logic is handled in the event handler
- # Convert pages to pixels (assuming 800px per page as standard viewport height)
- pixels = int(params.num_pages * 800)
+ # Convert pages to pixels (assuming 1000px per page as standard viewport height)
+ pixels = int(params.num_pages * 1000)
event = browser_session.event_bus.dispatch(
ScrollEvent(direction='down' if params.down else 'up', amount=pixels, node=node)
)
@@ -765,7 +764,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
# Dropdown Actions
@self.registry.action(
- 'Get list of option values exposed by a specific dropdown input field. Only works on dropdown-style form elements (