Merge branch 'main' into feature/mistral-support

This commit is contained in:
Mert Unsal
2025-11-29 14:30:08 -08:00
committed by GitHub
24 changed files with 929 additions and 2110 deletions

View File

@@ -38,9 +38,11 @@
<br/>
🌤️ Want to skip the setup? Use our <b>[cloud](https://cloud.browser-use.com)</b> for faster, scalable, stealth-enabled browser automation!
# 🤖 LLM Quickstart
1. Direct your favorite coding agent (Cursor, ClaudeS, etc) to [Agents.md](https://docs.browser-use.com/llms-full.txt)
1. Direct your favorite coding agent (Cursor, Claude Code, etc) to [Agents.md](https://docs.browser-use.com/llms-full.txt)
2. Prompt away!
<br/>

View File

@@ -32,7 +32,7 @@ class HistoryItem(BaseModel):
def to_string(self) -> str:
"""Get string representation of the history item"""
step_str = f'step_{self.step_number}' if self.step_number is not None else 'step_unknown'
step_str = 'step' if self.step_number is not None else 'step_unknown'
if self.error:
return f"""<{step_str}>

View File

@@ -43,13 +43,8 @@ class SystemPrompt:
"""Load the prompt template from the markdown file."""
try:
# Choose the appropriate template based on flash_mode, use_thinking, and is_anthropic
if self.is_anthropic:
if self.flash_mode:
template_filename = 'system_prompt_flash_anthropic.md'
elif self.use_thinking:
template_filename = 'system_prompt_anthropic.md'
else:
template_filename = 'system_prompt_no_thinking.md'
if self.flash_mode and self.is_anthropic:
template_filename = 'system_prompt_flash_anthropic.md'
elif self.flash_mode:
template_filename = 'system_prompt_flash.md'
elif self.use_thinking:

View File

@@ -213,16 +213,12 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if llm.provider == 'browser-use':
flash_mode = True
# Auto-configure llm_screenshot_size for specific models
# Auto-configure llm_screenshot_size for Claude Sonnet models
if llm_screenshot_size is None:
model_name = getattr(llm, 'model', '')
if isinstance(model_name, str):
if model_name.startswith('claude-sonnet'):
llm_screenshot_size = (1400, 850)
logger.info('🖼️ Auto-configured LLM screenshot size for Claude Sonnet: 1400x850')
elif 'gemini' in model_name.lower():
llm_screenshot_size = (1024, 720)
logger.info('🖼️ Auto-configured LLM screenshot size for Gemini: 1024x720')
if isinstance(model_name, str) and model_name.startswith('claude-sonnet'):
llm_screenshot_size = (1400, 850)
logger.info('🖼️ Auto-configured LLM screenshot size for Claude Sonnet: 1400x850')
if page_extraction_llm is None:
page_extraction_llm = llm
@@ -934,6 +930,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Log step completion summary
summary_message = self._log_step_completion_summary(self.step_start_time, self.state.last_result)
if summary_message:
await self._demo_mode_log(summary_message, 'info', {'step': self.state.n_steps})
# Save file system state after step completion
self.save_file_system_state()
@@ -1772,6 +1770,12 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if on_step_start is not None:
await on_step_start(self)
await self._demo_mode_log(
f'Starting step {step + 1}/{max_steps}',
'info',
{'step': step + 1, 'total_steps': max_steps},
)
self.logger.debug(f'🚶 Starting step {step + 1}/{max_steps}...')
try:

View File

@@ -178,7 +178,7 @@ You must ALWAYS respond with a valid JSON in this exact format:
"thinking": "A structured <think>-style reasoning block that applies the <reasoning_rules> provided above.",
"evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
"memory": "1-3 sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
"next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
"next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
"action":[{{"navigate": {{ "url": "url_value"}}}}, // ... more actions in sequence]
}}
Action list should NEVER be empty.

View File

@@ -1,34 +0,0 @@
You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
<user_request>
User request is the ultimate objective. For tasks with specific instructions, follow each step. For open-ended tasks, plan your own approach.
</user_request>
<browser_state>
Elements: [index]<type>text</type>. Only [indexed] are interactive. Indentation=child. *[=new.
</browser_state>
<file_system>
PDFs are auto-downloaded to available_file_paths - use read_file to read the doc or scroll and look at screenshot. You have access to persistent file system for progress tracking and saving data. Long tasks >10 steps: use todo.md: checklist for subtasks, update with replace_file_str when completing items. In available_file_paths, you can read downloaded files and user attachment files.
</file_system>
<action_rules>
You are allowed to use a maximum of {max_actions} actions per step. Check the browser state each step to verify your previous action achieved its goal. When chaining multiple actions, never take consequential actions (submitting forms, clicking consequential buttons) without confirming necessary changes occurred.
Default to element indices for browser interaction. If the target index is missing or a prior index-based action failed, use screenshot coordinates instead—DOM extraction doesn't capture everything. Coordinate interaction is useful when DOM extraction fails such as interacting with Canvas, scrolling on sidebars, etc.
</action_rules>
<output>You must call the AgentOutput tool with the following schema for the arguments:
{{
"thinking": "A structured <think>-style reasoning block to analyze the current state, agent history, and plan the next goals. Analyze what happened in the last few steps.",
"evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
"memory": "1-3 sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
"next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
"action": [
{{
"action_name": {{
"parameter1": "value1",
"parameter2": "value2"
}}
}}
]
}}
Action list should NEVER be empty.
</output>

View File

@@ -1,33 +0,0 @@
You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
<user_request>
User request is the ultimate objective. For tasks with specific instructions, follow each step. For open-ended tasks, plan your own approach.
</user_request>
<browser_state>
Elements: [index]<type>text</type>. Only [indexed] are interactive. Indentation=child. *[=new.
</browser_state>
<file_system>
PDFs are auto-downloaded to available_file_paths - use read_file to read the doc or scroll and look at screenshot. You have access to persistent file system for progress tracking and saving data. Long tasks >10 steps: use todo.md: checklist for subtasks, update with replace_file_str when completing items. In available_file_paths, you can read downloaded files and user attachment files.
</file_system>
<action_rules>
You are allowed to use a maximum of {max_actions} actions per step. Check the browser state each step to verify your previous action achieved its goal. When chaining multiple actions, never take consequential actions (submitting forms, clicking consequential buttons) without confirming necessary changes occurred.
Default to element indices for browser interaction. If the target index is missing or a prior index-based action failed, use screenshot coordinates instead—DOM extraction doesn't capture everything. Coordinate interaction is useful when DOM extraction fails such as interacting with Canvas, scrolling on sidebars, etc.
</action_rules>
<output>You must call the AgentOutput tool with the following schema for the arguments:
{{
"evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
"memory": "1-3 sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
"next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
"action": [
{{
"action_name": {{
"parameter1": "value1",
"parameter2": "value2"
}}
}}
]
}}
Action list should NEVER be empty.
</output>

View File

@@ -10,8 +10,6 @@ PDFs are auto-downloaded to available_file_paths - use read_file to read the doc
</file_system>
<action_rules>
You are allowed to use a maximum of {max_actions} actions per step. Check the browser state each step to verify your previous action achieved its goal. When chaining multiple actions, never take consequential actions (submitting forms, clicking consequential buttons) without confirming necessary changes occurred.
Default to element indices for browser interaction. If the target index is missing or a prior index-based action failed, use screenshot coordinates instead—DOM extraction doesn't capture everything. Coordinate interaction is useful when DOM extraction fails such as interacting with Canvas, scrolling on sidebars, etc.
</action_rules>
<output>You must call the AgentOutput tool with the following schema for the arguments:

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -166,17 +166,6 @@ class ScrollEvent(ElementSelectedEvent[None]):
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_ScrollEvent', 8.0)) # seconds
class ScrollAtCoordinateEvent(BaseEvent[None]):
"""Scroll at specific coordinates using mouse wheel."""
coordinate_x: int
coordinate_y: int
scroll_x: int = 0 # deltaX (positive=right, negative=left)
scroll_y: int = 0 # deltaY (positive=down, negative=up)
event_timeout: float | None = _get_timeout('TIMEOUT_ScrollAtCoordinateEvent', 8.0) # seconds
class SwitchTabEvent(BaseEvent[TargetID]):
"""Switch to a different tab."""

View File

@@ -595,10 +595,6 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
default=False,
description='Enable demo mode side panel that streams agent logs directly inside the browser window (requires headless=False).',
)
demo_mode_display: Literal['full', 'last'] = Field(
default='last',
description="Display mode for demo panel: 'full' shows complete log panel, 'last' shows only latest action and memory in bottom-right corner",
)
cookie_whitelist_domains: list[str] = Field(
default_factory=lambda: ['nature.com', 'qatarairways.com'],
description='List of domains to whitelist in the "I still don\'t care about cookies" extension, preventing automatic cookie banner handling on these sites.',

View File

@@ -436,7 +436,6 @@ class BrowserSession(BaseModel):
_cloud_browser_client: CloudBrowserClient = PrivateAttr(default_factory=lambda: CloudBrowserClient())
_demo_mode: 'DemoMode | None' = PrivateAttr(default=None)
_demo_nav_handler_event_bus: EventBus | None = PrivateAttr(default=None)
_logger: Any = PrivateAttr(default=None)
@@ -518,13 +517,15 @@ class BrowserSession(BaseModel):
self._demo_mode.reset()
self._demo_mode = None
self.logger.debug('✅ Browser session reset complete')
self.logger.info('✅ Browser session reset complete')
def model_post_init(self, __context) -> None:
"""Register event handlers after model initialization."""
self._connection_lock = asyncio.Lock()
# Check if handlers are already registered to prevent duplicates
from browser_use.browser.watchdog_base import BaseWatchdog
start_handlers = self.event_bus.handlers.get('BrowserStartEvent', [])
start_handler_names = [getattr(h, '__name__', str(h)) for h in start_handlers]
@@ -535,16 +536,9 @@ class BrowserSession(BaseModel):
'This likely means BrowserSession was initialized multiple times with the same EventBus.'
)
self._register_essential_handlers()
def _register_essential_handlers(self) -> None:
"""Register all essential event handlers on the current event bus."""
from browser_use.browser.watchdog_base import BaseWatchdog
BaseWatchdog.attach_handler_to_session(self, BrowserStartEvent, self.on_BrowserStartEvent)
BaseWatchdog.attach_handler_to_session(self, BrowserStopEvent, self.on_BrowserStopEvent)
BaseWatchdog.attach_handler_to_session(self, NavigateToUrlEvent, self.on_NavigateToUrlEvent)
self._ensure_demo_mode_handlers()
BaseWatchdog.attach_handler_to_session(self, SwitchTabEvent, self.on_SwitchTabEvent)
BaseWatchdog.attach_handler_to_session(self, TabCreatedEvent, self.on_TabCreatedEvent)
BaseWatchdog.attach_handler_to_session(self, TabClosedEvent, self.on_TabClosedEvent)
@@ -552,14 +546,6 @@ class BrowserSession(BaseModel):
BaseWatchdog.attach_handler_to_session(self, FileDownloadedEvent, self.on_FileDownloadedEvent)
BaseWatchdog.attach_handler_to_session(self, CloseTabEvent, self.on_CloseTabEvent)
def _ensure_demo_mode_handlers(self) -> None:
"""Ensure demo mode handlers are attached to the active event bus."""
if self._demo_nav_handler_event_bus is self.event_bus:
return
self.event_bus.on(NavigationCompleteEvent, self._on_demo_mode_navigation_complete)
self._demo_nav_handler_event_bus = self.event_bus
@observe_debug(ignore_input=True, ignore_output=True, name='browser_session_start')
async def start(self) -> None:
"""Start the browser session."""
@@ -586,8 +572,6 @@ class BrowserSession(BaseModel):
await self.reset()
# Create fresh event bus
self.event_bus = EventBus()
# Re-register all essential handlers on the new event bus
self._register_essential_handlers()
async def stop(self) -> None:
"""Stop the browser session without killing the browser process.
@@ -612,8 +596,6 @@ class BrowserSession(BaseModel):
await self.reset()
# Create fresh event bus
self.event_bus = EventBus()
# Re-register all essential handlers on the new event bus
self._register_essential_handlers()
@observe_debug(ignore_input=True, ignore_output=True, name='browser_start_event_handler')
async def on_BrowserStartEvent(self, event: BrowserStartEvent) -> dict[str, str]:
@@ -904,20 +886,6 @@ class BrowserSession(BaseModel):
else:
self.logger.warning(f'⚠️ Page readiness timeout ({timeout}s, {duration_ms:.0f}ms) for {url}')
async def _on_demo_mode_navigation_complete(self, event: NavigationCompleteEvent) -> None:
"""Rehydrate the demo overlay and logs after navigation."""
if not self.browser_profile.demo_mode:
return
demo = self.demo_mode
if not demo:
return
try:
await demo.refresh_target(event.target_id)
except Exception as exc:
self.logger.debug(f'[DemoMode] Failed to refresh overlay for target {event.target_id[:8]}...: {exc}')
async def on_SwitchTabEvent(self, event: SwitchTabEvent) -> TargetID:
"""Handle tab switching - core browser functionality."""
if not self.agent_focus_target_id:
@@ -1075,7 +1043,7 @@ class BrowserSession(BaseModel):
self.logger.debug(f'Failed to cleanup cloud browser session: {e}')
# Clear CDP session cache before stopping
self.logger.debug(
self.logger.info(
f'📢 on_BrowserStopEvent - Calling reset() (force={event.force}, keep_alive={self.browser_profile.keep_alive})'
)
await self.reset()
@@ -2957,19 +2925,19 @@ class BrowserSession(BaseModel):
"""Clear geolocation override using CDP."""
await self.cdp_client.send.Emulation.clearGeolocationOverride()
async def _cdp_add_init_script(self, script: str, target_id: TargetID | None = None) -> str:
"""Add script to evaluate on new document for a specific target."""
async def _cdp_add_init_script(self, script: str) -> str:
"""Add script to evaluate on new document using CDP Page.addScriptToEvaluateOnNewDocument."""
assert self._cdp_client_root is not None
cdp_session = await self.get_or_create_cdp_session(target_id=target_id, focus=target_id is None)
cdp_session = await self.get_or_create_cdp_session()
result = await cdp_session.cdp_client.send.Page.addScriptToEvaluateOnNewDocument(
params={'source': script, 'runImmediately': True}, session_id=cdp_session.session_id
)
return result['identifier']
async def _cdp_remove_init_script(self, identifier: str, target_id: TargetID | None = None) -> None:
"""Remove script added with addScriptToEvaluateOnNewDocument for a target."""
cdp_session = await self.get_or_create_cdp_session(target_id=target_id, focus=target_id is None)
async def _cdp_remove_init_script(self, identifier: str) -> None:
"""Remove script added with addScriptToEvaluateOnNewDocument."""
cdp_session = await self.get_or_create_cdp_session(target_id=None)
await cdp_session.cdp_client.send.Page.removeScriptToEvaluateOnNewDocument(
params={'identifier': identifier}, session_id=cdp_session.session_id
)

View File

@@ -181,7 +181,7 @@ class SessionManager:
self._target_sessions.clear()
self._session_to_target.clear()
self.logger.debug('[SessionManager] Cleared all owned data (targets, sessions, mappings)')
self.logger.info('[SessionManager] Cleared all owned data (targets, sessions, mappings)')
async def is_target_valid(self, target_id: TargetID) -> bool:
"""Check if a target is still valid and has active sessions.
@@ -458,14 +458,6 @@ class SessionManager:
except Exception as e:
self.logger.warning(f'[SessionManager] Failed to resume execution: {e}')
if target_type in ('page', 'tab') and self.browser_session.browser_profile.demo_mode:
demo = self.browser_session.demo_mode
if demo:
try:
await demo.register_new_target(target_id)
except Exception as exc:
self.logger.debug(f'[SessionManager] Failed to register demo overlay for {target_id[:8]}...: {exc}')
async def _handle_target_info_changed(self, event: dict) -> None:
"""Handle Target.targetInfoChanged event.
@@ -478,30 +470,13 @@ class SessionManager:
if not target_id:
return
url_changed = False
target_type = None
async with self._lock:
# Update target if it exists (source of truth for url/title)
if target_id in self._targets:
target = self._targets[target_id]
target_type = target.target_type
previous_url = target.url
new_url = target_info.get('url', previous_url)
target.title = target_info.get('title', target.title)
target.url = new_url
url_changed = previous_url != new_url
if url_changed and target_type in ('page', 'tab') and self.browser_session.browser_profile.demo_mode:
demo = self.browser_session.demo_mode
if demo:
try:
await demo.refresh_target(target_id)
except Exception as exc:
self.logger.debug(
f'[SessionManager] Failed to refresh demo overlay after URL change for {target_id[:8]}...: {exc}'
)
target.url = target_info.get('url', target.url)
async def _handle_target_detached(self, event: DetachedFromTargetEvent) -> None:
"""Handle Target.detachedFromTarget event.
@@ -592,9 +567,6 @@ class SessionManager:
self.browser_session.event_bus.dispatch(TabClosedEvent(target_id=target_id))
self.logger.debug(f'[SessionManager] Dispatched TabClosedEvent for page target {target_id[:8]}...')
demo = self.browser_session.demo_mode
if demo:
demo.unregister_target(target_id)
elif target_type:
self.logger.debug(
f'[SessionManager] Target {target_id[:8]}... fully removed (type={target_type}) - not dispatching TabClosedEvent'

View File

@@ -13,7 +13,6 @@ from browser_use.browser.events import (
GoBackEvent,
GoForwardEvent,
RefreshEvent,
ScrollAtCoordinateEvent,
ScrollEvent,
ScrollToTextEvent,
SelectDropdownOptionEvent,
@@ -384,46 +383,6 @@ class DefaultActionWatchdog(BaseWatchdog):
except Exception as e:
raise
async def on_ScrollAtCoordinateEvent(self, event: ScrollAtCoordinateEvent) -> None:
"""Handle scroll at specific coordinates using CDP synthesizeScrollGesture."""
# Check if we have a current target for scrolling
if not self.browser_session.agent_focus_target_id:
error_msg = 'No active target for scrolling'
raise BrowserError(error_msg)
try:
# Get focused CDP session
cdp_session = await self.browser_session.get_or_create_cdp_session()
cdp_client = cdp_session.cdp_client
session_id = cdp_session.session_id
# Convert scroll deltas to gesture distances
# Note: synthesizeScrollGesture uses opposite directions:
# - positive yDistance = scroll UP (opposite of mouseWheel deltaY)
# - positive xDistance = scroll LEFT (opposite of mouseWheel deltaX)
# So we negate the values to maintain the same behavior as before
params: dict[str, float] = {
'x': float(event.coordinate_x),
'y': float(event.coordinate_y),
}
if event.scroll_x != 0:
params['xDistance'] = float(-event.scroll_x)
if event.scroll_y != 0:
params['yDistance'] = float(-event.scroll_y)
# Synthesize scroll gesture at the specified coordinates
await cdp_client.send.Input.synthesizeScrollGesture(
params=params, # type: ignore[arg-type]
session_id=session_id,
)
self.logger.debug(
f'📄 Scrolled at ({event.coordinate_x}, {event.coordinate_y}) by deltaX={event.scroll_x}, deltaY={event.scroll_y}'
)
return None
except Exception as e:
raise
# ========== Implementation Methods ==========
async def _check_element_occlusion(self, backend_node_id: int, x: float, y: float, cdp_session) -> bool:
@@ -961,6 +920,8 @@ class DefaultActionWatchdog(BaseWatchdog):
},
session_id=cdp_session.session_id,
)
# Add 10ms delay between keystrokes
await asyncio.sleep(0.010)
except Exception as e:
raise Exception(f'Failed to type to page: {str(e)}')
@@ -2263,6 +2224,9 @@ class DefaultActionWatchdog(BaseWatchdog):
session_id=cdp_session.session_id,
)
# Small delay between characters (10ms)
await asyncio.sleep(0.010)
self.logger.info(f'⌨️ Sent keys: {event.keys}')
# Note: We don't clear cached state on Enter; multi_act will detect DOM changes

View File

@@ -50,7 +50,7 @@ class ScreenshotWatchdog(BaseWatchdog):
raise BrowserError('[ScreenshotWatchdog] No page targets available for screenshot')
target_id = page_targets[-1].target_id
cdp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
cdp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=True)
# Prepare screenshot parameters
params = CaptureScreenshotParameters(format='png', captureBeyondViewport=False)

View File

@@ -295,6 +295,7 @@ class CodeAgent:
# Main execution loop
for step in range(self.max_steps):
logger.info(f'\n\n\n\n\n\n\nStep {step + 1}/{self.max_steps}')
await self._demo_mode_log(f'Starting step {step + 1}/{self.max_steps}', 'info', {'step': step + 1})
# Start timing this step
self._step_start_time = datetime.datetime.now().timestamp()

View File

@@ -189,6 +189,8 @@ class ChatVercel(BaseChatModel):
prompt-based JSON extraction. Auto-detects common reasoning models by default.
timeout: Request timeout in seconds
max_retries: Maximum number of retries for failed requests
provider_options: Provider routing options for the gateway. Use this to control which
providers are used and in what order. Example: {'gateway': {'order': ['vertex', 'anthropic']}}
"""
# Model configuration
@@ -218,6 +220,7 @@ class ChatVercel(BaseChatModel):
default_query: Mapping[str, object] | None = None
http_client: httpx.AsyncClient | None = None
_strict_response_validation: bool = False
provider_options: dict[str, Any] | None = None
# Static
@property
@@ -382,6 +385,8 @@ class ChatVercel(BaseChatModel):
model_params['max_tokens'] = self.max_tokens
if self.top_p is not None:
model_params['top_p'] = self.top_p
if self.provider_options:
model_params['extra_body'] = {'providerOptions': self.provider_options}
if output_format is None:
# Return string response
@@ -400,11 +405,12 @@ class ChatVercel(BaseChatModel):
else:
is_google_model = self.model.startswith('google/')
is_anthropic_model = self.model.startswith('anthropic/')
is_reasoning_model = self.reasoning_models and any(
str(pattern).lower() in str(self.model).lower() for pattern in self.reasoning_models
)
if is_google_model or is_reasoning_model:
if is_google_model or is_anthropic_model or is_reasoning_model:
modified_messages = [m.model_copy(deep=True) for m in messages]
schema = SchemaOptimizer.create_gemini_optimized_schema(output_format)
@@ -431,10 +437,14 @@ class ChatVercel(BaseChatModel):
vercel_messages = VercelMessageSerializer.serialize_messages(modified_messages)
request_params = model_params.copy()
if self.provider_options:
request_params['extra_body'] = {'providerOptions': self.provider_options}
response = await self.get_client().chat.completions.create(
model=self.model,
messages=vercel_messages,
**model_params,
**request_params,
)
content = response.choices[0].message.content if response.choices else None
@@ -479,6 +489,10 @@ class ChatVercel(BaseChatModel):
'schema': schema,
}
request_params = model_params.copy()
if self.provider_options:
request_params['extra_body'] = {'providerOptions': self.provider_options}
response = await self.get_client().chat.completions.create(
model=self.model,
messages=vercel_messages,
@@ -486,7 +500,7 @@ class ChatVercel(BaseChatModel):
json_schema=response_format_schema,
type='json_schema',
),
**model_params,
**request_params,
)
content = response.choices[0].message.content if response.choices else None

View File

@@ -19,7 +19,6 @@ from browser_use.browser.events import (
GetDropdownOptionsEvent,
GoBackEvent,
NavigateToUrlEvent,
ScrollAtCoordinateEvent,
ScrollEvent,
ScrollToTextEvent,
SendKeysEvent,
@@ -45,7 +44,6 @@ from browser_use.tools.views import (
NavigateAction,
NoParamsAction,
ScrollAction,
ScrollAtCoordinateAction,
SearchAction,
SelectDropdownOptionAction,
SendKeysAction,
@@ -62,7 +60,6 @@ logger = logging.getLogger(__name__)
ClickElementEvent.model_rebuild()
TypeTextEvent.model_rebuild()
ScrollEvent.model_rebuild()
ScrollAtCoordinateEvent.model_rebuild()
UploadFileEvent.model_rebuild()
Context = TypeVar('Context')
@@ -254,25 +251,6 @@ class Tools(Generic[Context]):
return actual_x, actual_y
return llm_x, llm_y
def _convert_llm_scroll_deltas_to_viewport(
llm_scroll_x: int, llm_scroll_y: int, browser_session: BrowserSession
) -> tuple[int, int]:
"""Convert scroll deltas from LLM screenshot size to original viewport size."""
if browser_session.llm_screenshot_size and browser_session._original_viewport_size:
original_width, original_height = browser_session._original_viewport_size
llm_width, llm_height = browser_session.llm_screenshot_size
# Scale scroll deltas using the same ratio as coordinates
actual_scroll_x = int((llm_scroll_x / llm_width) * original_width)
actual_scroll_y = int((llm_scroll_y / llm_height) * original_height)
logger.info(
f'🔄 Scaling scroll deltas: LLM ({llm_scroll_x}, {llm_scroll_y}) @ {llm_width}x{llm_height} '
f'→ Viewport ({actual_scroll_x}, {actual_scroll_y}) @ {original_width}x{original_height}'
)
return actual_scroll_x, actual_scroll_y
return llm_scroll_x, llm_scroll_y
# Element Interaction Actions
async def _click_by_coordinate(params: ClickElementAction, browser_session: BrowserSession) -> ActionResult:
# Ensure coordinates are provided (type safety)
@@ -374,7 +352,7 @@ class Tools(Generic[Context]):
return ActionResult(error=error_msg)
@self.registry.action(
'Click element by index or coordinates',
'Click element by index or coordinates. Prefer index over coordinates when possible. Either provide coordinates or index.',
param_model=ClickElementAction,
)
async def click(params: ClickElementAction, browser_session: BrowserSession):
@@ -390,7 +368,7 @@ class Tools(Generic[Context]):
return await _click_by_coordinate(params, browser_session)
@self.registry.action(
'Input text into element with index.',
'Input text into element with index. Only works with index, NEVER use coordinates for inputting text.',
param_model=InputTextAction,
)
async def input(
@@ -810,14 +788,25 @@ You will be given a query and the markdown of a webpage that has been filtered t
raise RuntimeError(str(e))
@self.registry.action(
"""Scroll by pages where one page = viewport height. Set down=True to scroll down, down=False to scroll up. Defaults to scrolling down one page.""",
"""Scroll by pages. REQUIRED: down=True/False (True=scroll down, False=scroll up, default=True). Optional: pages=0.5-10.0 (default 1.0). Use index for scroll containers (dropdowns/custom UI). High pages (10) reaches bottom. Multi-page scrolls sequentially. Viewport-based height, fallback 1000px/page.""",
param_model=ScrollAction,
)
async def scroll(params: ScrollAction, browser_session: BrowserSession):
try:
direction = 'down' if params.down else 'up'
# Look up the node from the selector map if index is provided
# Special case: index 0 means scroll the whole page (root/body element)
node = None
if params.index is not None and params.index != 0:
node = await browser_session.get_element_by_index(params.index)
if node is None:
# Element does not exist
msg = f'Element index {params.index} not found in browser state'
return ActionResult(error=msg)
# Get actual viewport height for scrolling
direction = 'down' if params.down else 'up'
target = f'element {params.index}' if params.index is not None and params.index != 0 else ''
# Get actual viewport height for more accurate scrolling
try:
cdp_session = await browser_session.get_or_create_cdp_session()
metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)
@@ -834,12 +823,6 @@ You will be given a query and the markdown of a webpage that has been filtered t
viewport_height = 1000 # Fallback to 1000px
logger.debug(f'Failed to get viewport height, using fallback 1000px: {e}')
# For reporting to LLM, use LLM screenshot height if available (so LLM's mental model matches)
if browser_session.llm_screenshot_size:
_, llm_viewport_height = browser_session.llm_screenshot_size
else:
llm_viewport_height = viewport_height
# For multiple pages (>=1.0), scroll one page at a time to ensure each scroll completes
if params.pages >= 1.0:
import asyncio
@@ -857,7 +840,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
pixels = -pixels
event = browser_session.event_bus.dispatch(
ScrollEvent(direction=direction, amount=abs(pixels), node=None)
ScrollEvent(direction=direction, amount=abs(pixels), node=node)
)
await event
await event.event_result(raise_if_any=True, raise_if_none=False)
@@ -878,7 +861,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
pixels = -pixels
event = browser_session.event_bus.dispatch(
ScrollEvent(direction=direction, amount=abs(pixels), node=None)
ScrollEvent(direction=direction, amount=abs(pixels), node=node)
)
await event
await event.event_result(raise_if_any=True, raise_if_none=False)
@@ -888,19 +871,18 @@ You will be given a query and the markdown of a webpage that has been filtered t
logger.warning(f'Fractional scroll failed: {e}')
if params.pages == 1.0:
long_term_memory = f'Scrolled {direction} {llm_viewport_height}px'
long_term_memory = f'Scrolled {direction} {target} {viewport_height}px'.replace('  ', ' ')
else:
long_term_memory = f'Scrolled {direction} {completed_scrolls:.1f} pages'
long_term_memory = f'Scrolled {direction} {target} {completed_scrolls:.1f} pages'.replace('  ', ' ')
else:
# For fractional pages <1.0, do single scroll
pixels = int(params.pages * viewport_height)
event = browser_session.event_bus.dispatch(
ScrollEvent(direction='down' if params.down else 'up', amount=pixels, node=None)
ScrollEvent(direction='down' if params.down else 'up', amount=pixels, node=node)
)
await event
await event.event_result(raise_if_any=True, raise_if_none=False)
llm_pixels = int(params.pages * llm_viewport_height)
long_term_memory = f'Scrolled {direction} {llm_pixels}px'
long_term_memory = f'Scrolled {direction} {target} {params.pages} pages'.replace('  ', ' ')
msg = f'🔍 {long_term_memory}'
logger.info(msg)
@@ -910,54 +892,6 @@ You will be given a query and the markdown of a webpage that has been filtered t
error_msg = 'Failed to execute scroll action.'
return ActionResult(error=error_msg)
@self.registry.action(
    'Scroll at specific coordinates. Use when you need to scroll within a specific area (e.g., scrollable containers, maps, modals) not reachable via element index.',
    param_model=ScrollAtCoordinateAction,
)
async def scroll_at_coordinates(params: ScrollAtCoordinateAction, browser_session: BrowserSession):
    # Scrolls by (scroll_x, scroll_y) at the point (coordinate_x, coordinate_y).
    # Both the point and the deltas are converted from LLM screenshot size to
    # viewport size before the event is dispatched; the memory/log text reports
    # the values exactly as they were supplied in `params`.
    # Returns an ActionResult: message + long-term memory on success, or an
    # error string if anything in the body raises.
    try:
        # Point: LLM screenshot size -> viewport size.
        viewport_x, viewport_y = _convert_llm_coordinates_to_viewport(
            params.coordinate_x, params.coordinate_y, browser_session
        )

        # Deltas: LLM screenshot size -> viewport size.
        viewport_dx, viewport_dy = _convert_llm_scroll_deltas_to_viewport(
            params.scroll_x, params.scroll_y, browser_session
        )

        scroll_event = ScrollAtCoordinateEvent(
            coordinate_x=viewport_x,
            coordinate_y=viewport_y,
            scroll_x=viewport_dx,
            scroll_y=viewport_dy,
        )
        dispatched = browser_session.event_bus.dispatch(scroll_event)
        await dispatched
        await dispatched.event_result(raise_if_any=True, raise_if_none=False)

        # Describe each non-zero axis, vertical first, e.g. "down 300px and left 40px".
        axes = (
            (params.scroll_y, 'down', 'up'),
            (params.scroll_x, 'right', 'left'),
        )
        moves = [
            f'{forward if delta > 0 else backward} {abs(delta)}px'
            for delta, forward, backward in axes
            if delta != 0
        ]
        # An empty join is falsy, so a no-op scroll is reported as "zero".
        scroll_desc = ' and '.join(moves) or 'zero'

        summary = f'Scrolled {scroll_desc} at ({params.coordinate_x}, {params.coordinate_y})'
        log_line = f'🔍 {summary}'
        logger.info(log_line)
        return ActionResult(extracted_content=log_line, long_term_memory=summary)
    except Exception as exc:
        # Full exception detail goes to the log; the returned error only names the target point.
        logger.error(f'Failed to scroll at coordinates: {type(exc).__name__}: {exc}')
        return ActionResult(error=f'Failed to scroll at ({params.coordinate_x}, {params.coordinate_y})')
@self.registry.action(
'',
param_model=SendKeysAction,

View File

@@ -73,7 +73,8 @@ class CloseTabAction(BaseModel):
class ScrollAction(BaseModel):
    # Parameters accepted by the scroll action.
    # (Documented with comments rather than a class docstring so the model's
    # generated schema is left untouched.)

    # Direction flag: True scrolls down, False scrolls up.
    down: bool = Field(default=True, description='down=True=scroll down, down=False scroll up')
    # Number of pages to scroll; fractional values are allowed.
    # `pages` was previously declared twice in this class body. The later
    # declaration is the one that takes effect, so only that one is kept.
    pages: float = Field(default=1.0, description='0.5=half page, 1=full page, 10=to bottom/top')
    # Optional element index; None (the default) means no specific container is targeted.
    index: int | None = Field(default=None, description='Optional element index to scroll within specific container')
class SendKeysAction(BaseModel):
@@ -96,10 +97,3 @@ class GetDropdownOptionsAction(BaseModel):
class SelectDropdownOptionAction(BaseModel):
index: int
text: str = Field(description='exact text/value')
class ScrollAtCoordinateAction(BaseModel):
    # Parameters for a scroll performed at a specific point.
    # The two coordinates are required; both scroll deltas default to 0.

    coordinate_x: int = Field(
        description='Horizontal coordinate relative to viewport left edge',
    )
    coordinate_y: int = Field(
        description='Vertical coordinate relative to viewport top edge',
    )
    scroll_x: int = Field(
        default=0,
        description='Horizontal scroll delta (positive=right, negative=left)',
    )
    scroll_y: int = Field(
        default=0,
        description='Vertical scroll delta (positive=down, negative=up)',
    )

View File

@@ -143,15 +143,21 @@ Sandboxes are the **easiest way to run Browser-Use in production**. We handle ag
To run in production with authentication, just add `@sandbox` to your function:
```python
import asyncio
from browser_use import Browser, sandbox, ChatBrowserUse
from browser_use.agent.service import Agent
@sandbox(cloud_profile_id='your-profile-id')
async def production_task(browser: Browser):
agent = Agent(task="Your authenticated task", browser=browser, llm=ChatBrowserUse())
agent = Agent(
task="Your authenticated task",
browser=browser,
llm=ChatBrowserUse(),
)
await agent.run()
await production_task()
if __name__ == "__main__":
asyncio.run(production_task())
```
See [Going to Production](/production) for how to sync your cookies to the cloud.

View File

@@ -358,12 +358,24 @@ api_key = os.getenv('VERCEL_API_KEY')
if not api_key:
raise ValueError('VERCEL_API_KEY is not set')
# Use Vercel AI Gateway
# Basic usage
llm = ChatVercel(
model='openai/gpt-4o',
api_key=api_key,
)
# With provider options - control which providers are used and in what order
# This will try Vertex AI first, then fall back to Anthropic if Vertex fails
llm_with_provider_options = ChatVercel(
model='anthropic/claude-sonnet-4',
api_key=api_key,
provider_options={
'gateway': {
'order': ['vertex', 'anthropic'] # Try Vertex AI first, then Anthropic
}
},
)
agent = Agent(
task="Your task here",
llm=llm

View File

@@ -24,19 +24,38 @@ api_key = os.getenv('VERCEL_API_KEY')
if not api_key:
raise ValueError('VERCEL_API_KEY is not set')
# Basic usage
llm = ChatVercel(
model='openai/gpt-4o',
api_key=api_key,
)
# Example with provider options - control which providers are used and in what order
# This will try Vertex AI first, then fall back to Anthropic if Vertex fails
llm_with_provider_options = ChatVercel(
model='anthropic/claude-sonnet-4',
api_key=api_key,
provider_options={
'gateway': {
'order': ['vertex', 'anthropic'] # Try Vertex AI first, then Anthropic
}
},
)
agent = Agent(
task='Go to example.com and summarize the main content',
llm=llm,
)
agent_with_provider_options = Agent(
task='Go to example.com and summarize the main content',
llm=llm_with_provider_options,
)
async def main():
    """Run the basic agent, then the provider-options agent, each capped at 10 steps."""
    # Sequential on purpose: the second run starts only after the first finishes.
    for runner in (agent, agent_with_provider_options):
        await runner.run(max_steps=10)
if __name__ == '__main__':

View File

@@ -2,7 +2,7 @@
name = "browser-use"
description = "Make websites accessible for AI agents"
authors = [{ name = "Gregor Zunic" }]
version = "0.9.7"
version = "0.10.1"
readme = "README.md"
requires-python = ">=3.11,<4.0"
classifiers = [
@@ -100,7 +100,7 @@ build-backend = "hatchling.build"
[tool.codespell]
ignore-words-list = "bu,wit,dont,cant,wont,re-use,re-used,re-using,re-usable,thats,doesnt,doubleclick,finaly,finalY,initialY"
ignore-words-list = "bu,wit,dont,cant,wont,re-use,re-used,re-using,re-usable,thats,doesnt,doubleclick,finaly,finalY"
skip = "*.json"
[tool.ruff]
@@ -166,9 +166,6 @@ include = [
"browser_use/agent/system_prompt_no_thinking.md",
"browser_use/agent/system_prompt_flash.md",
"browser_use/agent/system_prompt_flash_anthropic.md",
"browser-use/agent/system_prompt_anthropic_no_thinking.md",
"browser_use/agent/system_prompt_flash_anthropic.md",
"browser-use/agent/system_prompt_anthropic.md",
"browser_use/code_use/system_prompt.md",
"browser_use/cli_templates/*.py",
"browser_use/py.typed",