Merge branch 'main' into feature/mistral-support

2026-05-13 17:56:35 +02:00 · 2025-11-29 14:30:08 -08:00
parent d58ae80b50 6f403e5f32
commit 38b3bb2edd
24 changed files with 929 additions and 2110 deletions
--- a/README.md
+++ b/README.md
@@ -38,9 +38,11 @@

 </br>

+🌤️ Want to skip the setup? Use our <b>[cloud](https://cloud.browser-use.com)</b> for faster, scalable, stealth-enabled browser automation!
+
 # 🤖 LLM Quickstart

-1. Direct your favorite coding agent (Cursor, ClaudeS, etc) to [Agents.md](https://docs.browser-use.com/llms-full.txt)
+1. Direct your favorite coding agent (Cursor, Claude Code, etc.) to [Agents.md](https://docs.browser-use.com/llms-full.txt)
 2. Prompt away!

 <br/>
--- a/browser_use/agent/message_manager/views.py
+++ b/browser_use/agent/message_manager/views.py
@@ -32,7 +32,7 @@ class HistoryItem(BaseModel):

 	def to_string(self) -> str:
 		"""Get string representation of the history item"""
-		step_str = f'step_{self.step_number}' if self.step_number is not None else 'step_unknown'
+		step_str = 'step' if self.step_number is not None else 'step_unknown'

 		if self.error:
 			return f"""<{step_str}>
--- a/browser_use/agent/prompts.py
+++ b/browser_use/agent/prompts.py
@@ -43,13 +43,8 @@ class SystemPrompt:
 		"""Load the prompt template from the markdown file."""
 		try:
 			# Choose the appropriate template based on flash_mode, use_thinking, and is_anthropic
-			if self.is_anthropic:
-				if self.flash_mode:
-					template_filename = 'system_prompt_flash_anthropic.md'
-				elif self.use_thinking:
-					template_filename = 'system_prompt_anthropic.md'
-				else:
-					template_filename = 'system_prompt_no_thinking.md'
+			if self.flash_mode and self.is_anthropic:
+				template_filename = 'system_prompt_flash_anthropic.md'
 			elif self.flash_mode:
 				template_filename = 'system_prompt_flash.md'
 			elif self.use_thinking:
--- a/browser_use/agent/service.py
+++ b/browser_use/agent/service.py
@@ -213,16 +213,12 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		if llm.provider == 'browser-use':
 			flash_mode = True

-		# Auto-configure llm_screenshot_size for specific models
+		# Auto-configure llm_screenshot_size for Claude Sonnet models
 		if llm_screenshot_size is None:
 			model_name = getattr(llm, 'model', '')
-			if isinstance(model_name, str):
-				if model_name.startswith('claude-sonnet'):
-					llm_screenshot_size = (1400, 850)
-					logger.info('🖼️  Auto-configured LLM screenshot size for Claude Sonnet: 1400x850')
-				elif 'gemini' in model_name.lower():
-					llm_screenshot_size = (1024, 720)
-					logger.info('🖼️  Auto-configured LLM screenshot size for Gemini: 1024x720')
+			if isinstance(model_name, str) and model_name.startswith('claude-sonnet'):
+				llm_screenshot_size = (1400, 850)
+				logger.info('🖼️  Auto-configured LLM screenshot size for Claude Sonnet: 1400x850')

 		if page_extraction_llm is None:
 			page_extraction_llm = llm
@@ -934,6 +930,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):

 		# Log step completion summary
 		summary_message = self._log_step_completion_summary(self.step_start_time, self.state.last_result)
+		if summary_message:
+			await self._demo_mode_log(summary_message, 'info', {'step': self.state.n_steps})

 		# Save file system state after step completion
 		self.save_file_system_state()
@@ -1772,6 +1770,12 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		if on_step_start is not None:
 			await on_step_start(self)

+		await self._demo_mode_log(
+			f'Starting step {step + 1}/{max_steps}',
+			'info',
+			{'step': step + 1, 'total_steps': max_steps},
+		)
+
 		self.logger.debug(f'🚶 Starting step {step + 1}/{max_steps}...')

 		try:
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -178,7 +178,7 @@ You must ALWAYS respond with a valid JSON in this exact format:
  "thinking": "A structured <think>-style reasoning block that applies the <reasoning_rules> provided above.",
  "evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
  "memory": "1-3 sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
-  "next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
+  "next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
  "action":[{{"navigate": {{ "url": "url_value"}}}}, // ... more actions in sequence]
 }}
 Action list should NEVER be empty.
--- a/browser_use/agent/system_prompt_anthropic.md
+++ b/browser_use/agent/system_prompt_anthropic.md
@@ -1,34 +0,0 @@
-You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
-<user_request>
-User request is the ultimate objective. For tasks with specific instructions, follow each step. For open-ended tasks, plan your own approach.
-</user_request>
-<browser_state>
-Elements: [index]<type>text</type>. Only [indexed] are interactive. Indentation=child. *[=new.
-</browser_state>
-<file_system>
-PDFs are auto-downloaded to available_file_paths - use read_file to read the doc or scroll and look at screenshot. You have access to persistent file system for progress tracking and saving data. Long tasks >10 steps: use todo.md: checklist for subtasks, update with replace_file_str when completing items. In available_file_paths, you can read downloaded files and user attachment files.
-</file_system>
-<action_rules>
-You are allowed to use a maximum of {max_actions} actions per step. Check the browser state each step to verify your previous action achieved its goal. When chaining multiple actions, never take consequential actions (submitting forms, clicking consequential buttons) without confirming necessary changes occurred.
-
-Default to element indices for browser interaction. If the target index is missing or a prior index-based action failed, use screenshot coordinates instead—DOM extraction doesn't capture everything. Coordinate interaction is useful when DOM extraction fails such as interacting with Canvas, scrolling on sidebars, etc.
-</action_rules>
-<output>You must call the AgentOutput tool with the following schema for the arguments:
-
-{{
-  "thinking": "A structured <think>-style reasoning block to analyze the current state, agent history, and plan the next goals. Analyze what happened in the last few steps.",
-  "evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
-  "memory": "1-3 sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
-  "next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
-  "action": [
-    {{
-      "action_name": {{
-        "parameter1": "value1",
-        "parameter2": "value2"
-      }}
-    }}
-  ]
-}}
-
-Action list should NEVER be empty.
-</output>
--- a/browser_use/agent/system_prompt_anthropic_no_thinking.md
+++ b/browser_use/agent/system_prompt_anthropic_no_thinking.md
@@ -1,33 +0,0 @@
-You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
-<user_request>
-User request is the ultimate objective. For tasks with specific instructions, follow each step. For open-ended tasks, plan your own approach.
-</user_request>
-<browser_state>
-Elements: [index]<type>text</type>. Only [indexed] are interactive. Indentation=child. *[=new.
-</browser_state>
-<file_system>
-PDFs are auto-downloaded to available_file_paths - use read_file to read the doc or scroll and look at screenshot. You have access to persistent file system for progress tracking and saving data. Long tasks >10 steps: use todo.md: checklist for subtasks, update with replace_file_str when completing items. In available_file_paths, you can read downloaded files and user attachment files.
-</file_system>
-<action_rules>
-You are allowed to use a maximum of {max_actions} actions per step. Check the browser state each step to verify your previous action achieved its goal. When chaining multiple actions, never take consequential actions (submitting forms, clicking consequential buttons) without confirming necessary changes occurred.
-
-Default to element indices for browser interaction. If the target index is missing or a prior index-based action failed, use screenshot coordinates instead—DOM extraction doesn't capture everything. Coordinate interaction is useful when DOM extraction fails such as interacting with Canvas, scrolling on sidebars, etc.
-</action_rules>
-<output>You must call the AgentOutput tool with the following schema for the arguments:
-
-{{
-  "evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
-  "memory": "1-3 sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
-  "next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
-  "action": [
-    {{
-      "action_name": {{
-        "parameter1": "value1",
-        "parameter2": "value2"
-      }}
-    }}
-  ]
-}}
-
-Action list should NEVER be empty.
-</output>
--- a/browser_use/agent/system_prompt_flash_anthropic.md
+++ b/browser_use/agent/system_prompt_flash_anthropic.md
@@ -10,8 +10,6 @@ PDFs are auto-downloaded to available_file_paths - use read_file to read the doc
 </file_system>
 <action_rules>
 You are allowed to use a maximum of {max_actions} actions per step. Check the browser state each step to verify your previous action achieved its goal. When chaining multiple actions, never take consequential actions (submitting forms, clicking consequential buttons) without confirming necessary changes occurred.
-
-Default to element indices for browser interaction. If the target index is missing or a prior index-based action failed, use screenshot coordinates instead—DOM extraction doesn't capture everything. Coordinate interaction is useful when DOM extraction fails such as interacting with Canvas, scrolling on sidebars, etc.
 </action_rules>
 <output>You must call the AgentOutput tool with the following schema for the arguments:

--- a/browser_use/browser/demo_mode.py
+++ b/browser_use/browser/demo_mode.py
--- a/browser_use/browser/demo_panel_scripts.py
+++ b/browser_use/browser/demo_panel_scripts.py
--- a/browser_use/browser/events.py
+++ b/browser_use/browser/events.py
@@ -166,17 +166,6 @@ class ScrollEvent(ElementSelectedEvent[None]):
 	event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_ScrollEvent', 8.0))  # seconds


-class ScrollAtCoordinateEvent(BaseEvent[None]):
-	"""Scroll at specific coordinates using mouse wheel."""
-
-	coordinate_x: int
-	coordinate_y: int
-	scroll_x: int = 0  # deltaX (positive=right, negative=left)
-	scroll_y: int = 0  # deltaY (positive=down, negative=up)
-
-	event_timeout: float | None = _get_timeout('TIMEOUT_ScrollAtCoordinateEvent', 8.0)  # seconds
-
-
 class SwitchTabEvent(BaseEvent[TargetID]):
 	"""Switch to a different tab."""

--- a/browser_use/browser/profile.py
+++ b/browser_use/browser/profile.py
@@ -595,10 +595,6 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
 		default=False,
 		description='Enable demo mode side panel that streams agent logs directly inside the browser window (requires headless=False).',
 	)
-	demo_mode_display: Literal['full', 'last'] = Field(
-		default='last',
-		description="Display mode for demo panel: 'full' shows complete log panel, 'last' shows only latest action and memory in bottom-right corner",
-	)
 	cookie_whitelist_domains: list[str] = Field(
 		default_factory=lambda: ['nature.com', 'qatarairways.com'],
 		description='List of domains to whitelist in the "I still don\'t care about cookies" extension, preventing automatic cookie banner handling on these sites.',
--- a/browser_use/browser/session.py
+++ b/browser_use/browser/session.py
@@ -436,7 +436,6 @@ class BrowserSession(BaseModel):

 	_cloud_browser_client: CloudBrowserClient = PrivateAttr(default_factory=lambda: CloudBrowserClient())
 	_demo_mode: 'DemoMode | None' = PrivateAttr(default=None)
-	_demo_nav_handler_event_bus: EventBus | None = PrivateAttr(default=None)

 	_logger: Any = PrivateAttr(default=None)

@@ -518,13 +517,15 @@ class BrowserSession(BaseModel):
 			self._demo_mode.reset()
 			self._demo_mode = None

-		self.logger.debug('✅ Browser session reset complete')
+		self.logger.info('✅ Browser session reset complete')

 	def model_post_init(self, __context) -> None:
 		"""Register event handlers after model initialization."""
 		self._connection_lock = asyncio.Lock()

 		# Check if handlers are already registered to prevent duplicates
+		from browser_use.browser.watchdog_base import BaseWatchdog
+
 		start_handlers = self.event_bus.handlers.get('BrowserStartEvent', [])
 		start_handler_names = [getattr(h, '__name__', str(h)) for h in start_handlers]

@@ -535,16 +536,9 @@ class BrowserSession(BaseModel):
 				'This likely means BrowserSession was initialized multiple times with the same EventBus.'
 			)

-		self._register_essential_handlers()
-
-	def _register_essential_handlers(self) -> None:
-		"""Register all essential event handlers on the current event bus."""
-		from browser_use.browser.watchdog_base import BaseWatchdog
-
 		BaseWatchdog.attach_handler_to_session(self, BrowserStartEvent, self.on_BrowserStartEvent)
 		BaseWatchdog.attach_handler_to_session(self, BrowserStopEvent, self.on_BrowserStopEvent)
 		BaseWatchdog.attach_handler_to_session(self, NavigateToUrlEvent, self.on_NavigateToUrlEvent)
-		self._ensure_demo_mode_handlers()
 		BaseWatchdog.attach_handler_to_session(self, SwitchTabEvent, self.on_SwitchTabEvent)
 		BaseWatchdog.attach_handler_to_session(self, TabCreatedEvent, self.on_TabCreatedEvent)
 		BaseWatchdog.attach_handler_to_session(self, TabClosedEvent, self.on_TabClosedEvent)
@@ -552,14 +546,6 @@ class BrowserSession(BaseModel):
 		BaseWatchdog.attach_handler_to_session(self, FileDownloadedEvent, self.on_FileDownloadedEvent)
 		BaseWatchdog.attach_handler_to_session(self, CloseTabEvent, self.on_CloseTabEvent)

-	def _ensure_demo_mode_handlers(self) -> None:
-		"""Ensure demo mode handlers are attached to the active event bus."""
-		if self._demo_nav_handler_event_bus is self.event_bus:
-			return
-
-		self.event_bus.on(NavigationCompleteEvent, self._on_demo_mode_navigation_complete)
-		self._demo_nav_handler_event_bus = self.event_bus
-
 	@observe_debug(ignore_input=True, ignore_output=True, name='browser_session_start')
 	async def start(self) -> None:
 		"""Start the browser session."""
@@ -586,8 +572,6 @@ class BrowserSession(BaseModel):
 		await self.reset()
 		# Create fresh event bus
 		self.event_bus = EventBus()
-		# Re-register all essential handlers on the new event bus
-		self._register_essential_handlers()

 	async def stop(self) -> None:
 		"""Stop the browser session without killing the browser process.
@@ -612,8 +596,6 @@ class BrowserSession(BaseModel):
 		await self.reset()
 		# Create fresh event bus
 		self.event_bus = EventBus()
-		# Re-register all essential handlers on the new event bus
-		self._register_essential_handlers()

 	@observe_debug(ignore_input=True, ignore_output=True, name='browser_start_event_handler')
 	async def on_BrowserStartEvent(self, event: BrowserStartEvent) -> dict[str, str]:
@@ -904,20 +886,6 @@ class BrowserSession(BaseModel):
 		else:
 			self.logger.warning(f'⚠️ Page readiness timeout ({timeout}s, {duration_ms:.0f}ms) for {url}')

-	async def _on_demo_mode_navigation_complete(self, event: NavigationCompleteEvent) -> None:
-		"""Rehydrate the demo overlay and logs after navigation."""
-		if not self.browser_profile.demo_mode:
-			return
-
-		demo = self.demo_mode
-		if not demo:
-			return
-
-		try:
-			await demo.refresh_target(event.target_id)
-		except Exception as exc:
-			self.logger.debug(f'[DemoMode] Failed to refresh overlay for target {event.target_id[:8]}...: {exc}')
-
 	async def on_SwitchTabEvent(self, event: SwitchTabEvent) -> TargetID:
 		"""Handle tab switching - core browser functionality."""
 		if not self.agent_focus_target_id:
@@ -1075,7 +1043,7 @@ class BrowserSession(BaseModel):
 					self.logger.debug(f'Failed to cleanup cloud browser session: {e}')

 			# Clear CDP session cache before stopping
-			self.logger.debug(
+			self.logger.info(
 				f'📢 on_BrowserStopEvent - Calling reset() (force={event.force}, keep_alive={self.browser_profile.keep_alive})'
 			)
 			await self.reset()
@@ -2957,19 +2925,19 @@ class BrowserSession(BaseModel):
 		"""Clear geolocation override using CDP."""
 		await self.cdp_client.send.Emulation.clearGeolocationOverride()

-	async def _cdp_add_init_script(self, script: str, target_id: TargetID | None = None) -> str:
-		"""Add script to evaluate on new document for a specific target."""
+	async def _cdp_add_init_script(self, script: str) -> str:
+		"""Add script to evaluate on new document using CDP Page.addScriptToEvaluateOnNewDocument."""
 		assert self._cdp_client_root is not None
-		cdp_session = await self.get_or_create_cdp_session(target_id=target_id, focus=target_id is None)
+		cdp_session = await self.get_or_create_cdp_session()

 		result = await cdp_session.cdp_client.send.Page.addScriptToEvaluateOnNewDocument(
 			params={'source': script, 'runImmediately': True}, session_id=cdp_session.session_id
 		)
 		return result['identifier']

-	async def _cdp_remove_init_script(self, identifier: str, target_id: TargetID | None = None) -> None:
-		"""Remove script added with addScriptToEvaluateOnNewDocument for a target."""
-		cdp_session = await self.get_or_create_cdp_session(target_id=target_id, focus=target_id is None)
+	async def _cdp_remove_init_script(self, identifier: str) -> None:
+		"""Remove script added with addScriptToEvaluateOnNewDocument."""
+		cdp_session = await self.get_or_create_cdp_session(target_id=None)
 		await cdp_session.cdp_client.send.Page.removeScriptToEvaluateOnNewDocument(
 			params={'identifier': identifier}, session_id=cdp_session.session_id
 		)
--- a/browser_use/browser/session_manager.py
+++ b/browser_use/browser/session_manager.py
@@ -181,7 +181,7 @@ class SessionManager:
 			self._target_sessions.clear()
 			self._session_to_target.clear()

-		self.logger.debug('[SessionManager] Cleared all owned data (targets, sessions, mappings)')
+		self.logger.info('[SessionManager] Cleared all owned data (targets, sessions, mappings)')

 	async def is_target_valid(self, target_id: TargetID) -> bool:
 		"""Check if a target is still valid and has active sessions.
@@ -458,14 +458,6 @@ class SessionManager:
 			except Exception as e:
 				self.logger.warning(f'[SessionManager] Failed to resume execution: {e}')

-		if target_type in ('page', 'tab') and self.browser_session.browser_profile.demo_mode:
-			demo = self.browser_session.demo_mode
-			if demo:
-				try:
-					await demo.register_new_target(target_id)
-				except Exception as exc:
-					self.logger.debug(f'[SessionManager] Failed to register demo overlay for {target_id[:8]}...: {exc}')
-
 	async def _handle_target_info_changed(self, event: dict) -> None:
 		"""Handle Target.targetInfoChanged event.

@@ -478,30 +470,13 @@ class SessionManager:
 		if not target_id:
 			return

-		url_changed = False
-		target_type = None
-
 		async with self._lock:
 			# Update target if it exists (source of truth for url/title)
 			if target_id in self._targets:
 				target = self._targets[target_id]
-				target_type = target.target_type
-				previous_url = target.url
-				new_url = target_info.get('url', previous_url)

 				target.title = target_info.get('title', target.title)
-				target.url = new_url
-				url_changed = previous_url != new_url
-
-		if url_changed and target_type in ('page', 'tab') and self.browser_session.browser_profile.demo_mode:
-			demo = self.browser_session.demo_mode
-			if demo:
-				try:
-					await demo.refresh_target(target_id)
-				except Exception as exc:
-					self.logger.debug(
-						f'[SessionManager] Failed to refresh demo overlay after URL change for {target_id[:8]}...: {exc}'
-					)
+				target.url = target_info.get('url', target.url)

 	async def _handle_target_detached(self, event: DetachedFromTargetEvent) -> None:
 		"""Handle Target.detachedFromTarget event.
@@ -592,9 +567,6 @@ class SessionManager:

 				self.browser_session.event_bus.dispatch(TabClosedEvent(target_id=target_id))
 				self.logger.debug(f'[SessionManager] Dispatched TabClosedEvent for page target {target_id[:8]}...')
-				demo = self.browser_session.demo_mode
-				if demo:
-					demo.unregister_target(target_id)
 			elif target_type:
 				self.logger.debug(
 					f'[SessionManager] Target {target_id[:8]}... fully removed (type={target_type}) - not dispatching TabClosedEvent'
--- a/browser_use/browser/watchdogs/default_action_watchdog.py
+++ b/browser_use/browser/watchdogs/default_action_watchdog.py
@@ -13,7 +13,6 @@ from browser_use.browser.events import (
 	GoBackEvent,
 	GoForwardEvent,
 	RefreshEvent,
-	ScrollAtCoordinateEvent,
 	ScrollEvent,
 	ScrollToTextEvent,
 	SelectDropdownOptionEvent,
@@ -384,46 +383,6 @@ class DefaultActionWatchdog(BaseWatchdog):
 		except Exception as e:
 			raise

-	async def on_ScrollAtCoordinateEvent(self, event: ScrollAtCoordinateEvent) -> None:
-		"""Handle scroll at specific coordinates using CDP synthesizeScrollGesture."""
-		# Check if we have a current target for scrolling
-		if not self.browser_session.agent_focus_target_id:
-			error_msg = 'No active target for scrolling'
-			raise BrowserError(error_msg)
-
-		try:
-			# Get focused CDP session
-			cdp_session = await self.browser_session.get_or_create_cdp_session()
-			cdp_client = cdp_session.cdp_client
-			session_id = cdp_session.session_id
-
-			# Convert scroll deltas to gesture distances
-			# Note: synthesizeScrollGesture uses opposite directions:
-			# - positive yDistance = scroll UP (opposite of mouseWheel deltaY)
-			# - positive xDistance = scroll LEFT (opposite of mouseWheel deltaX)
-			# So we negate the values to maintain the same behavior as before
-			params: dict[str, float] = {
-				'x': float(event.coordinate_x),
-				'y': float(event.coordinate_y),
-			}
-			if event.scroll_x != 0:
-				params['xDistance'] = float(-event.scroll_x)
-			if event.scroll_y != 0:
-				params['yDistance'] = float(-event.scroll_y)
-
-			# Synthesize scroll gesture at the specified coordinates
-			await cdp_client.send.Input.synthesizeScrollGesture(
-				params=params,  # type: ignore[arg-type]
-				session_id=session_id,
-			)
-
-			self.logger.debug(
-				f'📄 Scrolled at ({event.coordinate_x}, {event.coordinate_y}) by deltaX={event.scroll_x}, deltaY={event.scroll_y}'
-			)
-			return None
-		except Exception as e:
-			raise
-
 	# ========== Implementation Methods ==========

 	async def _check_element_occlusion(self, backend_node_id: int, x: float, y: float, cdp_session) -> bool:
@@ -961,6 +920,8 @@ class DefaultActionWatchdog(BaseWatchdog):
 						},
 						session_id=cdp_session.session_id,
 					)
+				# Add 10ms delay between keystrokes
+				await asyncio.sleep(0.010)
 		except Exception as e:
 			raise Exception(f'Failed to type to page: {str(e)}')

@@ -2263,6 +2224,9 @@ class DefaultActionWatchdog(BaseWatchdog):
 							session_id=cdp_session.session_id,
 						)

+						# Small delay between characters (10ms)
+						await asyncio.sleep(0.010)
+
 			self.logger.info(f'⌨️ Sent keys: {event.keys}')

 			# Note: We don't clear cached state on Enter; multi_act will detect DOM changes
--- a/browser_use/browser/watchdogs/screenshot_watchdog.py
+++ b/browser_use/browser/watchdogs/screenshot_watchdog.py
@@ -50,7 +50,7 @@ class ScreenshotWatchdog(BaseWatchdog):
 					raise BrowserError('[ScreenshotWatchdog] No page targets available for screenshot')
 				target_id = page_targets[-1].target_id

-			cdp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
+			cdp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=True)

 			# Prepare screenshot parameters
 			params = CaptureScreenshotParameters(format='png', captureBeyondViewport=False)
--- a/browser_use/code_use/service.py
+++ b/browser_use/code_use/service.py
@@ -295,6 +295,7 @@ class CodeAgent:
 		# Main execution loop
 		for step in range(self.max_steps):
 			logger.info(f'\n\n\n\n\n\n\nStep {step + 1}/{self.max_steps}')
+			await self._demo_mode_log(f'Starting step {step + 1}/{self.max_steps}', 'info', {'step': step + 1})

 			# Start timing this step
 			self._step_start_time = datetime.datetime.now().timestamp()
--- a/browser_use/llm/vercel/chat.py
+++ b/browser_use/llm/vercel/chat.py
@@ -189,6 +189,8 @@ class ChatVercel(BaseChatModel):
 	        prompt-based JSON extraction. Auto-detects common reasoning models by default.
 	    timeout: Request timeout in seconds
 	    max_retries: Maximum number of retries for failed requests
+	    provider_options: Provider routing options for the gateway. Use this to control which
+	        providers are used and in what order. Example: {'gateway': {'order': ['vertex', 'anthropic']}}
 	"""

 	# Model configuration
@@ -218,6 +220,7 @@ class ChatVercel(BaseChatModel):
 	default_query: Mapping[str, object] | None = None
 	http_client: httpx.AsyncClient | None = None
 	_strict_response_validation: bool = False
+	provider_options: dict[str, Any] | None = None

 	# Static
 	@property
@@ -382,6 +385,8 @@ class ChatVercel(BaseChatModel):
 				model_params['max_tokens'] = self.max_tokens
 			if self.top_p is not None:
 				model_params['top_p'] = self.top_p
+			if self.provider_options:
+				model_params['extra_body'] = {'providerOptions': self.provider_options}

 			if output_format is None:
 				# Return string response
@@ -400,11 +405,12 @@ class ChatVercel(BaseChatModel):

 			else:
 				is_google_model = self.model.startswith('google/')
+				is_anthropic_model = self.model.startswith('anthropic/')
 				is_reasoning_model = self.reasoning_models and any(
 					str(pattern).lower() in str(self.model).lower() for pattern in self.reasoning_models
 				)

-				if is_google_model or is_reasoning_model:
+				if is_google_model or is_anthropic_model or is_reasoning_model:
 					modified_messages = [m.model_copy(deep=True) for m in messages]

 					schema = SchemaOptimizer.create_gemini_optimized_schema(output_format)
@@ -431,10 +437,14 @@ class ChatVercel(BaseChatModel):

 					vercel_messages = VercelMessageSerializer.serialize_messages(modified_messages)

+					request_params = model_params.copy()
+					if self.provider_options:
+						request_params['extra_body'] = {'providerOptions': self.provider_options}
+
 					response = await self.get_client().chat.completions.create(
 						model=self.model,
 						messages=vercel_messages,
-						**model_params,
+						**request_params,
 					)

 					content = response.choices[0].message.content if response.choices else None
@@ -479,6 +489,10 @@ class ChatVercel(BaseChatModel):
 						'schema': schema,
 					}

+					request_params = model_params.copy()
+					if self.provider_options:
+						request_params['extra_body'] = {'providerOptions': self.provider_options}
+
 					response = await self.get_client().chat.completions.create(
 						model=self.model,
 						messages=vercel_messages,
@@ -486,7 +500,7 @@ class ChatVercel(BaseChatModel):
 							json_schema=response_format_schema,
 							type='json_schema',
 						),
-						**model_params,
+						**request_params,
 					)

 					content = response.choices[0].message.content if response.choices else None
--- a/browser_use/tools/service.py
+++ b/browser_use/tools/service.py
@@ -19,7 +19,6 @@ from browser_use.browser.events import (
 	GetDropdownOptionsEvent,
 	GoBackEvent,
 	NavigateToUrlEvent,
-	ScrollAtCoordinateEvent,
 	ScrollEvent,
 	ScrollToTextEvent,
 	SendKeysEvent,
@@ -45,7 +44,6 @@ from browser_use.tools.views import (
 	NavigateAction,
 	NoParamsAction,
 	ScrollAction,
-	ScrollAtCoordinateAction,
 	SearchAction,
 	SelectDropdownOptionAction,
 	SendKeysAction,
@@ -62,7 +60,6 @@ logger = logging.getLogger(__name__)
 ClickElementEvent.model_rebuild()
 TypeTextEvent.model_rebuild()
 ScrollEvent.model_rebuild()
-ScrollAtCoordinateEvent.model_rebuild()
 UploadFileEvent.model_rebuild()

 Context = TypeVar('Context')
@@ -254,25 +251,6 @@ class Tools(Generic[Context]):
 				return actual_x, actual_y
 			return llm_x, llm_y

-		def _convert_llm_scroll_deltas_to_viewport(
-			llm_scroll_x: int, llm_scroll_y: int, browser_session: BrowserSession
-		) -> tuple[int, int]:
-			"""Convert scroll deltas from LLM screenshot size to original viewport size."""
-			if browser_session.llm_screenshot_size and browser_session._original_viewport_size:
-				original_width, original_height = browser_session._original_viewport_size
-				llm_width, llm_height = browser_session.llm_screenshot_size
-
-				# Scale scroll deltas using the same ratio as coordinates
-				actual_scroll_x = int((llm_scroll_x / llm_width) * original_width)
-				actual_scroll_y = int((llm_scroll_y / llm_height) * original_height)
-
-				logger.info(
-					f'🔄 Scaling scroll deltas: LLM ({llm_scroll_x}, {llm_scroll_y}) @ {llm_width}x{llm_height} '
-					f'→ Viewport ({actual_scroll_x}, {actual_scroll_y}) @ {original_width}x{original_height}'
-				)
-				return actual_scroll_x, actual_scroll_y
-			return llm_scroll_x, llm_scroll_y
-
 		# Element Interaction Actions
 		async def _click_by_coordinate(params: ClickElementAction, browser_session: BrowserSession) -> ActionResult:
 			# Ensure coordinates are provided (type safety)
@@ -374,7 +352,7 @@ class Tools(Generic[Context]):
 				return ActionResult(error=error_msg)

 		@self.registry.action(
-			'Click element by index or coordinates',
+			'Click element by index or coordinates. Prefer index over coordinates when possible. Either provide coordinates or index.',
 			param_model=ClickElementAction,
 		)
 		async def click(params: ClickElementAction, browser_session: BrowserSession):
@@ -390,7 +368,7 @@ class Tools(Generic[Context]):
 				return await _click_by_coordinate(params, browser_session)

 		@self.registry.action(
-			'Input text into element with index.',
+			'Input text into element with index. Only works with index, NEVER use coordinates for inputting text.',
 			param_model=InputTextAction,
 		)
 		async def input(
@@ -810,14 +788,25 @@ You will be given a query and the markdown of a webpage that has been filtered t
 				raise RuntimeError(str(e))

 		@self.registry.action(
-			"""Scroll by pages where one page = viewport height. Set down=True to scroll down, down=False to scroll up. Defaults to scrolling down one page.""",
+			"""Scroll by pages. REQUIRED: down=True/False (True=scroll down, False=scroll up, default=True). Optional: pages=0.5-10.0 (default 1.0). Use index for scroll containers (dropdowns/custom UI). High pages (10) reaches bottom. Multi-page scrolls sequentially. Viewport-based height, fallback 1000px/page.""",
 			param_model=ScrollAction,
 		)
 		async def scroll(params: ScrollAction, browser_session: BrowserSession):
 			try:
-				direction = 'down' if params.down else 'up'
+				# Look up the node from the selector map if index is provided
+				# Special case: index 0 means scroll the whole page (root/body element)
+				node = None
+				if params.index is not None and params.index != 0:
+					node = await browser_session.get_element_by_index(params.index)
+					if node is None:
+						# Element does not exist
+						msg = f'Element index {params.index} not found in browser state'
+						return ActionResult(error=msg)

-				# Get actual viewport height for scrolling
+				direction = 'down' if params.down else 'up'
+				target = f'element {params.index}' if params.index is not None and params.index != 0 else ''
+
+				# Get actual viewport height for more accurate scrolling
 				try:
 					cdp_session = await browser_session.get_or_create_cdp_session()
 					metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)
@@ -834,12 +823,6 @@ You will be given a query and the markdown of a webpage that has been filtered t
 					viewport_height = 1000  # Fallback to 1000px
 					logger.debug(f'Failed to get viewport height, using fallback 1000px: {e}')

-				# For reporting to LLM, use LLM screenshot height if available (so LLM's mental model matches)
-				if browser_session.llm_screenshot_size:
-					_, llm_viewport_height = browser_session.llm_screenshot_size
-				else:
-					llm_viewport_height = viewport_height
-
 				# For multiple pages (>=1.0), scroll one page at a time to ensure each scroll completes
 				if params.pages >= 1.0:
 					import asyncio
@@ -857,7 +840,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
 								pixels = -pixels

 							event = browser_session.event_bus.dispatch(
-								ScrollEvent(direction=direction, amount=abs(pixels), node=None)
+								ScrollEvent(direction=direction, amount=abs(pixels), node=node)
 							)
 							await event
 							await event.event_result(raise_if_any=True, raise_if_none=False)
@@ -878,7 +861,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
 								pixels = -pixels

 							event = browser_session.event_bus.dispatch(
-								ScrollEvent(direction=direction, amount=abs(pixels), node=None)
+								ScrollEvent(direction=direction, amount=abs(pixels), node=node)
 							)
 							await event
 							await event.event_result(raise_if_any=True, raise_if_none=False)
@@ -888,19 +871,18 @@ You will be given a query and the markdown of a webpage that has been filtered t
 							logger.warning(f'Fractional scroll failed: {e}')

 					if params.pages == 1.0:
-						long_term_memory = f'Scrolled {direction} {llm_viewport_height}px'
+						long_term_memory = f'Scrolled {direction} {target} {viewport_height}px'.replace('  ', ' ')
 					else:
-						long_term_memory = f'Scrolled {direction} {completed_scrolls:.1f} pages'
+						long_term_memory = f'Scrolled {direction} {target} {completed_scrolls:.1f} pages'.replace('  ', ' ')
 				else:
 					# For fractional pages <1.0, do single scroll
 					pixels = int(params.pages * viewport_height)
 					event = browser_session.event_bus.dispatch(
-						ScrollEvent(direction='down' if params.down else 'up', amount=pixels, node=None)
+						ScrollEvent(direction='down' if params.down else 'up', amount=pixels, node=node)
 					)
 					await event
 					await event.event_result(raise_if_any=True, raise_if_none=False)
-					llm_pixels = int(params.pages * llm_viewport_height)
-					long_term_memory = f'Scrolled {direction} {llm_pixels}px'
+					long_term_memory = f'Scrolled {direction} {target} {params.pages} pages'.replace('  ', ' ')

 				msg = f'🔍 {long_term_memory}'
 				logger.info(msg)
@@ -910,54 +892,6 @@ You will be given a query and the markdown of a webpage that has been filtered t
 				error_msg = 'Failed to execute scroll action.'
 				return ActionResult(error=error_msg)

-		@self.registry.action(
-			'Scroll at specific coordinates. Use when you need to scroll within a specific area (e.g., scrollable containers, maps, modals) not reachable via element index.',
-			param_model=ScrollAtCoordinateAction,
-		)
-		async def scroll_at_coordinates(params: ScrollAtCoordinateAction, browser_session: BrowserSession):
-			try:
-				# Convert coordinates from LLM screenshot size to viewport size
-				actual_x, actual_y = _convert_llm_coordinates_to_viewport(
-					params.coordinate_x, params.coordinate_y, browser_session
-				)
-
-				# Convert scroll deltas from LLM screenshot size to viewport size
-				actual_scroll_x, actual_scroll_y = _convert_llm_scroll_deltas_to_viewport(
-					params.scroll_x, params.scroll_y, browser_session
-				)
-
-				# Dispatch scroll at coordinate event
-				event = browser_session.event_bus.dispatch(
-					ScrollAtCoordinateEvent(
-						coordinate_x=actual_x,
-						coordinate_y=actual_y,
-						scroll_x=actual_scroll_x,
-						scroll_y=actual_scroll_y,
-					)
-				)
-				await event
-				await event.event_result(raise_if_any=True, raise_if_none=False)
-
-				# Build memory with scroll amounts
-				direction_parts = []
-				if params.scroll_y > 0:
-					direction_parts.append(f'down {params.scroll_y}px')
-				elif params.scroll_y < 0:
-					direction_parts.append(f'up {abs(params.scroll_y)}px')
-				if params.scroll_x > 0:
-					direction_parts.append(f'right {params.scroll_x}px')
-				elif params.scroll_x < 0:
-					direction_parts.append(f'left {abs(params.scroll_x)}px')
-				scroll_desc = ' and '.join(direction_parts) if direction_parts else 'zero'
-
-				memory = f'Scrolled {scroll_desc} at ({params.coordinate_x}, {params.coordinate_y})'
-				msg = f'🔍 {memory}'
-				logger.info(msg)
-				return ActionResult(extracted_content=msg, long_term_memory=memory)
-			except Exception as e:
-				logger.error(f'Failed to scroll at coordinates: {type(e).__name__}: {e}')
-				return ActionResult(error=f'Failed to scroll at ({params.coordinate_x}, {params.coordinate_y})')
-
 		@self.registry.action(
 			'',
 			param_model=SendKeysAction,
--- a/browser_use/tools/views.py
+++ b/browser_use/tools/views.py
@@ -73,7 +73,8 @@ class CloseTabAction(BaseModel):

 class ScrollAction(BaseModel):
 	down: bool = Field(default=True, description='down=True=scroll down, down=False scroll up')
-	pages: float = Field(default=1.0, description='0.5=half page, 1=full page, 2=two pages, etc.')
+	pages: float = Field(default=1.0, description='0.5=half page, 1=full page, 10=to bottom/top')
+	index: int | None = Field(default=None, description='Optional element index to scroll within specific container')


 class SendKeysAction(BaseModel):
@@ -96,10 +97,3 @@ class GetDropdownOptionsAction(BaseModel):
 class SelectDropdownOptionAction(BaseModel):
 	index: int
 	text: str = Field(description='exact text/value')
-
-
-class ScrollAtCoordinateAction(BaseModel):
-	coordinate_x: int = Field(description='Horizontal coordinate relative to viewport left edge')
-	coordinate_y: int = Field(description='Vertical coordinate relative to viewport top edge')
-	scroll_x: int = Field(default=0, description='Horizontal scroll delta (positive=right, negative=left)')
-	scroll_y: int = Field(default=0, description='Vertical scroll delta (positive=down, negative=up)')
--- a/docs/quickstart.mdx
+++ b/docs/quickstart.mdx
@@ -143,15 +143,21 @@ Sandboxes are the **easiest way to run Browser-Use in production**. We handle ag
 To run in production with authentication, just add `@sandbox` to your function:

 ```python
+import asyncio
 from browser_use import Browser, sandbox, ChatBrowserUse
 from browser_use.agent.service import Agent

@sandbox(cloud_profile_id='your-profile-id')
 async def production_task(browser: Browser):
-    agent = Agent(task="Your authenticated task", browser=browser, llm=ChatBrowserUse())
+    agent = Agent(
+        task="Your authenticated task",
+        browser=browser,
+        llm=ChatBrowserUse(),
+    )
    await agent.run()

-await production_task()
+if __name__ == "__main__":
+    asyncio.run(production_task())
 ```

 See [Going to Production](/production) for how to sync your cookies to the cloud.
--- a/docs/supported-models.mdx
+++ b/docs/supported-models.mdx
@@ -358,12 +358,24 @@ api_key = os.getenv('VERCEL_API_KEY')
 if not api_key:
    raise ValueError('VERCEL_API_KEY is not set')

-# Use Vercel AI Gateway
+# Basic usage
 llm = ChatVercel(
    model='openai/gpt-4o',
    api_key=api_key,
 )

+# With provider options - control which providers are used and in what order
+# This will try Vertex AI first, then fall back to Anthropic if Vertex fails
+llm_with_provider_options = ChatVercel(
+    model='anthropic/claude-sonnet-4',
+    api_key=api_key,
+    provider_options={
+        'gateway': {
+            'order': ['vertex', 'anthropic']  # Try Vertex AI first, then Anthropic
+        }
+    },
+)
+
 agent = Agent(
    task="Your task here",
    llm=llm
--- a/examples/models/vercel_ai_gateway.py
+++ b/examples/models/vercel_ai_gateway.py
@@ -24,19 +24,38 @@ api_key = os.getenv('VERCEL_API_KEY')
 if not api_key:
 	raise ValueError('VERCEL_API_KEY is not set')

+# Basic usage
 llm = ChatVercel(
 	model='openai/gpt-4o',
 	api_key=api_key,
 )

+# Example with provider options - control which providers are used and in what order
+# This will try Vertex AI first, then fall back to Anthropic if Vertex fails
+llm_with_provider_options = ChatVercel(
+	model='anthropic/claude-sonnet-4',
+	api_key=api_key,
+	provider_options={
+		'gateway': {
+			'order': ['vertex', 'anthropic']  # Try Vertex AI first, then Anthropic
+		}
+	},
+)
+
 agent = Agent(
 	task='Go to example.com and summarize the main content',
 	llm=llm,
 )

+agent_with_provider_options = Agent(
+	task='Go to example.com and summarize the main content',
+	llm=llm_with_provider_options,
+)
+

 async def main():
 	await agent.run(max_steps=10)
+	await agent_with_provider_options.run(max_steps=10)


 if __name__ == '__main__':
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "browser-use"
 description = "Make websites accessible for AI agents"
 authors = [{ name = "Gregor Zunic" }]
-version = "0.9.7"
+version = "0.10.1"
 readme = "README.md"
 requires-python = ">=3.11,<4.0"
 classifiers = [
@@ -100,7 +100,7 @@ build-backend = "hatchling.build"


 [tool.codespell]
-ignore-words-list = "bu,wit,dont,cant,wont,re-use,re-used,re-using,re-usable,thats,doesnt,doubleclick,finaly,finalY,initialY"
+ignore-words-list = "bu,wit,dont,cant,wont,re-use,re-used,re-using,re-usable,thats,doesnt,doubleclick,finaly,finalY"
 skip = "*.json"

 [tool.ruff]
@@ -166,9 +166,6 @@ include = [
    "browser_use/agent/system_prompt_no_thinking.md",
    "browser_use/agent/system_prompt_flash.md",
    "browser_use/agent/system_prompt_flash_anthropic.md",
-    "browser-use/agent/system_prompt_anthropic_no_thinking.md",
-    "browser_use/agent/system_prompt_flash_anthropic.md",
-    "browser-use/agent/system_prompt_anthropic.md",
    "browser_use/code_use/system_prompt.md",
    "browser_use/cli_templates/*.py",
    "browser_use/py.typed",