more playwright tearout

This commit is contained in:
Nick Sweeting
2025-08-06 04:06:34 -07:00
parent 13ce936128
commit 936f7d7d35
8 changed files with 723 additions and 756 deletions

View File

@@ -1251,7 +1251,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Start browser session and attach watchdogs
assert self.browser_session is not None, 'Browser session must be initialized before starting'
self.logger.debug('🌐 Starting browser session...')
await self.browser_session.start()
from browser_use.browser.events import BrowserStartEvent
event = self.browser_session.event_bus.dispatch(BrowserStartEvent())
await event
self.logger.debug('🔧 Browser session started with watchdogs attached')

View File

@@ -33,16 +33,9 @@ class DefaultActionWatchdog(BaseWatchdog):
"""Handle click request with CDP."""
page = await self.browser_session.get_current_page()
try:
# Get the DOM element by index or use provided element_node
if event.element_node is not None:
element_node = event.element_node
# For element_node clicks, we need to get its index for logging
index_for_logging = getattr(element_node, 'highlight_index', 'N/A')
else:
element_node = await self.browser_session.get_dom_element_by_index(event.index)
if element_node is None:
raise Exception(f'Element index {event.index} does not exist - retry or use alternative actions')
index_for_logging = event.index
# Use the provided node
element_node = event.node
index_for_logging = element_node.element_index or 'unknown'
# Track initial number of tabs to detect new tab opening
initial_target_ids = await self.browser_session.target_ids
@@ -94,7 +87,7 @@ class DefaultActionWatchdog(BaseWatchdog):
BrowserErrorEvent(
error_type='ClickFailed',
message=str(e),
details={'index': index_for_logging if 'index_for_logging' in locals() else event.index},
details={'index': index_for_logging if 'index_for_logging' in locals() else 'unknown'},
)
)
@@ -102,23 +95,22 @@ class DefaultActionWatchdog(BaseWatchdog):
"""Handle text input request with CDP."""
page = await self.browser_session.get_current_page()
try:
# Get the DOM element by index
element_node = await self.browser_session.get_dom_element_by_index(event.index)
if element_node is None:
raise Exception(f'Element index {event.index} does not exist - retry or use alternative actions')
# Use the provided node
element_node = event.node
index_for_logging = element_node.element_index or 'unknown'
# Perform the actual text input
await self._input_text_element_node_impl(element_node, event.text, event.clear_existing)
# Log success
logger.info(f'⌨️ Typed "{event.text}" into element with index {event.index}')
logger.info(f'⌨️ Typed "{event.text}" into element with index {index_for_logging}')
logger.debug(f'Element xpath: {element_node.xpath}')
except Exception as e:
self.event_bus.dispatch(
BrowserErrorEvent(
error_type='InputTextFailed',
message=str(e),
details={'index': event.index, 'text': event.text},
details={'index': element_node.element_index or 'unknown', 'text': event.text},
)
)
@@ -140,16 +132,15 @@ class DefaultActionWatchdog(BaseWatchdog):
# Positive pixels = scroll down, negative = scroll up
pixels = event.amount if event.direction == 'down' else -event.amount
# Element-specific scrolling if index is provided
if event.element_index is not None:
element_node = await self.browser_session.get_dom_element_by_index(event.element_index)
if element_node is None:
raise Exception(f'Element index {event.element_index} does not exist')
# Element-specific scrolling if node is provided
if event.node is not None:
element_node = event.node
index_for_logging = element_node.element_index or 'unknown'
# Try to scroll the element's container
success = await self._scroll_element_container(element_node, pixels)
if success:
logger.info(f'📜 Scrolled element {event.element_index} container {event.direction} by {event.amount} pixels')
logger.info(f'📜 Scrolled element {index_for_logging} container {event.direction} by {event.amount} pixels')
return
# Perform page-level scroll
@@ -726,14 +717,13 @@ class DefaultActionWatchdog(BaseWatchdog):
async def on_UploadFileEvent(self, event: UploadFileEvent) -> None:
"""Handle file upload request with CDP."""
try:
# Get the DOM element by index
element_node = await self.browser_session.get_dom_element_by_index(event.element_index)
if element_node is None:
raise Exception(f'Element index {event.element_index} does not exist')
# Use the provided node
element_node = event.node
index_for_logging = element_node.element_index or 'unknown'
# Check if it's a file input
if not self.browser_session.is_file_input(element_node):
raise Exception(f'Element {event.element_index} is not a file input')
raise Exception(f'Element {index_for_logging} is not a file input')
# Get CDP client and session
cdp_client = await self.browser_session.get_cdp_client()
@@ -749,13 +739,13 @@ class DefaultActionWatchdog(BaseWatchdog):
session_id=session_id,
)
logger.info(f'📎 Uploaded file {event.file_path} to element {event.element_index}')
logger.info(f'📎 Uploaded file {event.file_path} to element {index_for_logging}')
except Exception as e:
self.event_bus.dispatch(
BrowserErrorEvent(
error_type='UploadFileFailed',
message=str(e),
details={'element_index': event.element_index, 'file_path': event.file_path},
details={'element_index': element_node.element_index or 'unknown', 'file_path': event.file_path},
)
)

View File

@@ -1,9 +1,12 @@
"""Event definitions for browser communication."""
from typing import Any, Literal
from typing import TYPE_CHECKING, Any, Literal
from bubus import BaseEvent
from pydantic import Field, model_validator
from pydantic import Field
if TYPE_CHECKING:
from browser_use.dom.views import EnhancedDOMTreeNode
# ============================================================================
# Agent/Controller -> BrowserSession Events (High-level browser actions)
@@ -20,29 +23,19 @@ class NavigateToUrlEvent(BaseEvent):
class ClickElementEvent(BaseEvent):
"""Click an element by index or element_node."""
"""Click an element."""
index: int | None = None
element_node: Any | None = None # DOMElementNode, but avoid circular import
node: 'EnhancedDOMTreeNode'
button: Literal['left', 'right', 'middle'] = 'left'
click_count: int = 1
expect_download: bool = False
new_tab: bool = False
@model_validator(mode='after')
def validate_index_or_element_node(self):
"""Validate that either index or element_node is provided."""
if self.index is None and self.element_node is None:
raise ValueError("Either 'index' or 'element_node' must be provided")
if self.index is not None and self.element_node is not None:
raise ValueError("Only one of 'index' or 'element_node' should be provided")
return self
class TypeTextEvent(BaseEvent):
"""Type text into an element."""
index: int
node: 'EnhancedDOMTreeNode'
text: str
clear_existing: bool = True
@@ -52,7 +45,7 @@ class ScrollEvent(BaseEvent):
direction: Literal['up', 'down', 'left', 'right']
amount: int # pixels
element_index: int | None = None # None means scroll page
node: 'EnhancedDOMTreeNode | None' = None # None means scroll page
class SwitchTabEvent(BaseEvent):
@@ -82,13 +75,13 @@ class BrowserStateRequestEvent(BaseEvent):
cache_clickable_elements_hashes: bool = True
class WaitForConditionEvent(BaseEvent):
"""Wait for a condition."""
# class WaitForConditionEvent(BaseEvent):
# """Wait for a condition."""
condition: Literal['navigation', 'selector', 'timeout', 'load_state']
timeout: float = 30000
selector: str | None = None
state: Literal['attached', 'detached', 'visible', 'hidden'] | None = None
# condition: Literal['navigation', 'selector', 'timeout', 'load_state']
# timeout: float = 30000
# selector: str | None = None
# state: Literal['attached', 'detached', 'visible', 'hidden'] | None = None
class GoBackEvent(BaseEvent):
@@ -125,7 +118,7 @@ class SendKeysEvent(BaseEvent):
class UploadFileEvent(BaseEvent):
"""Upload a file to an element."""
element_index: int
node: 'EnhancedDOMTreeNode'
file_path: str
@@ -285,17 +278,6 @@ class BrowserErrorEvent(BaseEvent):
details: dict[str, Any] = Field(default_factory=dict)
# ============================================================================
# Response Events (for request-response pattern)
# ============================================================================
class BrowserStateChangedEvent(BaseEvent):
"""Response to BrowserStateRequestEvent."""
state: Any # BrowserStateSummary object
# ============================================================================
# Storage State Events
# ============================================================================

View File

@@ -2,7 +2,7 @@
import asyncio
import logging
from typing import Any, Self
from typing import TYPE_CHECKING, Any, Self
from bubus import EventBus
from bubus.helpers import retry
@@ -31,6 +31,10 @@ from browser_use.utils import (
time_execution_async,
)
if TYPE_CHECKING:
from cdp_use import CDPClient
from browser_use.dom.views import EnhancedDOMTreeNode
_GLOB_WARNING_SHOWN = False # used inside _is_url_allowed to avoid spamming the logs with the same warning multiple times
MAX_SCREENSHOT_HEIGHT = 2000
@@ -75,15 +79,14 @@ class BrowserSession(BaseModel):
# Connection info (for backwards compatibility)
cdp_url: str | None = None
is_local: bool = Field(default=True)
# Mutable state
current_target_id: str | None = None
"""Current active target ID for the main page"""
# Event bus
event_bus: EventBus = Field(default_factory=EventBus)
# Browser state
_playwright: PlaywrightOrPatchright | None = PrivateAttr(default=None)
_browser: Browser | None = PrivateAttr(default=None)
_browser_context: BrowserContext | None = PrivateAttr(default=None)
# PDF handling
_auto_download_pdfs: bool = PrivateAttr(default=True)
@@ -101,6 +104,13 @@ class BrowserSession(BaseModel):
# Cached browser state for synchronous access
_cached_browser_state_summary: Any = PrivateAttr(default=None)
_cached_selector_map: dict[int, 'EnhancedDOMTreeNode'] = PrivateAttr(default_factory=dict)
"""Cached mapping of element indices to DOM nodes"""
# CDP client
_cdp_client: 'CDPClient | None' = PrivateAttr(default=None)
"""Cached CDP client instance"""
_logger: Any = PrivateAttr(default=None)
@property
@@ -114,6 +124,17 @@ class BrowserSession(BaseModel):
self._logger = logging.getLogger(f'browser_use.{self}')
return self._logger
@property
def cdp_client(self) -> 'CDPClient | None':
    """Return the cached CDP client without creating or starting one.

    This accessor only reads ``self._cdp_client``; it does no connection work
    itself. Per the original note, the client is created and started in
    ``setup_browser_via_cdp_url()``.

    Returns:
        The cached CDP client instance, or None if it has not been created yet.
        Callers must handle the None case before calling methods on the result.
    """
    return self._cdp_client
def __repr__(self) -> str:
    """Return a verbose debug representation of this session.

    The short label after the session id suffix is, in priority order: the
    port taken from ``cdp_url``, the browser pid, or the literal
    ``'playwright'`` when neither is known.

    Returns:
        A string containing the last 4 characters of ``self.id``, the
        port/pid label, the last 2 digits of ``id(self)``, the CDP URL and
        the browser profile.
    """
    # str(None) is the truthy string 'None', so only stringify a real pid;
    # otherwise the 'playwright' fallback below can never be reached.
    pid_label = str(self.browser_pid) if self.browser_pid is not None else None
    port_number_or_pid = (self.cdp_url or pid_label or 'playwright').rsplit(':', 1)[-1].split('/', 1)[0]
    return f'BrowserSession🆂 {self.id[-4:]}:{port_number_or_pid} #{str(id(self))[-2:]} (cdp_url={self.cdp_url}, profile={self.browser_profile})'
@@ -121,7 +142,7 @@ class BrowserSession(BaseModel):
def __str__(self) -> str:
# Note: _original_browser_session tracking moved to Agent class
port_number_or_pid = (
(self.cdp_url or self.wss_url or str(self.browser_pid) or 'playwright').rsplit(':', 1)[-1].split('/', 1)[0]
(self.cdp_url or str(self.browser_pid) or 'playwright').rsplit(':', 1)[-1].split('/', 1)[0]
)
return f'BrowserSession🆂 {self.id[-4:]}:{port_number_or_pid} #{str(id(self))[-2:]}' # ' 🅟 {str(id(self.current_target_id))[-2:]}'
@@ -150,38 +171,8 @@ class BrowserSession(BaseModel):
assert self.cdp_url and '://' in self.cdp_url
# Connect via CDP
self._playwright = await async_playwright().start()
# Get connection kwargs and exclude accept_downloads when using CDP download behavior
connect_kwargs = self.browser_profile.kwargs_for_connect().model_dump(exclude={'accept_downloads'})
self._browser = await self._playwright.chromium.connect_over_cdp(
self.cdp_url,
**connect_kwargs,
)
# Enable downloads via CDP Browser.setDownloadBehavior
if self.browser_profile.downloads_path:
try:
cdp_session = await self._browser.new_browser_cdp_session()
await cdp_session.send(
'Browser.setDownloadBehavior',
{'behavior': 'allow', 'downloadPath': str(self.browser_profile.downloads_path)},
)
logger.debug(
f'[Session] Enabled downloads via Browser.setDownloadBehavior to: {self.browser_profile.downloads_path}'
)
except Exception as e:
logger.error(f'[Session] Failed to set browser download behavior via CDP: {e}')
# Get or create browser context
if self._browser.contexts:
self._browser_context = self._browser.contexts[0]
else:
self._browser_context = await self._browser.new_context(
**self.browser_profile.kwargs_for_new_context().model_dump(mode='json', exclude_unset=True)
)
# Setup browser via CDP without Playwright
await self.setup_browser_via_cdp_url()
# Notify that browser is connected
self.event_bus.dispatch(BrowserConnectedEvent(cdp_url=self.cdp_url))
@@ -200,21 +191,14 @@ class BrowserSession(BaseModel):
"""Handle browser stop request."""
try:
# TODO: close all pages here or tell the browser to close gracefully? is there any point?
# we might need to give the browser time to save trace files, recordings, etc. during shutdown
# Check if we should keep the browser alive
if self.browser_profile.keep_alive and not event.force:
self.event_bus.dispatch(BrowserStoppedEvent(reason='Kept alive due to keep_alive=True'))
return
# Close context if we created it
if self._browser_context:
await self._browser_context.close()
self._browser_context = None
# Clean up playwright
if self._playwright:
await self._playwright.stop()
self._playwright = None
# Reset state
self._browser = None
self._browser_context = None
@@ -291,11 +275,7 @@ class BrowserSession(BaseModel):
# Manually copy over the excluded fields that are needed for browser connection
# These fields are excluded in the model config but need to be shared
copy._playwright = self._playwright
copy._browser = self._browser
copy._browser_context = self._browser_context
copy.current_target_id = self.current_target_id
copy.browser_pid = self.browser_pid
return copy
@@ -323,7 +303,7 @@ class BrowserSession(BaseModel):
ws_url = version_info.json()['webSocketDebuggerUrl']
# Create and store the CDP client for direct CDP communication
if not hasattr(self, '_cdp_client'):
if self._cdp_client is None:
self._cdp_client = CDPClient(ws_url)
await self._cdp_client.start()
@@ -400,7 +380,8 @@ class BrowserSession(BaseModel):
})();
}
"""
await self.browser_context.add_init_script(init_script)
# TODO: convert this to pure cdp-use and/or move it to the dom_watchdog.py
# await self.browser_context.add_init_script(init_script)
@property
async def target_ids(self) -> list[str]:
@@ -431,8 +412,6 @@ class BrowserSession(BaseModel):
# Get all page targets using CDP
pages = await self._cdp_get_all_pages()
cdp_client = await self.get_cdp_client()
for i, page_target in enumerate(pages):
target_id = page_target['targetId']
url = page_target['url']
@@ -449,17 +428,17 @@ class BrowserSession(BaseModel):
# Normal pages - try to get title with CDP for reliability
try:
# Attach to target and get session ID
session = await cdp_client.send('Target.attachToTarget', {'targetId': target_id, 'flatten': True})
session = await self.cdp_client.send('Target.attachToTarget', {'targetId': target_id, 'flatten': True})
session_id = session['sessionId']
# Use CDP to evaluate document.title
title_result = await asyncio.wait_for(
cdp_client.send('Runtime.evaluate', {'expression': 'document.title'}, session_id=session_id), timeout=2.0
self.cdp_client.send('Runtime.evaluate', {'expression': 'document.title'}, session_id=session_id), timeout=2.0
)
title = title_result.get('result', {}).get('value', '')
# Detach from target
await cdp_client.send('Target.detachFromTarget', {'sessionId': session_id})
await self.cdp_client.send('Target.detachFromTarget', {'sessionId': session_id})
# Special handling for PDF pages
if (not title or title == '') and (url.endswith('.pdf') or 'pdf' in url):
@@ -754,31 +733,6 @@ class BrowserSession(BaseModel):
# ========== CDP Helper Methods ==========
async def get_cdp_client(self) -> Any:
"""Get the CDP client, creating it if necessary."""
if not hasattr(self, '_cdp_client') or self._cdp_client is None:
if not self.cdp_url:
raise ValueError('CDP URL is not set')
# Import cdp-use client
import httpx
from cdp_use import CDPClient
# Convert HTTP URL to WebSocket URL if needed
ws_url = self.cdp_url
if not ws_url.startswith('ws'):
# If it's an HTTP URL, fetch the WebSocket URL from /json/version endpoint
url = ws_url.rstrip('/')
if not url.endswith('/json/version'):
url = url + '/json/version'
async with httpx.AsyncClient() as client:
version_info = await client.get(url)
ws_url = version_info.json()['webSocketDebuggerUrl']
self._cdp_client = CDPClient(ws_url)
await self._cdp_client.start()
return self._cdp_client
async def get_current_page_cdp_session_id(self) -> str | None:
"""Get the CDP session ID for the current page."""

File diff suppressed because it is too large Load Diff

View File

@@ -187,6 +187,7 @@ class DOMTreeSerializer:
if is_interactive_assign:
node.interactive_index = self._interactive_counter
node.original_node.element_index = self._interactive_counter
self._selector_map[self._interactive_counter] = node.original_node
self._interactive_counter += 1

View File

@@ -46,8 +46,8 @@ class DomService:
logger: logging.Logger
def __init__(self, browser: 'BrowserSession', page: 'Page', logger: logging.Logger | None = None):
self.browser = browser
def __init__(self, browser_session: 'BrowserSession', page: 'Page', logger: logging.Logger | None = None):
self.browser_session = browser_session
self.page = page
self.cdp_client: CDPClient | None = None
@@ -57,16 +57,16 @@ class DomService:
self.session_id_domains_enabled_cache: dict[str, bool] = {}
async def _get_cdp_client(self) -> CDPClient:
if not self.browser.cdp_url:
if not self.browser_session.cdp_url:
raise ValueError('CDP URL is not set')
# TODO: move CDP client creation into BrowserSession so one client is created and shared instead of being rebuilt here
# If the cdp_url is already a websocket URL, use it as-is.
if self.browser.cdp_url.startswith('ws'):
ws_url = self.browser.cdp_url
if self.browser_session.cdp_url.startswith('ws'):
ws_url = self.browser_session.cdp_url
else:
# Otherwise, treat it as the DevTools HTTP root and fetch the websocket URL.
url = self.browser.cdp_url.rstrip('/')
url = self.browser_session.cdp_url.rstrip('/')
if not url.endswith('/json/version'):
url = url + '/json/version'
async with httpx.AsyncClient() as client:
@@ -321,7 +321,7 @@ class DomService:
return {'nodes': merged_nodes}
async def _get_all_trees_for_session_id(self, session_id: str) -> TargetAllTrees:
if not self.browser.cdp_url:
if not self.browser_session.cdp_url:
raise ValueError('CDP URL is not set')
cdp_client = await self._get_cdp_client()
@@ -463,6 +463,7 @@ class DomService:
snapshot_node=snapshot_data,
is_visible=None,
absolute_position=absolute_position,
element_index=None,
)
enhanced_dom_tree_node_lookup[node['nodeId']] = dom_tree_node

View File

@@ -1,7 +1,7 @@
import hashlib
from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import Any
from typing import TYPE_CHECKING, Any
from cdp_use.cdp.accessibility.commands import GetFullAXTreeReturns
from cdp_use.cdp.accessibility.types import AXPropertyName
@@ -225,6 +225,9 @@ class EnhancedDOMTreeNode:
# endregion - Snapshot Node data
# Interactive element index
element_index: int | None = None
uuid: str = field(default_factory=uuid7str)
@property
@@ -319,6 +322,19 @@ class EnhancedDOMTreeNode:
'children_nodes': [c.__json__() for c in self.children_nodes] if self.children_nodes else [],
}
async def create_cdp_session(self, browser_session):
    """Create a CDP session for this node's target.

    Thin delegate: forwards to ``browser_session.create_cdp_session_for_node(self)``
    and returns whatever that coroutine produces.

    Args:
        browser_session: The BrowserSession to use for creating the CDP client.

    Returns:
        CDPClient attached to this node's target (per the original note; the
        concrete type is decided by ``create_cdp_session_for_node``, which is
        not visible here — confirm against that method).

    Note:
        Caller is responsible for cleanup using ``await cdp_client.stop()``.
    """
    return await browser_session.create_cdp_session_for_node(self)
def get_all_children_text(self, max_depth: int = -1) -> str:
text_parts = []
@@ -364,6 +380,9 @@ class EnhancedDOMTreeNode:
def element_hash(self) -> int:
return hash(self)
def __str__(self) -> str:
    """Render a compact label: ``[<tag>#<last 4 chars of frame id, or ?>:<element index>]``."""
    # A missing or empty frame id is shown as a question mark.
    frame_suffix = self.frame_id[-4:] if self.frame_id else "?"
    return f'[<{self.tag_name}>#{frame_suffix}:{self.element_index}]'
def __hash__(self) -> int:
"""
Hash the element based on its parent branch path and attributes.