# Source: browser-use/browser_use/dom/enhanced_snapshot.py
# 182 lines, 7.2 KiB, Python

"""
Enhanced snapshot processing for browser-use DOM tree extraction.
This module provides stateless functions for parsing Chrome DevTools Protocol (CDP) DOMSnapshot data
to extract visibility, clickability, cursor styles, and other layout information.
"""
from cdp_use.cdp.domsnapshot.commands import CaptureSnapshotReturns
from cdp_use.cdp.domsnapshot.types import (
LayoutTreeSnapshot,
NodeTreeSnapshot,
)
from browser_use.dom.views import DOMRect, EnhancedSnapshotNode
# Minimal set of computed CSS properties needed for interactivity and visibility detection.
# Restricted to styles the codebase actually reads (prevents Chrome crashes on heavy sites).
# Order matters: _parse_computed_styles pairs style indices with these names by position.
REQUIRED_COMPUTED_STYLES = [
    # -- visibility detection (service.py) --
    'display',
    'visibility',
    'opacity',
    # -- scrollability detection (views.py) --
    'overflow',
    'overflow-x',
    'overflow-y',
    # -- cursor extraction (enhanced_snapshot.py) --
    'cursor',
    # -- clickability logic --
    'pointer-events',
    # -- visibility logic --
    'position',
    'background-color',
]
def _parse_rare_boolean_data(rare_data_set: set[int], index: int) -> bool | None:
"""Parse rare boolean data from snapshot - returns True if index is in the rare data set."""
return index in rare_data_set
def _parse_computed_styles(strings: list[str], style_indices: list[int]) -> dict[str, str]:
    """Resolve a layout node's style indices into a ``{style name: value}`` dict.

    The n-th entry of ``style_indices`` is paired with the n-th name in
    ``REQUIRED_COMPUTED_STYLES``; each entry is an index into ``strings``.

    Args:
        strings: String table the indices point into.
        style_indices: Per-style string indices, positionally aligned with
            ``REQUIRED_COMPUTED_STYLES``.

    Returns:
        Mapping of style name to string value. Entries whose index falls
        outside ``strings`` are omitted.
    """
    string_count = len(strings)
    resolved: dict[str, str] = {}
    # zip() stops at the shorter input, so surplus indices without a known style name are ignored
    for style_name, string_idx in zip(REQUIRED_COMPUTED_STYLES, style_indices):
        if string_idx < 0 or string_idx >= string_count:
            continue
        resolved[style_name] = strings[string_idx]
    return resolved
def build_snapshot_lookup(
    snapshot: CaptureSnapshotReturns,
    device_pixel_ratio: float = 1.0,
) -> dict[int, EnhancedSnapshotNode]:
    """Build a lookup table of backend node ID to enhanced snapshot data with everything calculated upfront.

    Args:
        snapshot: DOMSnapshot capture result. Its ``documents`` and ``strings`` entries are read.
        device_pixel_ratio: Divisor applied to each layout ``bounds`` rectangle to convert
            device pixels to CSS pixels. ``clientRects`` and ``scrollRects`` are passed through unscaled.

    Returns:
        Dict keyed by backend node ID. Every node listed under ``backendNodeId`` gets an entry;
        fields that need a layout node (bounds, rects, styles, paint order, stacking context)
        stay ``None`` when the node has no layout entry. Nodes from all documents share one dict,
        so a backend node ID seen in a later document overwrites the earlier entry.
    """
    # Local import: only this function needs logging
    import logging

    logger = logging.getLogger('browser_use.dom.enhanced_snapshot')

    snapshot_lookup: dict[int, EnhancedSnapshotNode] = {}

    if not snapshot['documents']:
        return snapshot_lookup

    strings = snapshot['strings']

    logger.debug(f'🔍 SNAPSHOT: Processing {len(snapshot["documents"])} documents with {len(strings)} strings')

    for doc_idx, document in enumerate(snapshot['documents']):
        nodes: NodeTreeSnapshot = document['nodes']
        layout: LayoutTreeSnapshot = document['layout']

        # Build backend node id to snapshot index lookup
        backend_node_to_snapshot_index = {}
        if 'backendNodeId' in nodes:
            for i, backend_node_id in enumerate(nodes['backendNodeId']):
                backend_node_to_snapshot_index[backend_node_id] = i

        # Log document info.
        # The URL index must be bounded on both sides: a negative index (presumably the
        # protocol's marker for a missing string - TODO confirm) would otherwise wrap around
        # to the wrong string, or raise IndexError when the string table is empty.
        url_string_index = document.get('documentURL', 0)
        doc_url = strings[url_string_index] if 0 <= url_string_index < len(strings) else 'N/A'
        logger.debug(
            f'🔍 SNAPSHOT doc[{doc_idx}]: url={doc_url[:80]}... has {len(backend_node_to_snapshot_index)} nodes, '
            f'layout has {len(layout.get("nodeIndex", []))} entries'
        )

        # PERFORMANCE: Pre-build layout index map to eliminate O(n²) double lookups
        # Preserve original behavior: use FIRST occurrence for duplicates
        layout_index_map = {}
        if layout and 'nodeIndex' in layout:
            for layout_idx, node_index in enumerate(layout['nodeIndex']):
                if node_index not in layout_index_map:  # Only store first occurrence
                    layout_index_map[node_index] = layout_idx

        # Pre-convert rare boolean data from list to set for O(1) lookups.
        # The raw CDP data uses List[int] which makes `index in list` O(n).
        # Called once per node, this was O(n²) total — the #1 bottleneck.
        # At 20k elements: 5,925ms (list) → 2ms (set) = 3,000x speedup.
        has_clickable_data = 'isClickable' in nodes
        is_clickable_set: set[int] = set(nodes['isClickable']['index']) if has_clickable_data else set()

        # Per-document layout arrays, fetched once instead of once per node
        bounds_data = layout.get('bounds', [])
        styles_data = layout.get('styles', [])
        paint_orders_data = layout.get('paintOrders', [])
        client_rects_data = layout.get('clientRects', [])
        scroll_rects_data = layout.get('scrollRects', [])
        # 'stackingContexts' is a dict holding an 'index' list; the bounds check below must be
        # made against that list. Checking len() of the dict itself compared against the number
        # of keys and could raise IndexError when the 'index' list was empty.
        stacking_index_data = layout.get('stackingContexts', {}).get('index', [])

        # Build snapshot lookup for each backend node id
        for backend_node_id, snapshot_index in backend_node_to_snapshot_index.items():
            is_clickable = None
            if has_clickable_data:
                is_clickable = _parse_rare_boolean_data(is_clickable_set, snapshot_index)

            # Defaults used when the node has no corresponding layout tree node
            cursor_style = None
            bounding_box = None
            computed_styles = {}
            paint_order = None
            client_rects = None
            scroll_rects = None
            stacking_contexts = None

            # Look for layout tree node that corresponds to this snapshot node
            if snapshot_index in layout_index_map:
                layout_idx = layout_index_map[snapshot_index]
                if layout_idx < len(bounds_data):
                    # Parse bounding box
                    bounds = bounds_data[layout_idx]
                    if len(bounds) >= 4:
                        # IMPORTANT: CDP coordinates are in device pixels, convert to CSS pixels
                        # by dividing by the device pixel ratio
                        raw_x, raw_y, raw_width, raw_height = bounds[0], bounds[1], bounds[2], bounds[3]
                        bounding_box = DOMRect(
                            x=raw_x / device_pixel_ratio,
                            y=raw_y / device_pixel_ratio,
                            width=raw_width / device_pixel_ratio,
                            height=raw_height / device_pixel_ratio,
                        )

                    # Parse computed styles for this layout node
                    if layout_idx < len(styles_data):
                        style_indices = styles_data[layout_idx]
                        computed_styles = _parse_computed_styles(strings, style_indices)
                        cursor_style = computed_styles.get('cursor')

                    # Extract paint order if available
                    if layout_idx < len(paint_orders_data):
                        paint_order = paint_orders_data[layout_idx]

                    # Extract client rects if available
                    if layout_idx < len(client_rects_data):
                        client_rect_data = client_rects_data[layout_idx]
                        if client_rect_data and len(client_rect_data) >= 4:
                            client_rects = DOMRect(
                                x=client_rect_data[0],
                                y=client_rect_data[1],
                                width=client_rect_data[2],
                                height=client_rect_data[3],
                            )

                    # Extract scroll rects if available
                    if layout_idx < len(scroll_rects_data):
                        scroll_rect_data = scroll_rects_data[layout_idx]
                        if scroll_rect_data and len(scroll_rect_data) >= 4:
                            scroll_rects = DOMRect(
                                x=scroll_rect_data[0],
                                y=scroll_rect_data[1],
                                width=scroll_rect_data[2],
                                height=scroll_rect_data[3],
                            )

                    # Extract stacking contexts if available
                    if layout_idx < len(stacking_index_data):
                        stacking_contexts = stacking_index_data[layout_idx]

            snapshot_lookup[backend_node_id] = EnhancedSnapshotNode(
                is_clickable=is_clickable,
                cursor_style=cursor_style,
                bounds=bounding_box,
                clientRects=client_rects,
                scrollRects=scroll_rects,
                computed_styles=computed_styles if computed_styles else None,
                paint_order=paint_order,
                stacking_contexts=stacking_contexts,
            )

    # Count how many have bounds (are actually visible/laid out).
    # The count is only used in the debug message, so skip the full pass when debug is off.
    if logger.isEnabledFor(logging.DEBUG):
        with_bounds = sum(1 for n in snapshot_lookup.values() if n.bounds)
        logger.debug(f'🔍 SNAPSHOT: Built lookup with {len(snapshot_lookup)} total entries, {with_bounds} have bounds')

    return snapshot_lookup