mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
479 lines
15 KiB
Python
479 lines
15 KiB
Python
# @file purpose: Concise evaluation serializer for DOM trees - optimized for LLM query writing
|
|
|
|
|
|
from browser_use.dom.utils import cap_text_length
|
|
from browser_use.dom.views import (
|
|
EnhancedDOMTreeNode,
|
|
NodeType,
|
|
SimplifiedNode,
|
|
)
|
|
|
|
# Critical attributes for query writing and form interaction
|
|
# NOTE: Removed 'id' and 'class' to force more robust structural selectors
|
|
EVAL_KEY_ATTRIBUTES = [
|
|
'id', # Removed - can have special chars, forces structural selectors
|
|
'class', # Removed - can have special chars like +, forces structural selectors
|
|
'name',
|
|
'type',
|
|
'placeholder',
|
|
'aria-label',
|
|
'role',
|
|
'value',
|
|
# 'href',
|
|
'data-testid',
|
|
'alt', # for images
|
|
'title', # useful for tooltips/link context
|
|
# State attributes (critical for form interaction)
|
|
'checked',
|
|
'selected',
|
|
'disabled',
|
|
'required',
|
|
'readonly',
|
|
# ARIA states
|
|
'aria-expanded',
|
|
'aria-pressed',
|
|
'aria-checked',
|
|
'aria-selected',
|
|
'aria-invalid',
|
|
# Validation attributes (help agents avoid brute force)
|
|
'pattern',
|
|
'min',
|
|
'max',
|
|
'minlength',
|
|
'maxlength',
|
|
'step',
|
|
'aria-valuemin',
|
|
'aria-valuemax',
|
|
'aria-valuenow',
|
|
]
|
|
|
|
# Semantic elements that should always be shown
|
|
SEMANTIC_ELEMENTS = {
|
|
'html', # Always show document root
|
|
'body', # Always show body
|
|
'h1',
|
|
'h2',
|
|
'h3',
|
|
'h4',
|
|
'h5',
|
|
'h6',
|
|
'a',
|
|
'button',
|
|
'input',
|
|
'textarea',
|
|
'select',
|
|
'form',
|
|
'label',
|
|
'nav',
|
|
'header',
|
|
'footer',
|
|
'main',
|
|
'article',
|
|
'section',
|
|
'table',
|
|
'thead',
|
|
'tbody',
|
|
'tr',
|
|
'th',
|
|
'td',
|
|
'ul',
|
|
'ol',
|
|
'li',
|
|
'img',
|
|
'iframe',
|
|
'video',
|
|
'audio',
|
|
}
|
|
|
|
# Container elements that can be collapsed if they only wrap one child
|
|
COLLAPSIBLE_CONTAINERS = {'div', 'span', 'section', 'article'}
|
|
|
|
# SVG child elements to skip (decorative only, no interaction value)
|
|
SVG_ELEMENTS = {
|
|
'path',
|
|
'rect',
|
|
'g',
|
|
'circle',
|
|
'ellipse',
|
|
'line',
|
|
'polyline',
|
|
'polygon',
|
|
'use',
|
|
'defs',
|
|
'clipPath',
|
|
'mask',
|
|
'pattern',
|
|
'image',
|
|
'text',
|
|
'tspan',
|
|
}
|
|
|
|
|
|
class DOMEvalSerializer:
|
|
"""Ultra-concise DOM serializer for quick LLM query writing."""
|
|
|
|
@staticmethod
|
|
def serialize_tree(node: SimplifiedNode | None, include_attributes: list[str], depth: int = 0) -> str:
|
|
"""
|
|
Serialize complete DOM tree structure for LLM understanding.
|
|
|
|
Strategy:
|
|
- Show ALL elements to preserve DOM structure
|
|
- Non-interactive elements show just tag name
|
|
- Interactive elements show full attributes + [index]
|
|
- Self-closing tags only (no closing tags)
|
|
"""
|
|
if not node:
|
|
return ''
|
|
|
|
# Skip excluded nodes but process children
|
|
if hasattr(node, 'excluded_by_parent') and node.excluded_by_parent:
|
|
return DOMEvalSerializer._serialize_children(node, include_attributes, depth)
|
|
|
|
# Skip nodes marked as should_display=False
|
|
if not node.should_display:
|
|
return DOMEvalSerializer._serialize_children(node, include_attributes, depth)
|
|
|
|
formatted_text = []
|
|
depth_str = depth * '\t'
|
|
|
|
if node.original_node.node_type == NodeType.ELEMENT_NODE:
|
|
tag = node.original_node.tag_name.lower()
|
|
is_visible = node.original_node.snapshot_node and node.original_node.is_visible
|
|
|
|
# Container elements that should be shown even if invisible (might have visible children)
|
|
container_tags = {'html', 'body', 'div', 'main', 'section', 'article', 'aside', 'header', 'footer', 'nav'}
|
|
|
|
# Skip invisible elements UNLESS they're containers or iframes (which might have visible children)
|
|
if not is_visible and tag not in container_tags and tag not in ['iframe', 'frame']:
|
|
return DOMEvalSerializer._serialize_children(node, include_attributes, depth)
|
|
|
|
# Special handling for iframes - show them with their content
|
|
if tag in ['iframe', 'frame']:
|
|
return DOMEvalSerializer._serialize_iframe(node, include_attributes, depth)
|
|
|
|
# Skip SVG elements entirely - they're just decorative graphics with no interaction value
|
|
# Show the <svg> tag itself to indicate graphics, but don't recurse into children
|
|
if tag == 'svg':
|
|
line = f'{depth_str}'
|
|
# Add [i_X] for interactive SVG elements only
|
|
if node.is_interactive:
|
|
line += f'[i_{node.original_node.backend_node_id}] '
|
|
line += '<svg'
|
|
attributes_str = DOMEvalSerializer._build_compact_attributes(node.original_node)
|
|
if attributes_str:
|
|
line += f' {attributes_str}'
|
|
line += ' /> <!-- SVG content collapsed -->'
|
|
return line
|
|
|
|
# Skip SVG child elements entirely (path, rect, g, circle, etc.)
|
|
if tag in SVG_ELEMENTS:
|
|
return ''
|
|
|
|
# Build compact attributes string
|
|
attributes_str = DOMEvalSerializer._build_compact_attributes(node.original_node)
|
|
|
|
# Decide if this element should be shown
|
|
is_semantic = tag in SEMANTIC_ELEMENTS
|
|
has_useful_attrs = bool(attributes_str)
|
|
has_text_content = DOMEvalSerializer._has_direct_text(node)
|
|
has_children = len(node.children) > 0
|
|
|
|
# Build compact element representation
|
|
line = f'{depth_str}'
|
|
# Add backend node ID notation - [i_X] for interactive elements only
|
|
if node.is_interactive:
|
|
line += f'[i_{node.original_node.backend_node_id}] '
|
|
# Non-interactive elements don't get an index notation
|
|
line += f'<{tag}'
|
|
|
|
if attributes_str:
|
|
line += f' {attributes_str}'
|
|
|
|
# Add scroll info if element is scrollable
|
|
if node.original_node.should_show_scroll_info:
|
|
scroll_text = node.original_node.get_scroll_info_text()
|
|
if scroll_text:
|
|
line += f' scroll="{scroll_text}"'
|
|
|
|
# Add inline text if present (keep it on same line for compactness)
|
|
inline_text = DOMEvalSerializer._get_inline_text(node)
|
|
|
|
# For containers (html, body, div, etc.), always show children even if there's inline text
|
|
# For other elements, inline text replaces children (more compact)
|
|
is_container = tag in container_tags
|
|
|
|
if inline_text and not is_container:
|
|
line += f'>{inline_text}'
|
|
else:
|
|
line += ' />'
|
|
|
|
formatted_text.append(line)
|
|
|
|
# Process children (always for containers, only if no inline_text for others)
|
|
if has_children and (is_container or not inline_text):
|
|
children_text = DOMEvalSerializer._serialize_children(node, include_attributes, depth + 1)
|
|
if children_text:
|
|
formatted_text.append(children_text)
|
|
|
|
elif node.original_node.node_type == NodeType.TEXT_NODE:
|
|
# Text nodes are handled inline with their parent
|
|
pass
|
|
|
|
elif node.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
|
|
# Shadow DOM - just show children directly with minimal marker
|
|
if node.children:
|
|
formatted_text.append(f'{depth_str}#shadow')
|
|
children_text = DOMEvalSerializer._serialize_children(node, include_attributes, depth + 1)
|
|
if children_text:
|
|
formatted_text.append(children_text)
|
|
|
|
return '\n'.join(formatted_text)
|
|
|
|
@staticmethod
|
|
def _serialize_children(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
|
|
"""Helper to serialize all children of a node."""
|
|
children_output = []
|
|
|
|
# Check if parent is a list container (ul, ol)
|
|
is_list_container = node.original_node.node_type == NodeType.ELEMENT_NODE and node.original_node.tag_name.lower() in [
|
|
'ul',
|
|
'ol',
|
|
]
|
|
|
|
# Track list items and consecutive links
|
|
li_count = 0
|
|
max_list_items = 50
|
|
consecutive_link_count = 0
|
|
max_consecutive_links = 50
|
|
total_links_skipped = 0
|
|
|
|
for child in node.children:
|
|
# Get tag name for this child
|
|
current_tag = None
|
|
if child.original_node.node_type == NodeType.ELEMENT_NODE:
|
|
current_tag = child.original_node.tag_name.lower()
|
|
|
|
# If we're in a list container and this child is an li element
|
|
if is_list_container and current_tag == 'li':
|
|
li_count += 1
|
|
# Skip li elements after the 5th one
|
|
if li_count > max_list_items:
|
|
continue
|
|
|
|
# Track consecutive anchor tags (links)
|
|
if current_tag == 'a':
|
|
consecutive_link_count += 1
|
|
# Skip links after the 5th consecutive one
|
|
if consecutive_link_count > max_consecutive_links:
|
|
total_links_skipped += 1
|
|
continue
|
|
else:
|
|
# Reset counter when we hit a non-link element
|
|
# But first add truncation message if we skipped links
|
|
if total_links_skipped > 0:
|
|
depth_str = depth * '\t'
|
|
children_output.append(f'{depth_str}... ({total_links_skipped} more links in this list)')
|
|
total_links_skipped = 0
|
|
consecutive_link_count = 0
|
|
|
|
child_text = DOMEvalSerializer.serialize_tree(child, include_attributes, depth)
|
|
if child_text:
|
|
children_output.append(child_text)
|
|
|
|
# Add truncation message if we skipped items at the end
|
|
if is_list_container and li_count > max_list_items:
|
|
depth_str = depth * '\t'
|
|
children_output.append(
|
|
f'{depth_str}... ({li_count - max_list_items} more items in this list (truncated) use evaluate to get more.'
|
|
)
|
|
|
|
# Add truncation message for links if we skipped any at the end
|
|
if total_links_skipped > 0:
|
|
depth_str = depth * '\t'
|
|
children_output.append(
|
|
f'{depth_str}... ({total_links_skipped} more links in this list) (truncated) use evaluate to get more.'
|
|
)
|
|
|
|
return '\n'.join(children_output)
|
|
|
|
@staticmethod
|
|
def _build_compact_attributes(node: EnhancedDOMTreeNode) -> str:
|
|
"""Build ultra-compact attributes string with only key attributes."""
|
|
attrs = []
|
|
|
|
# Prioritize attributes that help with query writing
|
|
if node.attributes:
|
|
for attr in EVAL_KEY_ATTRIBUTES:
|
|
if attr in node.attributes:
|
|
value = str(node.attributes[attr]).strip()
|
|
if not value:
|
|
continue
|
|
|
|
# Special handling for different attributes
|
|
if attr == 'class':
|
|
# For class, limit to first 2 classes to save space
|
|
classes = value.split()[:3]
|
|
value = ' '.join(classes)
|
|
elif attr == 'href':
|
|
# For href, cap at 20 chars to save space
|
|
value = cap_text_length(value, 80)
|
|
else:
|
|
# Cap at 25 chars for other attributes
|
|
value = cap_text_length(value, 80)
|
|
|
|
attrs.append(f'{attr}="{value}"')
|
|
|
|
# Note: We intentionally don't add role from ax_node here because:
|
|
# 1. If role is explicitly set in HTML, it's already captured above via EVAL_KEY_ATTRIBUTES
|
|
# 2. Inferred roles from AX tree (like link, listitem, LineBreak) are redundant with the tag name
|
|
# 3. This reduces noise - <a href="..." role="link"> is redundant, we already know <a> is a link
|
|
|
|
return ' '.join(attrs)
|
|
|
|
@staticmethod
|
|
def _has_direct_text(node: SimplifiedNode) -> bool:
|
|
"""Check if node has direct text children (not nested in other elements)."""
|
|
for child in node.children:
|
|
if child.original_node.node_type == NodeType.TEXT_NODE:
|
|
text = child.original_node.node_value.strip() if child.original_node.node_value else ''
|
|
if len(text) > 1:
|
|
return True
|
|
return False
|
|
|
|
@staticmethod
|
|
def _get_inline_text(node: SimplifiedNode) -> str:
|
|
"""Get text content to display inline (max 40 chars)."""
|
|
text_parts = []
|
|
for child in node.children:
|
|
if child.original_node.node_type == NodeType.TEXT_NODE:
|
|
text = child.original_node.node_value.strip() if child.original_node.node_value else ''
|
|
if text and len(text) > 1:
|
|
text_parts.append(text)
|
|
|
|
if not text_parts:
|
|
return ''
|
|
|
|
combined = ' '.join(text_parts)
|
|
return cap_text_length(combined, 80)
|
|
|
|
@staticmethod
|
|
def _serialize_iframe(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
|
|
"""Handle iframe serialization with content document."""
|
|
formatted_text = []
|
|
depth_str = depth * '\t'
|
|
tag = node.original_node.tag_name.lower()
|
|
|
|
# Build minimal iframe marker with key attributes
|
|
attributes_str = DOMEvalSerializer._build_compact_attributes(node.original_node)
|
|
line = f'{depth_str}<{tag}'
|
|
if attributes_str:
|
|
line += f' {attributes_str}'
|
|
|
|
# Add scroll info for iframe content
|
|
if node.original_node.should_show_scroll_info:
|
|
scroll_text = node.original_node.get_scroll_info_text()
|
|
if scroll_text:
|
|
line += f' scroll="{scroll_text}"'
|
|
|
|
line += ' />'
|
|
formatted_text.append(line)
|
|
|
|
# If iframe has content document, serialize its content
|
|
if node.original_node.content_document:
|
|
# Add marker for iframe content
|
|
formatted_text.append(f'{depth_str}\t#iframe-content')
|
|
|
|
# Process content document children
|
|
for child_node in node.original_node.content_document.children_nodes or []:
|
|
# Process html documents
|
|
if child_node.tag_name.lower() == 'html':
|
|
# Find and serialize body content only (skip head)
|
|
for html_child in child_node.children:
|
|
if html_child.tag_name.lower() == 'body':
|
|
for body_child in html_child.children:
|
|
# Recursively process body children (iframe content)
|
|
DOMEvalSerializer._serialize_document_node(
|
|
body_child, formatted_text, include_attributes, depth + 2, is_iframe_content=True
|
|
)
|
|
break # Stop after processing body
|
|
else:
|
|
# Not an html element - serialize directly
|
|
DOMEvalSerializer._serialize_document_node(
|
|
child_node, formatted_text, include_attributes, depth + 1, is_iframe_content=True
|
|
)
|
|
|
|
return '\n'.join(formatted_text)
|
|
|
|
@staticmethod
|
|
def _serialize_document_node(
|
|
dom_node: EnhancedDOMTreeNode,
|
|
output: list[str],
|
|
include_attributes: list[str],
|
|
depth: int,
|
|
is_iframe_content: bool = True,
|
|
) -> None:
|
|
"""Helper to serialize a document node without SimplifiedNode wrapper.
|
|
|
|
Args:
|
|
is_iframe_content: If True, be more permissive with visibility checks since
|
|
iframe content might not have snapshot data from parent page.
|
|
"""
|
|
depth_str = depth * '\t'
|
|
|
|
if dom_node.node_type == NodeType.ELEMENT_NODE:
|
|
tag = dom_node.tag_name.lower()
|
|
|
|
# For iframe content, be permissive - show all semantic elements even without snapshot data
|
|
# For regular content, skip invisible elements
|
|
if is_iframe_content:
|
|
# Only skip if we have snapshot data AND it's explicitly invisible
|
|
# If no snapshot data, assume visible (cross-origin iframe content)
|
|
is_visible = (not dom_node.snapshot_node) or dom_node.is_visible
|
|
else:
|
|
# Regular strict visibility check
|
|
is_visible = dom_node.snapshot_node and dom_node.is_visible
|
|
|
|
if not is_visible:
|
|
return
|
|
|
|
# Check if semantic or has useful attributes
|
|
is_semantic = tag in SEMANTIC_ELEMENTS
|
|
attributes_str = DOMEvalSerializer._build_compact_attributes(dom_node)
|
|
|
|
if not is_semantic and not attributes_str:
|
|
# Skip but process children
|
|
for child in dom_node.children:
|
|
DOMEvalSerializer._serialize_document_node(
|
|
child, output, include_attributes, depth, is_iframe_content=is_iframe_content
|
|
)
|
|
return
|
|
|
|
# Build element line
|
|
line = f'{depth_str}<{tag}'
|
|
if attributes_str:
|
|
line += f' {attributes_str}'
|
|
|
|
# Get direct text content
|
|
text_parts = []
|
|
for child in dom_node.children:
|
|
if child.node_type == NodeType.TEXT_NODE and child.node_value:
|
|
text = child.node_value.strip()
|
|
if text and len(text) > 1:
|
|
text_parts.append(text)
|
|
|
|
if text_parts:
|
|
combined = ' '.join(text_parts)
|
|
line += f'>{cap_text_length(combined, 100)}'
|
|
else:
|
|
line += ' />'
|
|
|
|
output.append(line)
|
|
|
|
# Process non-text children
|
|
for child in dom_node.children:
|
|
if child.node_type != NodeType.TEXT_NODE:
|
|
DOMEvalSerializer._serialize_document_node(
|
|
child, output, include_attributes, depth + 1, is_iframe_content=is_iframe_content
|
|
)
|