mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
213 lines
5.9 KiB
Python
213 lines
5.9 KiB
Python
# @file purpose: Serializes enhanced DOM trees to HTML format including shadow roots
|
|
|
|
from browser_use.dom.views import EnhancedDOMTreeNode, NodeType
|
|
|
|
|
|
class HTMLSerializer:
    """Serializes enhanced DOM trees back to HTML format.

    This serializer reconstructs HTML from the enhanced DOM tree, including:
    - Shadow DOM content (both open and closed)
    - Iframe content documents
    - All attributes and text nodes
    - Proper HTML structure

    Unlike getOuterHTML which only captures light DOM, this captures the full
    enhanced tree including shadow roots that are crucial for modern SPAs.
    """

    # Elements dropped entirely (together with their subtree) because they
    # carry no visible page content.
    _SKIPPED_TAGS = frozenset({'style', 'script', 'head', 'meta', 'link', 'title'})

    # Void elements: emitted as a single self-closed tag, children never visited.
    _VOID_ELEMENTS = frozenset(
        {
            'area',
            'base',
            'br',
            'col',
            'embed',
            'hr',
            'img',
            'input',
            'link',
            'meta',
            'param',
            'source',
            'track',
            'wbr',
        }
    )

    # Substrings of a <code> element's id that mark it as an embedded JSON/state
    # blob ('bpr-guid' is LinkedIn's JSON data pattern).
    _CODE_ID_MARKERS = ('bpr-guid', 'data', 'state')

    def __init__(self, extract_links: bool = False):
        """Initialize the HTML serializer.

        Args:
            extract_links: If True, preserves all links. If False, removes href attributes.
        """
        self.extract_links = extract_links

    def serialize(self, node: EnhancedDOMTreeNode, depth: int = 0) -> str:
        """Serialize an enhanced DOM tree node to HTML.

        Args:
            node: The enhanced DOM tree node to serialize
            depth: Current depth in the tree (internal use; it is only passed
                along to recursive calls and does not affect the output)

        Returns:
            HTML string representation of the node and its descendants.
            Comment nodes and unknown node types serialize to ''.
        """
        node_type = node.node_type

        if node_type == NodeType.DOCUMENT_NODE:
            # Document root - emit all children (and shadow roots) at the same depth.
            return self._serialize_nodes(node.children_and_shadow_roots, depth)

        if node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
            # Shadow DOM root - wrap the shadow children in a <template> element.
            # NOTE(review): the standardized declarative shadow DOM attribute is
            # `shadowrootmode`; the emitted name is kept unchanged here so the
            # output stays identical for existing consumers - confirm which is wanted.
            shadow_type = node.shadow_root_type or 'open'
            inner = self._serialize_nodes(node.children, depth + 1)
            return f'<template shadowroot="{shadow_type.lower()}">{inner}</template>'

        if node_type == NodeType.ELEMENT_NODE:
            return self._serialize_element(node, depth)

        if node_type == NodeType.TEXT_NODE:
            # Text content is emitted with basic HTML escaping.
            return self._escape_html(node.node_value) if node.node_value else ''

        # Comments are skipped to reduce noise; unknown node types are skipped too.
        return ''

    def _serialize_nodes(self, nodes: list[EnhancedDOMTreeNode], depth: int) -> str:
        """Serialize each node in ``nodes`` at ``depth`` and concatenate the results."""
        return ''.join(self.serialize(child, depth) for child in nodes)

    def _serialize_element(self, node: EnhancedDOMTreeNode, depth: int) -> str:
        """Serialize a single element node, or return '' if it is filtered out."""
        tag_name = node.tag_name.lower()
        attributes = node.attributes

        # Skip non-content elements.
        if tag_name in self._SKIPPED_TAGS:
            return ''

        # Skip <code> tags that look like embedded JSON state for SPAs.
        if tag_name == 'code' and attributes:
            # Hidden via inline style; stripping spaces makes the check match
            # both 'display:none' and 'display: none'.
            if 'display:none' in attributes.get('style', '').replace(' ', ''):
                return ''
            element_id = attributes.get('id', '')
            if any(marker in element_id for marker in self._CODE_ID_MARKERS):
                return ''

        # Skip base64 inline images - these are usually placeholders or tracking pixels.
        if tag_name == 'img' and attributes and attributes.get('src', '').startswith('data:image/'):
            return ''

        # Opening tag plus any attributes that survive filtering.
        parts = [f'<{tag_name}']
        if attributes:
            attrs = self._serialize_attributes(attributes)
            if attrs:
                parts.append(' ' + attrs)

        # Void elements are self-closed and have no children or closing tag.
        if tag_name in self._VOID_ELEMENTS:
            parts.append(' />')
            return ''.join(parts)

        parts.append('>')

        if tag_name in {'iframe', 'frame'} and node.content_document:
            # Inline the frame's content document in place of the element's own children.
            parts.append(self._serialize_nodes(node.content_document.children_nodes or [], depth + 1))
        else:
            # Shadow roots FIRST (for declarative shadow DOM) ...
            if node.shadow_roots:
                parts.append(self._serialize_nodes(node.shadow_roots, depth + 1))
            # ... then the light DOM children (for slot projection).
            parts.append(self._serialize_nodes(node.children, depth + 1))

        # Closing tag
        parts.append(f'</{tag_name}>')
        return ''.join(parts)

    def _serialize_attributes(self, attributes: dict[str, str]) -> str:
        """Serialize element attributes to HTML attribute string.

        ``href`` is dropped unless ``self.extract_links`` is set, and every
        ``data-*`` attribute is dropped. Attributes whose value is '' or None
        are emitted as bare boolean attributes.

        Args:
            attributes: Dictionary of attribute names to values

        Returns:
            HTML attribute string (e.g., 'class="foo" id="bar"')
        """
        parts = []
        for key, value in attributes.items():
            # Skip href if not extracting links
            if key == 'href' and not self.extract_links:
                continue

            # Skip data-* attributes as they often contain JSON payloads
            # used by modern SPAs (React, Vue, Angular) for state management
            if key.startswith('data-'):
                continue

            if value is None or value == '':
                # Boolean attribute: name only, no value
                parts.append(key)
            else:
                parts.append(f'{key}="{self._escape_attribute(value)}"')

        return ' '.join(parts)

    def _escape_html(self, text: str) -> str:
        """Escape HTML special characters in text content.

        Args:
            text: Raw text content

        Returns:
            Text with ``&``, ``<`` and ``>`` replaced by ``&amp;``, ``&lt;`` and ``&gt;``
        """
        # '&' must be replaced first so the entities produced below are not re-escaped.
        return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    def _escape_attribute(self, value: str) -> str:
        """Escape HTML special characters in attribute values.

        Args:
            value: Raw attribute value

        Returns:
            Value with ``&``, ``<``, ``>``, ``"`` and ``'`` replaced by
            ``&amp;``, ``&lt;``, ``&gt;``, ``&quot;`` and ``&#x27;``
        """
        # '&' must be replaced first so the entities produced below are not re-escaped.
        return (
            value.replace('&', '&amp;')
            .replace('<', '&lt;')
            .replace('>', '&gt;')
            .replace('"', '&quot;')
            .replace("'", '&#x27;')
        )
|