# mirror of https://github.com/browser-use/browser-use (synced 2026-05-06)
"""
|
|
Shared markdown extraction utilities for browser content processing.
|
|
|
|
This module provides a unified interface for extracting clean markdown from browser content,
|
|
used by both the tools service and page actor.
|
|
"""
|
|
|
|
import re
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
from browser_use.dom.serializer.html_serializer import HTMLSerializer
|
|
from browser_use.dom.service import DomService
|
|
|
|
if TYPE_CHECKING:
|
|
from browser_use.browser.session import BrowserSession
|
|
from browser_use.browser.watchdogs.dom_watchdog import DOMWatchdog
|
|
|
|
|
|
async def extract_clean_markdown(
	browser_session: 'BrowserSession | None' = None,
	dom_service: DomService | None = None,
	target_id: str | None = None,
	extract_links: bool = False,
) -> tuple[str, dict[str, Any]]:
	"""Extract clean markdown from browser content using the enhanced DOM tree.

	Exactly one extraction path must be selected: a browser session (tools
	service path) or a DOM service together with a target ID (page actor path).

	Args:
		browser_session: Browser session to extract content from (tools service path)
		dom_service: DOM service instance (page actor path)
		target_id: Target ID for the page (required when using dom_service)
		extract_links: Whether to preserve links in markdown

	Returns:
		tuple: (clean_markdown_content, content_statistics)

	Raises:
		ValueError: If neither browser_session nor (dom_service + target_id) are provided
	"""
	# Work out which extraction path the caller selected.
	if browser_session is not None:
		if dom_service is not None or target_id is not None:
			raise ValueError('Cannot specify both browser_session and dom_service/target_id')
		# Tools-service path: the session's DOM watchdog supplies the tree.
		enhanced_dom_tree = await _get_enhanced_dom_tree_from_browser_session(browser_session)
		current_url = await browser_session.get_current_page_url()
		method = 'enhanced_dom_tree'
	elif dom_service is not None and target_id is not None:
		# Page-actor path: the DOM service builds the tree for the given target.
		enhanced_dom_tree = await dom_service.get_dom_tree(target_id=target_id)
		current_url = None  # URL is not exposed through the DOM service
		method = 'dom_service'
	else:
		raise ValueError('Must provide either browser_session or both dom_service and target_id')

	# Serialize the enhanced DOM tree back to HTML for markdown conversion.
	serializer = HTMLSerializer(extract_links=extract_links)
	page_html = serializer.serialize(enhanced_dom_tree)
	original_html_length = len(page_html)

	# Convert HTML to markdown with markdownify (imported lazily at call time).
	from markdownify import markdownify as md

	content = md(
		page_html,
		heading_style='ATX',  # Use # style headings
		strip=['script', 'style'],  # Remove these tags
		bullets='-',  # Use - for unordered lists
		code_language='',  # Don't add language to code blocks
		escape_asterisks=False,  # Don't escape asterisks (cleaner output)
		escape_underscores=False,  # Don't escape underscores (cleaner output)
		escape_misc=False,  # Don't escape other characters (cleaner output)
		autolinks=False,  # Don't convert URLs to <> format
		default_title=False,  # Don't add default title attributes
		keep_inline_images_in=[],  # Don't keep inline images in any tags (we already filter base64 in HTML)
	)
	initial_markdown_length = len(content)

	# Strip any leftover percent-encoded escapes, then apply light whitespace
	# cleanup and JSON-blob removal.
	content = re.sub(r'%[0-9A-Fa-f]{2}', '', content)
	content, chars_filtered = _preprocess_markdown_content(content)

	# Assemble content statistics for the caller.
	stats: dict[str, Any] = {
		'method': method,
		'original_html_chars': original_html_length,
		'initial_markdown_chars': initial_markdown_length,
		'filtered_chars_removed': chars_filtered,
		'final_filtered_chars': len(content),
	}
	# The URL is only known on the browser-session path.
	if current_url:
		stats['url'] = current_url

	return content, stats


async def _get_enhanced_dom_tree_from_browser_session(browser_session: 'BrowserSession'):
|
|
"""Get enhanced DOM tree from browser session via DOMWatchdog."""
|
|
# Get the enhanced DOM tree from DOMWatchdog
|
|
# This captures the current state of the page including dynamic content, shadow roots, etc.
|
|
dom_watchdog: DOMWatchdog | None = browser_session._dom_watchdog
|
|
assert dom_watchdog is not None, 'DOMWatchdog not available'
|
|
|
|
# Use cached enhanced DOM tree if available, otherwise build it
|
|
if dom_watchdog.enhanced_dom_tree is not None:
|
|
return dom_watchdog.enhanced_dom_tree
|
|
|
|
# Build the enhanced DOM tree if not cached
|
|
await dom_watchdog._build_dom_tree_without_highlights()
|
|
enhanced_dom_tree = dom_watchdog.enhanced_dom_tree
|
|
assert enhanced_dom_tree is not None, 'Enhanced DOM tree not available'
|
|
|
|
return enhanced_dom_tree


# Legacy aliases removed - all code now uses the unified extract_clean_markdown function


def _preprocess_markdown_content(content: str, max_newlines: int = 3) -> tuple[str, int]:
|
|
"""
|
|
Light preprocessing of markdown output - minimal cleanup with JSON blob removal.
|
|
|
|
Args:
|
|
content: Markdown content to lightly filter
|
|
max_newlines: Maximum consecutive newlines to allow
|
|
|
|
Returns:
|
|
tuple: (filtered_content, chars_filtered)
|
|
"""
|
|
original_length = len(content)
|
|
|
|
# Remove JSON blobs (common in SPAs like LinkedIn, Facebook, etc.)
|
|
# These are often embedded as `{"key":"value",...}` and can be massive
|
|
# Match JSON objects/arrays that are at least 100 chars long
|
|
# This catches SPA state/config data without removing small inline JSON
|
|
content = re.sub(r'`\{["\w].*?\}`', '', content, flags=re.DOTALL) # Remove JSON in code blocks
|
|
content = re.sub(r'\{"\$type":[^}]{100,}\}', '', content) # Remove JSON with $type fields (common pattern)
|
|
content = re.sub(r'\{"[^"]{5,}":\{[^}]{100,}\}', '', content) # Remove nested JSON objects
|
|
|
|
# Compress consecutive newlines (4+ newlines become max_newlines)
|
|
content = re.sub(r'\n{4,}', '\n' * max_newlines, content)
|
|
|
|
# Remove lines that are only whitespace or very short (likely artifacts)
|
|
lines = content.split('\n')
|
|
filtered_lines = []
|
|
for line in lines:
|
|
stripped = line.strip()
|
|
# Keep lines with substantial content
|
|
if len(stripped) > 2:
|
|
# Skip lines that look like JSON (start with { or [ and are very long)
|
|
if (stripped.startswith('{') or stripped.startswith('[')) and len(stripped) > 100:
|
|
continue
|
|
filtered_lines.append(line)
|
|
|
|
content = '\n'.join(filtered_lines)
|
|
content = content.strip()
|
|
|
|
chars_filtered = original_length - len(content)
|
|
return content, chars_filtered
|