Merge branch 'main' into oracle_oci_integration

This commit is contained in:
Talapally Sandeep Kumar
2025-09-18 00:20:39 +05:30
committed by GitHub
8 changed files with 216 additions and 77 deletions

View File

@@ -2,6 +2,7 @@ import importlib.resources
from datetime import datetime
from typing import TYPE_CHECKING, Literal, Optional
from browser_use.dom.views import NodeType, SimplifiedNode
from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL, SystemMessage, UserMessage
from browser_use.observability import observe_debug
from browser_use.utils import is_new_tab_page
@@ -112,8 +113,93 @@ class AgentMessagePrompt:
self.sample_images = sample_images or []
assert self.browser_state
def _extract_page_statistics(self) -> dict[str, int]:
    """Extract high-level page statistics from the simplified DOM tree for LLM context.

    Walks the tree rooted at ``self.browser_state.dom_state._root`` and counts
    every node that has an ``original_node``. Nodes without one are skipped
    together with their subtree.

    Returns:
        A dict with the keys ``links``, ``iframes``, ``shadow_open``,
        ``shadow_closed``, ``scroll_containers``, ``images``,
        ``interactive_elements`` and ``total_elements``. All counters are zero
        when there is no DOM state or no root node.
    """
    stats = {
        'links': 0,
        'iframes': 0,
        'shadow_open': 0,
        'shadow_closed': 0,
        'scroll_containers': 0,
        'images': 0,
        'interactive_elements': 0,
        'total_elements': 0,
    }

    dom_state = self.browser_state.dom_state
    if not dom_state or not dom_state._root:
        return stats

    # Explicit stack instead of recursion: the counters do not depend on visit
    # order, and this keeps very deep trees from hitting RecursionError.
    pending = [dom_state._root]
    while pending:
        node = pending.pop()
        if not node or not node.original_node:
            continue

        original = node.original_node
        # Every node with an original node counts here, not only element nodes.
        stats['total_elements'] += 1

        if original.node_type == NodeType.ELEMENT_NODE:
            tag = original.tag_name.lower() if original.tag_name else ''
            if tag == 'a':
                stats['links'] += 1
            elif tag in ('iframe', 'frame'):
                stats['iframes'] += 1
            elif tag == 'img':
                stats['images'] += 1

            if original.is_actually_scrollable:
                stats['scroll_containers'] += 1

            if node.interactive_index is not None:
                stats['interactive_elements'] += 1

            if node.is_shadow_host:
                # Shadow roots are counted once, at the host. A host counts as
                # "closed" if any of its document-fragment children is a closed
                # shadow root, otherwise as "open". Children without an
                # original node are ignored here, matching the traversal guard.
                has_closed_shadow = any(
                    child
                    and child.original_node
                    and child.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE
                    and child.original_node.shadow_root_type
                    and child.original_node.shadow_root_type.lower() == 'closed'
                    for child in node.children
                )
                if has_closed_shadow:
                    stats['shadow_closed'] += 1
                else:
                    stats['shadow_open'] += 1

        # Document-fragment (shadow root) nodes add nothing beyond the total;
        # their children are still visited below.
        pending.extend(node.children)

    return stats
@observe_debug(ignore_input=True, ignore_output=True, name='_get_browser_state_description')
def _get_browser_state_description(self) -> str:
# Extract page statistics first
page_stats = self._extract_page_statistics()
# Format statistics for LLM
stats_text = '<page_stats>'
if page_stats['total_elements'] < 10:
stats_text += 'Page appears empty (SPA not loaded?) - '
stats_text += f'{page_stats["links"]} links, {page_stats["interactive_elements"]} interactive, '
stats_text += f'{page_stats["iframes"]} iframes, {page_stats["scroll_containers"]} scroll containers'
if page_stats['shadow_open'] > 0 or page_stats['shadow_closed'] > 0:
stats_text += f', {page_stats["shadow_open"]} shadow(open), {page_stats["shadow_closed"]} shadow(closed)'
if page_stats['images'] > 0:
stats_text += f', {page_stats["images"]} images'
stats_text += f', {page_stats["total_elements"]} total elements'
stats_text += '</page_stats>\n\n'
elements_text = self.browser_state.dom_state.llm_representation(include_attributes=self.include_attributes)
if len(elements_text) > self.max_clickable_elements_length:
@@ -122,9 +208,8 @@ class AgentMessagePrompt:
else:
truncated_text = ''
has_content_above = (self.browser_state.pixels_above or 0) > 0
has_content_below = (self.browser_state.pixels_below or 0) > 0
has_content_above = False
has_content_below = False
# Enhanced page information for the model
page_info_text = ''
if self.browser_state.page_info:
@@ -132,10 +217,11 @@ class AgentMessagePrompt:
# Compute page statistics dynamically
pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0
pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
has_content_above = pages_above > 0
has_content_below = pages_below > 0
total_pages = pi.page_height / pi.viewport_height if pi.viewport_height > 0 else 0
current_page_position = pi.scroll_y / max(pi.page_height - pi.viewport_height, 1)
page_info_text = '<page_info>'
page_info_text += f'Viewport size: {pi.viewport_width}x{pi.viewport_height}px, Total page size: {pi.page_width}x{pi.page_height}px, '
page_info_text += f'{pages_above:.1f} pages above, '
page_info_text += f'{pages_below:.1f} pages below, '
page_info_text += f'{total_pages:.1f} total pages'
@@ -146,18 +232,14 @@ class AgentMessagePrompt:
if self.browser_state.page_info:
pi = self.browser_state.page_info
pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0
elements_text = f'... {self.browser_state.pixels_above} pixels above ({pages_above:.1f} pages) - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}'
else:
elements_text = f'... {self.browser_state.pixels_above} pixels above - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}'
elements_text = f'... {pages_above:.1f} pages above - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}'
else:
elements_text = f'[Start of page]\n{elements_text}'
if has_content_below:
if self.browser_state.page_info:
pi = self.browser_state.page_info
pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
elements_text = f'{elements_text}\n... {self.browser_state.pixels_below} pixels below ({pages_below:.1f} pages) - scroll to see more or extract structured data if you are looking for specific information ...'
else:
elements_text = f'{elements_text}\n... {self.browser_state.pixels_below} pixels below - scroll to see more or extract structured data if you are looking for specific information ...'
elements_text = f'{elements_text}\n... {pages_below:.1f} pages below - scroll to see more or extract structured data if you are looking for specific information ...'
else:
elements_text = f'{elements_text}\n[End of page]'
else:
@@ -190,7 +272,7 @@ class AgentMessagePrompt:
if self.include_recent_events and self.browser_state.recent_events:
recent_events_text = f'Recent browser events: {self.browser_state.recent_events}\n'
browser_state = f"""{current_tab_text}
browser_state = f"""{stats_text}{current_tab_text}
Available tabs:
{tabs_text}
{page_info_text}
@@ -205,9 +287,6 @@ Available tabs:
else:
step_info_description = ''
time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
step_info_description += f'Current date and time: {time_str}'
time_str = datetime.now().strftime('%Y-%m-%d')
step_info_description += f'Current date: {time_str}'

View File

@@ -71,7 +71,7 @@ class DefaultActionWatchdog(BaseWatchdog):
msg = f'Downloaded file to {download_path}'
self.logger.info(f'💾 {msg}')
else:
msg = f'Clicked button with index {index_for_logging}: {element_node.get_all_children_text(max_depth=2)}'
msg = f'Clicked button {element_node.node_name}: {element_node.get_all_children_text(max_depth=2)}'
self.logger.debug(f'🖱️ {msg}')
self.logger.debug(f'Element xpath: {element_node.xpath}')
@@ -1912,7 +1912,7 @@ class DefaultActionWatchdog(BaseWatchdog):
self.logger.error(msg)
raise BrowserError(message=msg, long_term_memory=msg)
except Exception as e:
msg = f'Failed to get dropdown options for element with index {index_for_logging}'
msg = 'Failed to get dropdown options'
error_msg = f'{msg}: {str(e)}'
self.logger.error(error_msg)
raise BrowserError(

View File

@@ -137,13 +137,16 @@ class DOMTreeSerializer:
return None
if node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
# Super simple pass-through for shadow DOM elements
# ENHANCED shadow DOM processing - always include shadow content
simplified = SimplifiedNode(original_node=node, children=[])
for child in node.children_and_shadow_roots:
simplified_child = self._create_simplified_tree(child, depth + 1)
if simplified_child:
simplified.children.append(simplified_child)
return simplified
# Always return shadow DOM fragments, even if children seem empty
# Shadow DOM often contains the actual interactive content in SPAs
return simplified if simplified.children else SimplifiedNode(original_node=node, children=[])
elif node.node_type == NodeType.ELEMENT_NODE:
# Skip non-content elements
@@ -161,19 +164,26 @@ class DOMTreeSerializer:
is_visible = node.is_visible
is_scrollable = node.is_actually_scrollable
has_shadow_content = bool(node.children_and_shadow_roots)
# Include if interactive (regardless of visibility), or scrollable, or has children to process
# ENHANCED SHADOW DOM DETECTION: Include shadow hosts even if not visible
is_shadow_host = any(child.node_type == NodeType.DOCUMENT_FRAGMENT_NODE for child in node.children_and_shadow_roots)
if is_visible or is_scrollable or bool(node.children_and_shadow_roots):
simplified = SimplifiedNode(original_node=node, children=[])
# simplified._analysis = analysis # Store analysis for grouping
# Include if interactive (regardless of visibility), scrollable, has children, or is shadow host
if is_visible or is_scrollable or has_shadow_content or is_shadow_host:
simplified = SimplifiedNode(original_node=node, children=[], is_shadow_host=is_shadow_host)
# Process children
# Process ALL children including shadow roots with enhanced logging
for child in node.children_and_shadow_roots:
simplified_child = self._create_simplified_tree(child, depth + 1)
if simplified_child:
simplified.children.append(simplified_child)
# SHADOW DOM SPECIAL CASE: Always include shadow hosts even if not visible
# Many SPA frameworks (React, Vue) render content in shadow DOM
if is_shadow_host and simplified.children:
return simplified
# Return if meaningful or has meaningful children
if is_visible or is_scrollable or simplified.children:
return simplified
@@ -449,23 +459,34 @@ class DOMTreeSerializer:
# Build attributes string
attributes_html_str = DOMTreeSerializer._build_attributes_string(node.original_node, include_attributes, '')
# Build the line
# Build the line with shadow host indicator
shadow_prefix = ''
if node.is_shadow_host:
# Check if any shadow children are closed
has_closed_shadow = any(
child.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE
and child.original_node.shadow_root_type
and child.original_node.shadow_root_type.lower() == 'closed'
for child in node.children
)
shadow_prefix = '|SHADOW(closed)|' if has_closed_shadow else '|SHADOW(open)|'
if should_show_scroll and node.interactive_index is None:
# Scrollable container but not clickable
line = f'{depth_str}|SCROLL|<{node.original_node.tag_name}'
line = f'{depth_str}{shadow_prefix}|SCROLL|<{node.original_node.tag_name}'
elif node.interactive_index is not None:
# Clickable (and possibly scrollable)
new_prefix = '*' if node.is_new else ''
scroll_prefix = '|SCROLL+' if should_show_scroll else '['
line = f'{depth_str}{new_prefix}{scroll_prefix}{node.interactive_index}]<{node.original_node.tag_name}'
line = f'{depth_str}{shadow_prefix}{new_prefix}{scroll_prefix}{node.interactive_index}]<{node.original_node.tag_name}'
elif node.original_node.tag_name.upper() == 'IFRAME':
# Iframe element (not interactive)
line = f'{depth_str}|IFRAME|<{node.original_node.tag_name}'
line = f'{depth_str}{shadow_prefix}|IFRAME|<{node.original_node.tag_name}'
elif node.original_node.tag_name.upper() == 'FRAME':
# Frame element (not interactive)
line = f'{depth_str}|FRAME|<{node.original_node.tag_name}'
line = f'{depth_str}{shadow_prefix}|FRAME|<{node.original_node.tag_name}'
else:
line = f'{depth_str}<{node.original_node.tag_name}'
line = f'{depth_str}{shadow_prefix}<{node.original_node.tag_name}'
if attributes_html_str:
line += f' {attributes_html_str}'
@@ -480,6 +501,25 @@ class DOMTreeSerializer:
formatted_text.append(line)
elif node.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
# Shadow DOM representation - show clearly to LLM
if node.original_node.shadow_root_type and node.original_node.shadow_root_type.lower() == 'closed':
formatted_text.append(f'{depth_str}▼ Shadow Content (Closed)')
else:
formatted_text.append(f'{depth_str}▼ Shadow Content (Open)')
next_depth += 1
# Process shadow DOM children
for child in node.children:
child_text = DOMTreeSerializer.serialize_tree(child, include_attributes, next_depth)
if child_text:
formatted_text.append(child_text)
# Close shadow DOM indicator
if node.children: # Only show close if we had content
formatted_text.append(f'{depth_str}▲ Shadow Content End')
elif node.original_node.node_type == NodeType.TEXT_NODE:
# Include visible text
is_visible = node.original_node.snapshot_node and node.original_node.is_visible
@@ -492,11 +532,12 @@ class DOMTreeSerializer:
clean_text = node.original_node.node_value.strip()
formatted_text.append(f'{depth_str}{clean_text}')
# Process children
for child in node.children:
child_text = DOMTreeSerializer.serialize_tree(child, include_attributes, next_depth)
if child_text:
formatted_text.append(child_text)
# Process children (for non-shadow elements)
if node.original_node.node_type != NodeType.DOCUMENT_FRAGMENT_NODE:
for child in node.children:
child_text = DOMTreeSerializer.serialize_tree(child, include_attributes, next_depth)
if child_text:
formatted_text.append(child_text)
return '\n'.join(formatted_text)

View File

@@ -19,6 +19,8 @@ DEFAULT_INCLUDE_ATTRIBUTES = [
'title',
'type',
'checked',
# 'class',
'id',
'name',
'role',
'value',
@@ -51,6 +53,51 @@ DEFAULT_INCLUDE_ATTRIBUTES = [
'ax_name',
]
# Attribute names that are kept when building the attributes string used for
# an element's hash (see the `k in STATIC_ATTRIBUTES` filter in the hash
# computation); attributes not listed here are left out of that string.
# NOTE(review): presumably these are the attributes expected to stay stable
# across re-renders of the same element - confirm with the author.
STATIC_ATTRIBUTES = {
    'class',
    'id',
    'name',
    'type',
    'placeholder',
    'aria-label',
    'title',
    # 'aria-expanded',
    'role',
    'data-testid',
    'data-test',
    'data-cy',
    'data-selenium',
    'for',
    'required',
    'disabled',
    'readonly',
    'checked',
    'selected',
    'multiple',
    'href',
    'target',
    'rel',
    'aria-describedby',
    'aria-labelledby',
    'aria-controls',
    'aria-owns',
    'aria-live',
    'aria-atomic',
    'aria-busy',
    'aria-disabled',
    'aria-hidden',
    'aria-pressed',
    'aria-checked',
    'aria-selected',
    'tabindex',
    'alt',
    'src',
    'lang',
    'itemscope',
    'itemtype',
    'itemprop',
}
@dataclass
class CurrentPageTargets:
@@ -93,6 +140,7 @@ class SimplifiedNode:
ignored_by_paint_order: bool = False # More info in dom/serializer/paint_order.py
excluded_by_parent: bool = False # New field for bbox filtering
is_shadow_host: bool = False # New field for shadow DOM hosts
def _clean_original_node_json(self, node_json: dict) -> dict:
"""Recursively remove children_nodes and shadow_roots from original_node JSON."""
@@ -683,8 +731,9 @@ class EnhancedDOMTreeNode:
parent_branch_path = self._get_parent_branch_path()
parent_branch_path_string = '/'.join(parent_branch_path)
# Get attributes hash
attributes_string = ''.join(f'{key}={value}' for key, value in self.attributes.items())
attributes_string = ''.join(
f'{k}={v}' for k, v in sorted((k, v) for k, v in self.attributes.items() if k in STATIC_ATTRIBUTES)
)
# Combine both for final hash
combined_string = f'{parent_branch_path_string}|{attributes_string}'

View File

@@ -113,14 +113,14 @@ class CloudSync:
f'Failed to send sync event: POST {response.request.url} {response.status_code} - {response.text}'
)
except httpx.TimeoutException:
logger.warning(f'Event send timed out after 10 seconds: {event}')
logger.debug(f'Event send timed out after 10 seconds: {event}')
except httpx.ConnectError as e:
# logger.warning(f'⚠️ Failed to connect to cloud service at {self.base_url}: {e}')
pass
except httpx.HTTPError as e:
logger.warning(f'HTTP error sending event {event}: {type(e).__name__}: {e}')
logger.debug(f'HTTP error sending event {event}: {type(e).__name__}: {e}')
except Exception as e:
logger.warning(f'Unexpected error sending event {event}: {type(e).__name__}: {e}')
logger.debug(f'Unexpected error sending event {event}: {type(e).__name__}: {e}')
async def _background_auth(self, agent_session_id: str) -> None:
"""Run authentication in background or show cloud URL if already authenticated"""

View File

@@ -293,7 +293,7 @@ class Tools(Generic[Context]):
await event
# Wait for handler to complete and get any exception or metadata
click_metadata = await event.event_result(raise_if_any=True, raise_if_none=False)
memory = f'Clicked element with index {params.index}'
memory = 'Clicked element'
if params.while_holding_ctrl:
memory += ' and opened in new tab'

View File

@@ -24,43 +24,17 @@ from browser_use.llm.openai.chat import ChatOpenAI
async def main():
# Example task to demonstrate history saving and rerunning
task = 'Go to GitHub and find the browser-use repository'
history_file = Path('agent_history.json')
task = 'Go to https://browser-use.github.io/stress-tests/challenges/ember-form.html and fill the form with example data.'
llm = ChatOpenAI(model='gpt-4.1-mini')
# Step 1: Run agent and save history
print('🚀 Running agent and saving history...')
agent = Agent(
task=task,
llm=llm,
)
# Run the agent
history = await agent.run(max_steps=5)
# Save the history for later rerun
agent = Agent(task=task, llm=llm, max_actions_per_step=1)
await agent.run(max_steps=5)
agent.save_history(history_file)
print(f'✅ History saved to {history_file}')
print(f'📊 Completed {len(history.history)} steps')
rerun_agent = Agent(task='', llm=llm)
# Step 2: Load and rerun the history
print('\n🔄 Loading and rerunning history...')
# Create new agent for rerunning (task can be empty since we're replaying)
rerun_agent = Agent(
task='',
llm=llm,
)
# Load and rerun the saved history
results = await rerun_agent.load_and_rerun(
history_file=history_file,
max_retries=3, # Retry failed actions up to 3 times
skip_failures=True, # Continue even if some actions fail
delay_between_actions=1.0, # Wait 1 second between actions
)
await rerun_agent.load_and_rerun(history_file)
if __name__ == '__main__':

View File

@@ -188,9 +188,7 @@ class TestClickElementEvent:
result_text = result.extracted_content or result.long_term_memory
# Core logic validation: Verify click was successful
assert result_text is not None
assert f'Clicked element with index {button_index}' in result_text, (
f'Expected click confirmation in result content, got: {result_text}'
)
assert 'Clicked element' in result_text, f'Expected click confirmation in result content, got: {result_text}'
# Note: The click action result includes neither the button text nor the element index
# Verify the click actually had an effect on the page using CDP
@@ -262,9 +260,7 @@ class TestClickElementEvent:
assert isinstance(result, ActionResult)
result_text = result.extracted_content or result.long_term_memory
assert result_text is not None
assert f'Clicked element with index {link_index}' in result_text, (
f'Expected click confirmation in result content, got: {result_text}'
)
assert 'Clicked element' in result_text, f'Expected click confirmation in result content, got: {result_text}'
# Verify that a new tab was opened
tabs = await browser_session.get_tabs()