diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py index 33a545fb2..8c7aa7348 100644 --- a/browser_use/agent/prompts.py +++ b/browser_use/agent/prompts.py @@ -2,6 +2,7 @@ import importlib.resources from datetime import datetime from typing import TYPE_CHECKING, Literal, Optional +from browser_use.dom.views import NodeType, SimplifiedNode from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL, SystemMessage, UserMessage from browser_use.observability import observe_debug from browser_use.utils import is_new_tab_page @@ -112,8 +113,93 @@ class AgentMessagePrompt: self.sample_images = sample_images or [] assert self.browser_state + def _extract_page_statistics(self) -> dict[str, int]: + """Extract high-level page statistics from DOM tree for LLM context""" + stats = { + 'links': 0, + 'iframes': 0, + 'shadow_open': 0, + 'shadow_closed': 0, + 'scroll_containers': 0, + 'images': 0, + 'interactive_elements': 0, + 'total_elements': 0, + } + + if not self.browser_state.dom_state or not self.browser_state.dom_state._root: + return stats + + def traverse_node(node: SimplifiedNode) -> None: + """Recursively traverse simplified DOM tree to count elements""" + if not node or not node.original_node: + return + + original = node.original_node + stats['total_elements'] += 1 + + # Count by node type and tag + if original.node_type == NodeType.ELEMENT_NODE: + tag = original.tag_name.lower() if original.tag_name else '' + + if tag == 'a': + stats['links'] += 1 + elif tag in ('iframe', 'frame'): + stats['iframes'] += 1 + elif tag == 'img': + stats['images'] += 1 + + # Check if scrollable + if original.is_actually_scrollable: + stats['scroll_containers'] += 1 + + # Check if interactive + if node.interactive_index is not None: + stats['interactive_elements'] += 1 + + # Check if this element hosts shadow DOM + if node.is_shadow_host: + # Check if any shadow children are closed + has_closed_shadow = any( + child.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE + and child.original_node.shadow_root_type + and child.original_node.shadow_root_type.lower() == 'closed' + for child in node.children + ) + if has_closed_shadow: + stats['shadow_closed'] += 1 + else: + stats['shadow_open'] += 1 + + elif original.node_type == NodeType.DOCUMENT_FRAGMENT_NODE: + # Shadow DOM fragment - these are the actual shadow roots + # But don't double-count since we count them at the host level above + pass + + # Traverse children + for child in node.children: + traverse_node(child) + + traverse_node(self.browser_state.dom_state._root) + return stats + @observe_debug(ignore_input=True, ignore_output=True, name='_get_browser_state_description') def _get_browser_state_description(self) -> str: + # Extract page statistics first + page_stats = self._extract_page_statistics() + + # Format statistics for LLM + stats_text = '' + if page_stats['total_elements'] < 10: + stats_text += 'Page appears empty (SPA not loaded?) - ' + stats_text += f'{page_stats["links"]} links, {page_stats["interactive_elements"]} interactive, ' + stats_text += f'{page_stats["iframes"]} iframes, {page_stats["scroll_containers"]} scroll containers' + if page_stats['shadow_open'] > 0 or page_stats['shadow_closed'] > 0: + stats_text += f', {page_stats["shadow_open"]} shadow(open), {page_stats["shadow_closed"]} shadow(closed)' + if page_stats['images'] > 0: + stats_text += f', {page_stats["images"]} images' + stats_text += f', {page_stats["total_elements"]} total elements' + stats_text += '\n\n' + elements_text = self.browser_state.dom_state.llm_representation(include_attributes=self.include_attributes) if len(elements_text) > self.max_clickable_elements_length: @@ -122,9 +208,8 @@ class AgentMessagePrompt: else: truncated_text = '' - has_content_above = (self.browser_state.pixels_above or 0) > 0 - has_content_below = (self.browser_state.pixels_below or 0) > 0 - + has_content_above = False + has_content_below = False # Enhanced page information for the model page_info_text = '' if self.browser_state.page_info: @@ -132,10 +217,11 @@ class AgentMessagePrompt: # Compute page statistics dynamically pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0 pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0 + has_content_above = pages_above > 0 + has_content_below = pages_below > 0 total_pages = pi.page_height / pi.viewport_height if pi.viewport_height > 0 else 0 current_page_position = pi.scroll_y / max(pi.page_height - pi.viewport_height, 1) page_info_text = '' - page_info_text += f'Viewport size: {pi.viewport_width}x{pi.viewport_height}px, Total page size: {pi.page_width}x{pi.page_height}px, ' page_info_text += f'{pages_above:.1f} pages above, ' page_info_text += f'{pages_below:.1f} pages below, ' page_info_text += f'{total_pages:.1f} total pages' @@ -146,18 +232,14 @@ class AgentMessagePrompt: if self.browser_state.page_info: pi = self.browser_state.page_info pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0 - elements_text = f'... {self.browser_state.pixels_above} pixels above ({pages_above:.1f} pages) - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}' - else: - elements_text = f'... {self.browser_state.pixels_above} pixels above - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}' + elements_text = f'... {pages_above:.1f} pages above - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}' else: elements_text = f'[Start of page]\n{elements_text}' if has_content_below: if self.browser_state.page_info: pi = self.browser_state.page_info pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0 - elements_text = f'{elements_text}\n... {self.browser_state.pixels_below} pixels below ({pages_below:.1f} pages) - scroll to see more or extract structured data if you are looking for specific information ...' - else: - elements_text = f'{elements_text}\n... {self.browser_state.pixels_below} pixels below - scroll to see more or extract structured data if you are looking for specific information ...' + elements_text = f'{elements_text}\n... {pages_below:.1f} pages below - scroll to see more or extract structured data if you are looking for specific information ...' else: elements_text = f'{elements_text}\n[End of page]' else: @@ -190,7 +272,7 @@ class AgentMessagePrompt: if self.include_recent_events and self.browser_state.recent_events: recent_events_text = f'Recent browser events: {self.browser_state.recent_events}\n' - browser_state = f"""{current_tab_text} + browser_state = f"""{stats_text}{current_tab_text} Available tabs: {tabs_text} {page_info_text} @@ -205,9 +287,6 @@ Available tabs: else: step_info_description = '' - time_str = datetime.now().strftime('%Y-%m-%d %H:%M') - step_info_description += f'Current date and time: {time_str}' - time_str = datetime.now().strftime('%Y-%m-%d') step_info_description += f'Current date: {time_str}' diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index dc7fcd241..02d252ced 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -71,7 +71,7 @@ class DefaultActionWatchdog(BaseWatchdog): msg = f'Downloaded file to {download_path}' self.logger.info(f'šŸ’¾ {msg}') else: - msg = f'Clicked button with index {index_for_logging}: {element_node.get_all_children_text(max_depth=2)}' + msg = f'Clicked button {element_node.node_name}: {element_node.get_all_children_text(max_depth=2)}' self.logger.debug(f'šŸ–±ļø {msg}') self.logger.debug(f'Element xpath: {element_node.xpath}') @@ -1912,7 +1912,7 @@ class DefaultActionWatchdog(BaseWatchdog): self.logger.error(msg) raise BrowserError(message=msg, long_term_memory=msg) except Exception as e: - msg = f'Failed to get dropdown options for element with index {index_for_logging}' + msg = 'Failed to get dropdown options' error_msg = f'{msg}: {str(e)}' self.logger.error(error_msg) raise BrowserError( diff --git a/browser_use/dom/serializer/serializer.py b/browser_use/dom/serializer/serializer.py index 436faf20e..1b199965d 100644 --- a/browser_use/dom/serializer/serializer.py +++ b/browser_use/dom/serializer/serializer.py @@ -137,13 +137,16 @@ class DOMTreeSerializer: return None if node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE: - # Super simple pass-through for shadow DOM elements + # ENHANCED shadow DOM processing - always include shadow content simplified = SimplifiedNode(original_node=node, children=[]) for child in node.children_and_shadow_roots: simplified_child = self._create_simplified_tree(child, depth + 1) if simplified_child: simplified.children.append(simplified_child) - return simplified + + # Always return shadow DOM fragments, even if children seem empty + # Shadow DOM often contains the actual interactive content in SPAs + return simplified if simplified.children else SimplifiedNode(original_node=node, children=[]) elif node.node_type == NodeType.ELEMENT_NODE: # Skip non-content elements @@ -161,19 +164,26 @@ class DOMTreeSerializer: is_visible = node.is_visible is_scrollable = node.is_actually_scrollable + has_shadow_content = bool(node.children_and_shadow_roots) - # Include if interactive (regardless of visibility), or scrollable, or has children to process + # ENHANCED SHADOW DOM DETECTION: Include shadow hosts even if not visible + is_shadow_host = any(child.node_type == NodeType.DOCUMENT_FRAGMENT_NODE for child in node.children_and_shadow_roots) - if is_visible or is_scrollable or bool(node.children_and_shadow_roots): - simplified = SimplifiedNode(original_node=node, children=[]) - # simplified._analysis = analysis # Store analysis for grouping + # Include if interactive (regardless of visibility), scrollable, has children, or is shadow host + if is_visible or is_scrollable or has_shadow_content or is_shadow_host: + simplified = SimplifiedNode(original_node=node, children=[], is_shadow_host=is_shadow_host) - # Process children + # Process ALL children including shadow roots with enhanced logging for child in node.children_and_shadow_roots: simplified_child = self._create_simplified_tree(child, depth + 1) if simplified_child: simplified.children.append(simplified_child) + # SHADOW DOM SPECIAL CASE: Always include shadow hosts even if not visible + # Many SPA frameworks (React, Vue) render content in shadow DOM + if is_shadow_host and simplified.children: + return simplified + # Return if meaningful or has meaningful children if is_visible or is_scrollable or simplified.children: return simplified @@ -449,23 +459,34 @@ class DOMTreeSerializer: # Build attributes string attributes_html_str = DOMTreeSerializer._build_attributes_string(node.original_node, include_attributes, '') - # Build the line + # Build the line with shadow host indicator + shadow_prefix = '' + if node.is_shadow_host: + # Check if any shadow children are closed + has_closed_shadow = any( + child.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE + and child.original_node.shadow_root_type + and child.original_node.shadow_root_type.lower() == 'closed' + for child in node.children + ) + shadow_prefix = '|SHADOW(closed)|' if has_closed_shadow else '|SHADOW(open)|' + if should_show_scroll and node.interactive_index is None: # Scrollable container but not clickable - line = f'{depth_str}|SCROLL|<{node.original_node.tag_name}' + line = f'{depth_str}{shadow_prefix}|SCROLL|<{node.original_node.tag_name}' elif node.interactive_index is not None: # Clickable (and possibly scrollable) new_prefix = '*' if node.is_new else '' scroll_prefix = '|SCROLL+' if should_show_scroll else '[' - line = f'{depth_str}{new_prefix}{scroll_prefix}{node.interactive_index}]<{node.original_node.tag_name}' + line = f'{depth_str}{shadow_prefix}{new_prefix}{scroll_prefix}{node.interactive_index}]<{node.original_node.tag_name}' elif node.original_node.tag_name.upper() == 'IFRAME': # Iframe element (not interactive) - line = f'{depth_str}|IFRAME|<{node.original_node.tag_name}' + line = f'{depth_str}{shadow_prefix}|IFRAME|<{node.original_node.tag_name}' elif node.original_node.tag_name.upper() == 'FRAME': # Frame element (not interactive) - line = f'{depth_str}|FRAME|<{node.original_node.tag_name}' + line = f'{depth_str}{shadow_prefix}|FRAME|<{node.original_node.tag_name}' else: - line = f'{depth_str}<{node.original_node.tag_name}' + line = f'{depth_str}{shadow_prefix}<{node.original_node.tag_name}' if attributes_html_str: line += f' {attributes_html_str}' @@ -480,6 +501,25 @@ class DOMTreeSerializer: formatted_text.append(line) + elif node.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE: + # Shadow DOM representation - show clearly to LLM + if node.original_node.shadow_root_type and node.original_node.shadow_root_type.lower() == 'closed': + formatted_text.append(f'{depth_str}ā–¼ Shadow Content (Closed)') + else: + formatted_text.append(f'{depth_str}ā–¼ Shadow Content (Open)') + + next_depth += 1 + + # Process shadow DOM children + for child in node.children: + child_text = DOMTreeSerializer.serialize_tree(child, include_attributes, next_depth) + if child_text: + formatted_text.append(child_text) + + # Close shadow DOM indicator + if node.children: # Only show close if we had content + formatted_text.append(f'{depth_str}ā–² Shadow Content End') + elif node.original_node.node_type == NodeType.TEXT_NODE: # Include visible text is_visible = node.original_node.snapshot_node and node.original_node.is_visible @@ -492,11 +532,12 @@ class DOMTreeSerializer: clean_text = node.original_node.node_value.strip() formatted_text.append(f'{depth_str}{clean_text}') - # Process children - for child in node.children: - child_text = DOMTreeSerializer.serialize_tree(child, include_attributes, next_depth) - if child_text: - formatted_text.append(child_text) + # Process children (for non-shadow elements) + if node.original_node.node_type != NodeType.DOCUMENT_FRAGMENT_NODE: + for child in node.children: + child_text = DOMTreeSerializer.serialize_tree(child, include_attributes, next_depth) + if child_text: + formatted_text.append(child_text) return '\n'.join(formatted_text) diff --git a/browser_use/dom/views.py b/browser_use/dom/views.py index cbcaadaa1..061b070a8 100644 --- a/browser_use/dom/views.py +++ b/browser_use/dom/views.py @@ -19,6 +19,8 @@ DEFAULT_INCLUDE_ATTRIBUTES = [ 'title', 'type', 'checked', + # 'class', + 'id', 'name', 'role', 'value', @@ -51,6 +53,51 @@ DEFAULT_INCLUDE_ATTRIBUTES = [ 'ax_name', ] +STATIC_ATTRIBUTES = { + 'class', + 'id', + 'name', + 'type', + 'placeholder', + 'aria-label', + 'title', + # 'aria-expanded', + 'role', + 'data-testid', + 'data-test', + 'data-cy', + 'data-selenium', + 'for', + 'required', + 'disabled', + 'readonly', + 'checked', + 'selected', + 'multiple', + 'href', + 'target', + 'rel', + 'aria-describedby', + 'aria-labelledby', + 'aria-controls', + 'aria-owns', + 'aria-live', + 'aria-atomic', + 'aria-busy', + 'aria-disabled', + 'aria-hidden', + 'aria-pressed', + 'aria-checked', + 'aria-selected', + 'tabindex', + 'alt', + 'src', + 'lang', + 'itemscope', + 'itemtype', + 'itemprop', +} + @dataclass class CurrentPageTargets: @@ -93,6 +140,7 @@ class SimplifiedNode: ignored_by_paint_order: bool = False # More info in dom/serializer/paint_order.py excluded_by_parent: bool = False # New field for bbox filtering + is_shadow_host: bool = False # New field for shadow DOM hosts def _clean_original_node_json(self, node_json: dict) -> dict: """Recursively remove children_nodes and shadow_roots from original_node JSON.""" @@ -683,8 +731,9 @@ class EnhancedDOMTreeNode: parent_branch_path = self._get_parent_branch_path() parent_branch_path_string = '/'.join(parent_branch_path) - # Get attributes hash - attributes_string = ''.join(f'{key}={value}' for key, value in self.attributes.items()) + attributes_string = ''.join( + f'{k}={v}' for k, v in sorted((k, v) for k, v in self.attributes.items() if k in STATIC_ATTRIBUTES) + ) # Combine both for final hash combined_string = f'{parent_branch_path_string}|{attributes_string}' diff --git a/browser_use/sync/service.py b/browser_use/sync/service.py index b4eb24872..f046d2831 100644 --- a/browser_use/sync/service.py +++ b/browser_use/sync/service.py @@ -113,14 +113,14 @@ class CloudSync: f'Failed to send sync event: POST {response.request.url} {response.status_code} - {response.text}' ) except httpx.TimeoutException: - logger.warning(f'Event send timed out after 10 seconds: {event}') + logger.debug(f'Event send timed out after 10 seconds: {event}') except httpx.ConnectError as e: # logger.warning(f'āš ļø Failed to connect to cloud service at {self.base_url}: {e}') pass except httpx.HTTPError as e: - logger.warning(f'HTTP error sending event {event}: {type(e).__name__}: {e}') + logger.debug(f'HTTP error sending event {event}: {type(e).__name__}: {e}') except Exception as e: - logger.warning(f'Unexpected error sending event {event}: {type(e).__name__}: {e}') + logger.debug(f'Unexpected error sending event {event}: {type(e).__name__}: {e}') async def _background_auth(self, agent_session_id: str) -> None: """Run authentication in background or show cloud URL if already authenticated""" diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 19c37cca2..9caf2d7b3 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -293,7 +293,7 @@ class Tools(Generic[Context]): await event # Wait for handler to complete and get any exception or metadata click_metadata = await event.event_result(raise_if_any=True, raise_if_none=False) - memory = f'Clicked element with index {params.index}' + memory = 'Clicked element' if params.while_holding_ctrl: memory += ' and opened in new tab' diff --git a/examples/features/rerun_history.py b/examples/features/rerun_history.py index dac51b4c1..3696d6fcd 100644 --- a/examples/features/rerun_history.py +++ b/examples/features/rerun_history.py @@ -24,43 +24,17 @@ from browser_use.llm.openai.chat import ChatOpenAI async def main(): # Example task to demonstrate history saving and rerunning - task = 'Go to GitHub and find the browser-use repository' history_file = Path('agent_history.json') + task = 'Go to https://browser-use.github.io/stress-tests/challenges/ember-form.html and fill the form with example data.' llm = ChatOpenAI(model='gpt-4.1-mini') - # Step 1: Run agent and save history - print('šŸš€ Running agent and saving history...') - - agent = Agent( - task=task, - llm=llm, - ) - - # Run the agent - history = await agent.run(max_steps=5) - - # Save the history for later rerun + agent = Agent(task=task, llm=llm, max_actions_per_step=1) + await agent.run(max_steps=5) agent.save_history(history_file) - print(f'āœ… History saved to {history_file}') - print(f'šŸ“Š Completed {len(history.history)} steps') + rerun_agent = Agent(task='', llm=llm) - # Step 2: Load and rerun the history - print('\nšŸ”„ Loading and rerunning history...') - - # Create new agent for rerunning (task can be empty since we're replaying) - rerun_agent = Agent( - task='', - llm=llm, - ) - - # Load and rerun the saved history - results = await rerun_agent.load_and_rerun( - history_file=history_file, - max_retries=3, # Retry failed actions up to 3 times - skip_failures=True, # Continue even if some actions fail - delay_between_actions=1.0, # Wait 1 second between actions - ) + await rerun_agent.load_and_rerun(history_file) if __name__ == '__main__': diff --git a/tests/ci/test_browser_event_ClickElementEvent.py b/tests/ci/test_browser_event_ClickElementEvent.py index 08265ac98..6a8b62684 100644 --- a/tests/ci/test_browser_event_ClickElementEvent.py +++ b/tests/ci/test_browser_event_ClickElementEvent.py @@ -188,9 +188,7 @@ class TestClickElementEvent: result_text = result.extracted_content or result.long_term_memory # Core logic validation: Verify click was successful assert result_text is not None - assert f'Clicked element with index {button_index}' in result_text, ( - f'Expected click confirmation in result content, got: {result_text}' - ) + assert 'Clicked element' in result_text, f'Expected click confirmation in result content, got: {result_text}' # Note: The click action doesn't include button text in the result, only the index # Verify the click actually had an effect on the page using CDP @@ -262,9 +260,7 @@ class TestClickElementEvent: assert isinstance(result, ActionResult) result_text = result.extracted_content or result.long_term_memory assert result_text is not None - assert f'Clicked element with index {link_index}' in result_text, ( - f'Expected click confirmation in result content, got: {result_text}' - ) + assert 'Clicked element' in result_text, f'Expected click confirmation in result content, got: {result_text}' # Verify that a new tab was opened tabs = await browser_session.get_tabs()