From 6ad7e708d8e6b319d0dedfeba28a14c050947b24 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 17 Aug 2025 22:39:32 +0000 Subject: [PATCH 01/69] Add Python-based screenshot highlighting with element bounding boxes Co-authored-by: mamagnus00 --- browser_use/browser/dom_watchdog.py | 118 +++++------ browser_use/browser/python_highlights.py | 245 +++++++++++++++++++++++ 2 files changed, 306 insertions(+), 57 deletions(-) create mode 100644 browser_use/browser/python_highlights.py diff --git a/browser_use/browser/dom_watchdog.py b/browser_use/browser/dom_watchdog.py index dc0992715..82d5144a0 100644 --- a/browser_use/browser/dom_watchdog.py +++ b/browser_use/browser/dom_watchdog.py @@ -232,65 +232,69 @@ class DOMWatchdog(BaseWatchdog): recent_events=self._get_recent_events_str() if event.include_recent_events else None, ) - # Normal path: Build DOM tree if requested - if event.include_dom: - self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🌳 Building DOM tree...') - - # Build the DOM directly using the internal method - previous_state = ( - self.browser_session._cached_browser_state_summary.dom_state - if self.browser_session._cached_browser_state_summary - else None - ) - - try: - # Call the DOM building method directly - self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: Starting _build_dom_tree...') - content = await self._build_dom_tree(previous_state) - self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ _build_dom_tree completed') - except Exception as e: - self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: DOM build failed: {e}, using minimal state') - content = SerializedDOMState(_root=None, selector_map={}) - - if not content: - # Fallback to minimal DOM state - self.logger.warning('DOM build returned no content, using minimal state') - content = SerializedDOMState(_root=None, selector_map={}) - else: - # Skip DOM building if not requested + # Execute DOM building and screenshot capture in parallel + dom_task = None + screenshot_task = None + + # Start DOM building task if requested + if event.include_dom: + self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🌳 Starting DOM tree build task...') + + previous_state = ( + self.browser_session._cached_browser_state_summary.dom_state + if self.browser_session._cached_browser_state_summary + else None + ) + + dom_task = asyncio.create_task(self._build_dom_tree_without_highlights(previous_state)) + + # Start clean screenshot task if requested (without JS highlights) + if event.include_screenshot: + self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 📸 Starting clean screenshot task...') + screenshot_task = asyncio.create_task(self._capture_clean_screenshot()) + + # Wait for both tasks to complete + content = None + screenshot_b64 = None + + if dom_task: + try: + content = await dom_task + self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ DOM tree build completed') + except Exception as e: + self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: DOM build failed: {e}, using minimal state') content = SerializedDOMState(_root=None, selector_map={}) - - # re-focus top-level page session context - assert self.browser_session.agent_focus is not None, 'No current target ID' - await self.browser_session.get_or_create_cdp_session(target_id=self.browser_session.agent_focus.target_id, focus=True) - - # Get screenshot if requested - screenshot_b64 = None - if event.include_screenshot: - self.logger.debug( - f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: 📸 DOM watchdog requesting screenshot, include_screenshot={event.include_screenshot}' + else: + content = SerializedDOMState(_root=None, selector_map={}) + + if screenshot_task: + try: + screenshot_b64 = await screenshot_task + self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Clean screenshot captured') + except Exception as e: + self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Clean screenshot failed: {e}') + screenshot_b64 = None + + # Apply Python-based highlighting if both DOM and screenshot are available + if (screenshot_b64 and content and content.selector_map and + self.browser_session.browser_profile.highlight_elements): + try: + self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🎨 Applying Python-based highlighting...') + from browser_use.browser.python_highlights import create_highlighted_screenshot_async + + # Get CDP session for viewport info + cdp_session = await self.browser_session.get_or_create_cdp_session() + + screenshot_b64 = await create_highlighted_screenshot_async( + screenshot_b64, content.selector_map, cdp_session ) - try: - # Check if handler is registered - handlers = self.event_bus.handlers.get('ScreenshotEvent', []) - handler_names = [getattr(h, '__name__', str(h)) for h in handlers] - self.logger.debug(f'📸 ScreenshotEvent handlers registered: {len(handlers)} - {handler_names}') - - screenshot_event = self.event_bus.dispatch(ScreenshotEvent(full_page=False)) - self.logger.debug('📸 Dispatched ScreenshotEvent, waiting for event to complete...') - - # Wait for the event itself to complete (this waits for all handlers) - await screenshot_event - - # Get the single handler result - screenshot_b64 = await screenshot_event.event_result(raise_if_any=True, raise_if_none=True) - except TimeoutError: - self.logger.warning('📸 Screenshot timed out after 6 seconds - no handler registered or slow page?') - - except Exception as e: - self.logger.warning(f'📸 Screenshot failed: {type(e).__name__}: {e}') - else: - self.logger.debug(f'📸 Skipping screenshot, include_screenshot={event.include_screenshot}') + self.logger.debug(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Applied highlights to {len(content.selector_map)} elements') + except Exception as e: + self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Python highlighting failed: {e}') + + # Ensure we have valid content + if not content: + content = SerializedDOMState(_root=None, selector_map={}) # Tabs info already fetched at the beginning diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py new file mode 100644 index 000000000..357532c66 --- /dev/null +++ b/browser_use/browser/python_highlights.py @@ -0,0 +1,245 @@ +"""Python-based highlighting system for drawing bounding boxes on screenshots. + +This module replaces JavaScript-based highlighting with fast Python image processing +to draw bounding boxes around interactive elements directly on screenshots. +""" + +import base64 +import io +from typing import Dict, Any, Optional, Tuple +from PIL import Image, ImageDraw, ImageFont +import logging + +from browser_use.dom.serializer.types import DOMSelectorMap + +logger = logging.getLogger(__name__) + +# Color scheme for different element types +ELEMENT_COLORS = { + 'button': '#FF6B6B', # Red for buttons + 'input': '#4ECDC4', # Teal for inputs + 'select': '#45B7D1', # Blue for dropdowns + 'a': '#96CEB4', # Green for links + 'textarea': '#FFEAA7', # Yellow for text areas + 'default': '#DDA0DD', # Light purple for other interactive elements +} + +# Element type mappings +ELEMENT_TYPE_MAP = { + 'button': 'button', + 'input': 'input', + 'select': 'select', + 'a': 'a', + 'textarea': 'textarea', +} + +def get_element_color(tag_name: str, element_type: Optional[str] = None) -> str: + """Get color for element based on tag name and type.""" + # Check input type first + if tag_name == 'input' and element_type: + if element_type in ['button', 'submit']: + return ELEMENT_COLORS['button'] + + # Use tag-based color + return ELEMENT_COLORS.get(tag_name.lower(), ELEMENT_COLORS['default']) + +def should_show_text_overlay(text: Optional[str]) -> bool: + """Determine if text overlay should be shown based on length.""" + if not text: + return False + return len(text.strip()) <= 10 + +def draw_bounding_box_with_text( + draw: ImageDraw.Draw, + bbox: Tuple[int, int, int, int], + color: str, + text: Optional[str] = None, + font: Optional[ImageFont.FreeTypeFont] = None +) -> None: + """Draw a bounding box with optional text overlay.""" + x1, y1, x2, y2 = bbox + + # Draw bounding box with 2px width + for i in range(2): + draw.rectangle([x1 + i, y1 + i, x2 - i, y2 - i], outline=color, fill=None) + + # Draw text overlay if provided and short enough + if text and should_show_text_overlay(text): + try: + # Get text size + if font: + bbox_text = draw.textbbox((0, 0), text, font=font) + text_width = bbox_text[2] - bbox_text[0] + text_height = bbox_text[3] - bbox_text[1] + else: + # Fallback for default font + bbox_text = draw.textbbox((0, 0), text) + text_width = bbox_text[2] - bbox_text[0] + text_height = bbox_text[3] - bbox_text[1] + + # Position text at top-left of bounding box + text_x = max(0, x1) + text_y = max(0, y1 - text_height - 2) # Above the box + + # Draw background rectangle for text + draw.rectangle( + [text_x - 2, text_y - 2, text_x + text_width + 2, text_y + text_height + 2], + fill=color, + outline=None + ) + + # Draw text + draw.text((text_x, text_y), text, fill='white', font=font) + + except Exception as e: + logger.debug(f"Failed to draw text overlay: {e}") + +def create_highlighted_screenshot( + screenshot_b64: str, + selector_map: DOMSelectorMap, + device_pixel_ratio: float = 1.0, + viewport_offset_x: int = 0, + viewport_offset_y: int = 0 +) -> str: + """Create a highlighted screenshot with bounding boxes around interactive elements. + + Args: + screenshot_b64: Base64 encoded screenshot + selector_map: Map of interactive elements with their positions + device_pixel_ratio: Device pixel ratio for scaling coordinates + viewport_offset_x: X offset for viewport positioning + viewport_offset_y: Y offset for viewport positioning + + Returns: + Base64 encoded highlighted screenshot + """ + try: + # Decode screenshot + screenshot_data = base64.b64decode(screenshot_b64) + image = Image.open(io.BytesIO(screenshot_data)).convert('RGBA') + + # Create drawing context + draw = ImageDraw.Draw(image) + + # Try to load a font, fall back to default if not available + font = None + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 12) + except: + try: + font = ImageFont.truetype("arial.ttf", 12) + except: + font = None # Use default font + + # Process each interactive element + for element_id, element in selector_map.items(): + try: + # Get element bounds + bounds = element.bounds + if not bounds: + continue + + # Scale coordinates by device pixel ratio and apply viewport offset + x1 = int((bounds.x + viewport_offset_x) * device_pixel_ratio) + y1 = int((bounds.y + viewport_offset_y) * device_pixel_ratio) + x2 = int((bounds.x + bounds.width + viewport_offset_x) * device_pixel_ratio) + y2 = int((bounds.y + bounds.height + viewport_offset_y) * device_pixel_ratio) + + # Ensure coordinates are within image bounds + img_width, img_height = image.size + x1 = max(0, min(x1, img_width)) + y1 = max(0, min(y1, img_height)) + x2 = max(x1, min(x2, img_width)) + y2 = max(y1, min(y2, img_height)) + + # Skip if bounding box is too small or invalid + if x2 - x1 < 2 or y2 - y1 < 2: + continue + + # Get element color based on type + tag_name = element.tag_name if hasattr(element, 'tag_name') else 'div' + element_type = None + if hasattr(element, 'attributes') and element.attributes: + element_type = element.attributes.get('type') + + color = get_element_color(tag_name, element_type) + + # Get text for overlay (if short enough) + text = None + if hasattr(element, 'text') and element.text: + text = element.text.strip() + elif hasattr(element, 'attributes') and element.attributes: + # Try to get meaningful text from attributes + text = (element.attributes.get('aria-label') or + element.attributes.get('title') or + element.attributes.get('placeholder') or + element.attributes.get('value', '')) + + # Draw bounding box with optional text + draw_bounding_box_with_text( + draw, (x1, y1, x2, y2), color, text, font + ) + + except Exception as e: + logger.debug(f"Failed to draw highlight for element {element_id}: {e}") + continue + + # Convert back to base64 + output_buffer = io.BytesIO() + image.save(output_buffer, format='PNG') + output_buffer.seek(0) + + highlighted_b64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8') + + logger.debug(f"Successfully created highlighted screenshot with {len(selector_map)} elements") + return highlighted_b64 + + except Exception as e: + logger.error(f"Failed to create highlighted screenshot: {e}") + # Return original screenshot on error + return screenshot_b64 + +def get_viewport_info_from_cdp(cdp_session) -> Tuple[float, int, int]: + """Get viewport information from CDP session. + + Returns: + Tuple of (device_pixel_ratio, viewport_offset_x, viewport_offset_y) + """ + # This is a placeholder - in real implementation, you'd get this from CDP + # For now, return sensible defaults + return 1.0, 0, 0 + +async def create_highlighted_screenshot_async( + screenshot_b64: str, + selector_map: DOMSelectorMap, + cdp_session = None +) -> str: + """Async wrapper for creating highlighted screenshots. + + Args: + screenshot_b64: Base64 encoded screenshot + selector_map: Map of interactive elements + cdp_session: CDP session for getting viewport info + + Returns: + Base64 encoded highlighted screenshot + """ + # Get viewport information if CDP session is available + device_pixel_ratio = 1.0 + viewport_offset_x = 0 + viewport_offset_y = 0 + + if cdp_session: + try: + device_pixel_ratio, viewport_offset_x, viewport_offset_y = get_viewport_info_from_cdp(cdp_session) + except Exception as e: + logger.debug(f"Failed to get viewport info from CDP: {e}") + + # Create highlighted screenshot (run in thread pool if needed for performance) + return create_highlighted_screenshot( + screenshot_b64, + selector_map, + device_pixel_ratio, + viewport_offset_x, + viewport_offset_y + ) \ No newline at end of file From 1375013df791f0c8897a08160ff1e7ebeb95ea7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 17 Aug 2025 16:14:50 -0700 Subject: [PATCH 02/69] BBox on entire page works --- browser_use/browser/dom_watchdog.py | 204 +++++++---- browser_use/browser/python_highlights.py | 447 ++++++++++++----------- browser_use/browser/session.py | 3 +- 3 files changed, 378 insertions(+), 276 deletions(-) diff --git a/browser_use/browser/dom_watchdog.py b/browser_use/browser/dom_watchdog.py index 82d5144a0..a57e46ed6 100644 --- a/browser_use/browser/dom_watchdog.py +++ b/browser_use/browser/dom_watchdog.py @@ -232,69 +232,68 @@ class DOMWatchdog(BaseWatchdog): recent_events=self._get_recent_events_str() if event.include_recent_events else None, ) - # Execute DOM building and screenshot capture in parallel - dom_task = None - screenshot_task = None - - # Start DOM building task if requested - if event.include_dom: - self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🌳 Starting DOM tree build task...') - - previous_state = ( - self.browser_session._cached_browser_state_summary.dom_state - if self.browser_session._cached_browser_state_summary - else None - ) - - dom_task = asyncio.create_task(self._build_dom_tree_without_highlights(previous_state)) - - # Start clean screenshot task if requested (without JS highlights) - if event.include_screenshot: - self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 📸 Starting clean screenshot task...') - screenshot_task = asyncio.create_task(self._capture_clean_screenshot()) - - # Wait for both tasks to complete - content = None - screenshot_b64 = None - - if dom_task: - try: - content = await dom_task - self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ DOM tree build completed') - except Exception as e: - self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: DOM build failed: {e}, using minimal state') - content = SerializedDOMState(_root=None, selector_map={}) - else: - content = SerializedDOMState(_root=None, selector_map={}) - - if screenshot_task: - try: - screenshot_b64 = await screenshot_task - self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Clean screenshot captured') - except Exception as e: - self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Clean screenshot failed: {e}') - screenshot_b64 = None - - # Apply Python-based highlighting if both DOM and screenshot are available - if (screenshot_b64 and content and content.selector_map and - self.browser_session.browser_profile.highlight_elements): - try: - self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🎨 Applying Python-based highlighting...') - from browser_use.browser.python_highlights import create_highlighted_screenshot_async - - # Get CDP session for viewport info - cdp_session = await self.browser_session.get_or_create_cdp_session() - - screenshot_b64 = await create_highlighted_screenshot_async( - screenshot_b64, content.selector_map, cdp_session + # Execute DOM building and screenshot capture in parallel + dom_task = None + screenshot_task = None + + # Start DOM building task if requested + if event.include_dom: + self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🌳 Starting DOM tree build task...') + + previous_state = ( + self.browser_session._cached_browser_state_summary.dom_state + if self.browser_session._cached_browser_state_summary + else None ) - self.logger.debug(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Applied highlights to {len(content.selector_map)} elements') - except Exception as e: - self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Python highlighting failed: {e}') - - # Ensure we have valid content - if not content: - content = SerializedDOMState(_root=None, selector_map={}) + + dom_task = asyncio.create_task(self._build_dom_tree_without_highlights(previous_state)) + + # Start clean screenshot task if requested (without JS highlights) + if event.include_screenshot: + self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 📸 Starting clean screenshot task...') + screenshot_task = asyncio.create_task(self._capture_clean_screenshot()) + + # Wait for both tasks to complete + content = None + screenshot_b64 = None + + if dom_task: + try: + content = await dom_task + self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ DOM tree build completed') + except Exception as e: + self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: DOM build failed: {e}, using minimal state') + content = SerializedDOMState(_root=None, selector_map={}) + else: + content = SerializedDOMState(_root=None, selector_map={}) + + if screenshot_task: + try: + screenshot_b64 = await screenshot_task + self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Clean screenshot captured') + except Exception as e: + self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Clean screenshot failed: {e}') + screenshot_b64 = None + + # Apply Python-based highlighting if both DOM and screenshot are available + if screenshot_b64 and content and content.selector_map and self.browser_session.browser_profile.highlight_elements: + try: + self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🎨 Applying Python-based highlighting...') + from browser_use.browser.python_highlights import create_highlighted_screenshot_async + + # Get CDP session for viewport info + cdp_session = await self.browser_session.get_or_create_cdp_session() + + screenshot_b64 = await create_highlighted_screenshot_async(screenshot_b64, content.selector_map, cdp_session) + self.logger.debug( + f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Applied highlights to {len(content.selector_map)} elements' + ) + except Exception as e: + self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Python highlighting failed: {e}') + + # Ensure we have valid content + if not content: + content = SerializedDOMState(_root=None, selector_map={}) # Tabs info already fetched at the beginning @@ -466,6 +465,87 @@ class DOMWatchdog(BaseWatchdog): ) raise + async def _build_dom_tree_without_highlights(self, previous_state: SerializedDOMState | None = None) -> SerializedDOMState: + """Build DOM tree without injecting JavaScript highlights (for parallel execution).""" + try: + self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: STARTING DOM tree build') + + # Create or reuse DOM service + if self._dom_service is None: + self._dom_service = DomService(browser_session=self.browser_session, logger=self.logger) + + # Get serialized DOM tree using the service + self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: Calling DomService.get_serialized_dom_tree...') + start = time.time() + self.current_dom_state, self.enhanced_dom_tree, timing_info = await self._dom_service.get_serialized_dom_tree( + previous_cached_state=previous_state, + ) + end = time.time() + self.logger.debug( + '🔍 DOMWatchdog._build_dom_tree_without_highlights: ✅ DomService.get_serialized_dom_tree completed' + ) + + self.logger.debug(f'Time taken to get DOM tree: {end - start} seconds') + self.logger.debug(f'Timing breakdown: {timing_info}') + + # Update selector map for other watchdogs + self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: Updating selector maps...') + self.selector_map = self.current_dom_state.selector_map + # Update BrowserSession's cached selector map + if self.browser_session: + self.browser_session.update_cached_selector_map(self.selector_map) + self.logger.debug( + f'🔍 DOMWatchdog._build_dom_tree_without_highlights: ✅ Selector maps updated, {len(self.selector_map)} elements' + ) + + # Skip JavaScript highlighting injection - Python highlighting will be applied later + self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: ✅ COMPLETED DOM tree build (no JS highlights)') + return self.current_dom_state + + except Exception as e: + self.logger.error(f'Failed to build DOM tree without highlights: {e}') + self.event_bus.dispatch( + BrowserErrorEvent( + error_type='DOMBuildFailed', + message=str(e), + ) + ) + raise + + async def _capture_clean_screenshot(self) -> str: + """Capture a clean screenshot without JavaScript highlights.""" + try: + self.logger.debug('🔍 DOMWatchdog._capture_clean_screenshot: Capturing clean screenshot...') + + # Ensure we have a focused CDP session + assert self.browser_session.agent_focus is not None, 'No current target ID' + await self.browser_session.get_or_create_cdp_session(target_id=self.browser_session.agent_focus.target_id, focus=True) + + # Check if handler is registered + handlers = self.event_bus.handlers.get('ScreenshotEvent', []) + handler_names = [getattr(h, '__name__', str(h)) for h in handlers] + self.logger.debug(f'📸 ScreenshotEvent handlers registered: {len(handlers)} - {handler_names}') + + screenshot_event = self.event_bus.dispatch(ScreenshotEvent(full_page=False)) + self.logger.debug('📸 Dispatched ScreenshotEvent, waiting for event to complete...') + + # Wait for the event itself to complete (this waits for all handlers) + await screenshot_event + + # Get the single handler result + screenshot_b64 = await screenshot_event.event_result(raise_if_any=True, raise_if_none=True) + if screenshot_b64 is None: + raise RuntimeError('Screenshot handler returned None') + self.logger.debug('🔍 DOMWatchdog._capture_clean_screenshot: ✅ Clean screenshot captured successfully') + return str(screenshot_b64) + + except TimeoutError: + self.logger.warning('📸 Clean screenshot timed out after 6 seconds - no handler registered or slow page?') + raise + except Exception as e: + self.logger.warning(f'📸 Clean screenshot failed: {type(e).__name__}: {e}') + raise + async def _wait_for_stable_network(self): """Wait for page stability - simplified for CDP-only branch.""" start_time = time.time() diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index 357532c66..a07f59aaa 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -6,240 +6,261 @@ to draw bounding boxes around interactive elements directly on screenshots. import base64 import io -from typing import Dict, Any, Optional, Tuple -from PIL import Image, ImageDraw, ImageFont import logging +from typing import Optional, Tuple -from browser_use.dom.serializer.types import DOMSelectorMap +from PIL import Image, ImageDraw, ImageFont + +from browser_use.dom.views import DOMSelectorMap +from browser_use.observability import observe_debug logger = logging.getLogger(__name__) # Color scheme for different element types ELEMENT_COLORS = { - 'button': '#FF6B6B', # Red for buttons - 'input': '#4ECDC4', # Teal for inputs - 'select': '#45B7D1', # Blue for dropdowns - 'a': '#96CEB4', # Green for links - 'textarea': '#FFEAA7', # Yellow for text areas - 'default': '#DDA0DD', # Light purple for other interactive elements + 'button': '#FF6B6B', # Red for buttons + 'input': '#4ECDC4', # Teal for inputs + 'select': '#45B7D1', # Blue for dropdowns + 'a': '#96CEB4', # Green for links + 'textarea': '#FFEAA7', # Yellow for text areas + 'default': '#DDA0DD', # Light purple for other interactive elements } # Element type mappings ELEMENT_TYPE_MAP = { - 'button': 'button', - 'input': 'input', - 'select': 'select', - 'a': 'a', - 'textarea': 'textarea', + 'button': 'button', + 'input': 'input', + 'select': 'select', + 'a': 'a', + 'textarea': 'textarea', } + def get_element_color(tag_name: str, element_type: Optional[str] = None) -> str: - """Get color for element based on tag name and type.""" - # Check input type first - if tag_name == 'input' and element_type: - if element_type in ['button', 'submit']: - return ELEMENT_COLORS['button'] - - # Use tag-based color - return ELEMENT_COLORS.get(tag_name.lower(), ELEMENT_COLORS['default']) + """Get color for element based on tag name and type.""" + # Check input type first + if tag_name == 'input' and element_type: + if element_type in ['button', 'submit']: + return ELEMENT_COLORS['button'] + + # Use tag-based color + return ELEMENT_COLORS.get(tag_name.lower(), ELEMENT_COLORS['default']) + def should_show_text_overlay(text: Optional[str]) -> bool: - """Determine if text overlay should be shown based on length.""" - if not text: - return False - return len(text.strip()) <= 10 + """Determine if text overlay should be shown based on length.""" + if not text: + return False + return len(text.strip()) <= 10 + def draw_bounding_box_with_text( - draw: ImageDraw.Draw, - bbox: Tuple[int, int, int, int], - color: str, - text: Optional[str] = None, - font: Optional[ImageFont.FreeTypeFont] = None + draw, # ImageDraw.Draw - avoiding type annotation due to PIL typing issues + bbox: Tuple[int, int, int, int], + color: str, + text: Optional[str] = None, + font: Optional[ImageFont.FreeTypeFont] = None, ) -> None: - """Draw a bounding box with optional text overlay.""" - x1, y1, x2, y2 = bbox - - # Draw bounding box with 2px width - for i in range(2): - draw.rectangle([x1 + i, y1 + i, x2 - i, y2 - i], outline=color, fill=None) - - # Draw text overlay if provided and short enough - if text and should_show_text_overlay(text): - try: - # Get text size - if font: - bbox_text = draw.textbbox((0, 0), text, font=font) - text_width = bbox_text[2] - bbox_text[0] - text_height = bbox_text[3] - bbox_text[1] - else: - # Fallback for default font - bbox_text = draw.textbbox((0, 0), text) - text_width = bbox_text[2] - bbox_text[0] - text_height = bbox_text[3] - bbox_text[1] - - # Position text at top-left of bounding box - text_x = max(0, x1) - text_y = max(0, y1 - text_height - 2) # Above the box - - # Draw background rectangle for text - draw.rectangle( - [text_x - 2, text_y - 2, text_x + text_width + 2, text_y + text_height + 2], - fill=color, - outline=None - ) - - # Draw text - draw.text((text_x, text_y), text, fill='white', font=font) - - except Exception as e: - logger.debug(f"Failed to draw text overlay: {e}") + """Draw a bounding box with optional text overlay.""" + x1, y1, x2, y2 = bbox + # Draw bounding box with 2px width + for i in range(2): + draw.rectangle([x1 + i, y1 + i, x2 - i, y2 - i], outline=color, fill=None) + + # Draw text overlay if provided and short enough + if text and should_show_text_overlay(text): + try: + # Get text size + if font: + bbox_text = draw.textbbox((0, 0), text, font=font) + text_width = bbox_text[2] - bbox_text[0] + text_height = bbox_text[3] - bbox_text[1] + else: + # Fallback for default font + bbox_text = draw.textbbox((0, 0), text) + text_width = bbox_text[2] - bbox_text[0] + text_height = bbox_text[3] - bbox_text[1] + + # Position text at top-left of bounding box + text_x = max(0, x1) + text_y = max(0, y1 - text_height - 2) # Above the box + + # Draw background rectangle for text + draw.rectangle([text_x - 2, text_y - 2, text_x + text_width + 2, text_y + text_height + 2], fill=color, outline=None) + + # Draw text + draw.text((text_x, text_y), text, fill='white', font=font) + + except Exception as e: + logger.debug(f'Failed to draw text overlay: {e}') + + +@observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot') def create_highlighted_screenshot( - screenshot_b64: str, - selector_map: DOMSelectorMap, - device_pixel_ratio: float = 1.0, - viewport_offset_x: int = 0, - viewport_offset_y: int = 0 + screenshot_b64: str, + selector_map: DOMSelectorMap, + device_pixel_ratio: float = 1.0, + viewport_offset_x: int = 0, + viewport_offset_y: int = 0, ) -> str: - """Create a highlighted screenshot with bounding boxes around interactive elements. - - Args: - screenshot_b64: Base64 encoded screenshot - selector_map: Map of interactive elements with their positions - device_pixel_ratio: Device pixel ratio for scaling coordinates - viewport_offset_x: X offset for viewport positioning - viewport_offset_y: Y offset for viewport positioning - - Returns: - Base64 encoded highlighted screenshot - """ - try: - # Decode screenshot - screenshot_data = base64.b64decode(screenshot_b64) - image = Image.open(io.BytesIO(screenshot_data)).convert('RGBA') - - # Create drawing context - draw = ImageDraw.Draw(image) - - # Try to load a font, fall back to default if not available - font = None - try: - font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 12) - except: - try: - font = ImageFont.truetype("arial.ttf", 12) - except: - font = None # Use default font - - # Process each interactive element - for element_id, element in selector_map.items(): - try: - # Get element bounds - bounds = element.bounds - if not bounds: - continue - - # Scale coordinates by device pixel ratio and apply viewport offset - x1 = int((bounds.x + viewport_offset_x) * device_pixel_ratio) - y1 = int((bounds.y + viewport_offset_y) * device_pixel_ratio) - x2 = int((bounds.x + bounds.width + viewport_offset_x) * device_pixel_ratio) - y2 = int((bounds.y + bounds.height + viewport_offset_y) * device_pixel_ratio) - - # Ensure coordinates are within image bounds - img_width, img_height = image.size - x1 = max(0, min(x1, img_width)) - y1 = max(0, min(y1, img_height)) - x2 = max(x1, min(x2, img_width)) - y2 = max(y1, min(y2, img_height)) - - # Skip if bounding box is too small or invalid - if x2 - x1 < 2 or y2 - y1 < 2: - continue - - # Get element color based on type - tag_name = element.tag_name if hasattr(element, 'tag_name') else 'div' - element_type = None - if hasattr(element, 'attributes') and element.attributes: - element_type = element.attributes.get('type') - - color = get_element_color(tag_name, element_type) - - # Get text for overlay (if short enough) - text = None - if hasattr(element, 'text') and element.text: - text = element.text.strip() - elif hasattr(element, 'attributes') and element.attributes: - # Try to get meaningful text from attributes - text = (element.attributes.get('aria-label') or - element.attributes.get('title') or - element.attributes.get('placeholder') or - element.attributes.get('value', '')) - - # Draw bounding box with optional text - draw_bounding_box_with_text( - draw, (x1, y1, x2, y2), color, text, font - ) - - except Exception as e: - logger.debug(f"Failed to draw highlight for element {element_id}: {e}") - continue - - # Convert back to base64 - output_buffer = io.BytesIO() - image.save(output_buffer, format='PNG') - output_buffer.seek(0) - - highlighted_b64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8') - - logger.debug(f"Successfully created highlighted screenshot with {len(selector_map)} elements") - return highlighted_b64 - - except Exception as e: - logger.error(f"Failed to create highlighted screenshot: {e}") - # Return original screenshot on error - return screenshot_b64 + """Create a highlighted screenshot with bounding boxes around interactive elements. -def get_viewport_info_from_cdp(cdp_session) -> Tuple[float, int, int]: - """Get viewport information from CDP session. - - Returns: - Tuple of (device_pixel_ratio, viewport_offset_x, viewport_offset_y) - """ - # This is a placeholder - in real implementation, you'd get this from CDP - # For now, return sensible defaults - return 1.0, 0, 0 + Args: + screenshot_b64: Base64 encoded screenshot + selector_map: Map of interactive elements with their positions + device_pixel_ratio: Device pixel ratio for scaling coordinates + viewport_offset_x: X offset for viewport positioning + viewport_offset_y: Y offset for viewport positioning -async def create_highlighted_screenshot_async( - screenshot_b64: str, - selector_map: DOMSelectorMap, - cdp_session = None -) -> str: - """Async wrapper for creating highlighted screenshots. - - Args: - screenshot_b64: Base64 encoded screenshot - selector_map: Map of interactive elements - cdp_session: CDP session for getting viewport info - - Returns: - Base64 encoded highlighted screenshot - """ - # Get viewport information if CDP session is available - device_pixel_ratio = 1.0 - viewport_offset_x = 0 - viewport_offset_y = 0 - - if cdp_session: - try: - device_pixel_ratio, viewport_offset_x, viewport_offset_y = get_viewport_info_from_cdp(cdp_session) - except Exception as e: - logger.debug(f"Failed to get viewport info from CDP: {e}") - - # Create highlighted screenshot (run in thread pool if needed for performance) - return create_highlighted_screenshot( - screenshot_b64, - selector_map, - device_pixel_ratio, - viewport_offset_x, - viewport_offset_y - ) \ No newline at end of file + Returns: + Base64 encoded highlighted screenshot + """ + try: + # Decode screenshot + screenshot_data = base64.b64decode(screenshot_b64) + image = Image.open(io.BytesIO(screenshot_data)).convert('RGBA') + + # Create drawing context + draw = ImageDraw.Draw(image) + + # Try to load a font, fall back to default if not available + font = None + try: + font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 12) + except: + try: + font = ImageFont.truetype('arial.ttf', 12) + except: + font = None # Use default font + + # Process each interactive element + for element_id, element in selector_map.items(): + try: + # Use snapshot bounds (document coordinates) if available, otherwise absolute_position + bounds = None + if element.snapshot_node and element.snapshot_node.bounds: + bounds = element.snapshot_node.bounds + elif element.absolute_position: + bounds = element.absolute_position + + if not bounds: + continue + + # Convert from CSS pixels to device pixels for screenshot coordinates + # Note: bounds are already in CSS pixels, screenshot is in device pixels + x1 = int((bounds.x - viewport_offset_x) * device_pixel_ratio) + y1 = int((bounds.y - viewport_offset_y) * device_pixel_ratio) + x2 = int((bounds.x + bounds.width - viewport_offset_x) * device_pixel_ratio) + y2 = int((bounds.y + bounds.height - viewport_offset_y) * device_pixel_ratio) + + # Ensure coordinates are within image bounds + img_width, img_height = image.size + x1 = max(0, min(x1, img_width)) + y1 = max(0, min(y1, img_height)) + x2 = max(x1, min(x2, img_width)) + y2 = max(y1, min(y2, img_height)) + + # Skip if bounding box is too small or invalid + if x2 - x1 < 2 or y2 - y1 < 2: + continue + + # Get element color based on type + tag_name = element.tag_name if hasattr(element, 'tag_name') else 'div' + element_type = None + if hasattr(element, 'attributes') and element.attributes: + element_type = element.attributes.get('type') + + color = get_element_color(tag_name, element_type) + + # Get text for overlay (if short enough) + text = None + if hasattr(element, 'node_value') and element.node_value: + text = element.node_value.strip() + elif hasattr(element, 'attributes') and element.attributes: + # Try to get meaningful text from attributes + text = ( + element.attributes.get('aria-label') + or element.attributes.get('title') + or element.attributes.get('placeholder') + or element.attributes.get('value', '') + ) + + # Draw bounding box with optional text + draw_bounding_box_with_text(draw, (x1, y1, x2, y2), color, text, font) + + except Exception as e: + logger.debug(f'Failed to draw highlight for element {element_id}: {e}') + continue + + # Convert back to base64 + output_buffer = io.BytesIO() + image.save(output_buffer, format='PNG') + output_buffer.seek(0) + + highlighted_b64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8') + + logger.debug(f'Successfully created highlighted screenshot with {len(selector_map)} elements') + return highlighted_b64 + + except Exception as e: + logger.error(f'Failed to create highlighted screenshot: {e}') + # Return original screenshot on error + return screenshot_b64 + + +async def get_viewport_info_from_cdp(cdp_session) -> Tuple[float, int, int]: + """Get viewport information from CDP session. + + Returns: + Tuple of (device_pixel_ratio, scroll_x, scroll_y) + """ + try: + # Get layout metrics which includes viewport info and device pixel ratio + metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id) + + # Extract viewport information + visual_viewport = metrics.get('visualViewport', {}) + css_visual_viewport = metrics.get('cssVisualViewport', {}) + css_layout_viewport = metrics.get('cssLayoutViewport', {}) + + # Calculate device pixel ratio + css_width = css_visual_viewport.get('clientWidth', css_layout_viewport.get('clientWidth', 1280.0)) + device_width = visual_viewport.get('clientWidth', css_width) + device_pixel_ratio = device_width / css_width if css_width > 0 else 1.0 + + # Get scroll position in CSS pixels + scroll_x = int(css_visual_viewport.get('pageX', 0)) + scroll_y = int(css_visual_viewport.get('pageY', 0)) + + return float(device_pixel_ratio), scroll_x, scroll_y + + except Exception as e: + logger.debug(f'Failed to get viewport info from CDP: {e}') + return 1.0, 0, 0 + + +async def create_highlighted_screenshot_async(screenshot_b64: str, selector_map: DOMSelectorMap, cdp_session=None) -> str: + """Async wrapper for creating highlighted screenshots. + + Args: + screenshot_b64: Base64 encoded screenshot + selector_map: Map of interactive elements + cdp_session: CDP session for getting viewport info + + Returns: + Base64 encoded highlighted screenshot + """ + # Get viewport information if CDP session is available + device_pixel_ratio = 1.0 + viewport_offset_x = 0 + viewport_offset_y = 0 + + if cdp_session: + try: + device_pixel_ratio, viewport_offset_x, viewport_offset_y = await get_viewport_info_from_cdp(cdp_session) + except Exception as e: + logger.debug(f'Failed to get viewport info from CDP: {e}') + + # Create highlighted screenshot (run in thread pool if needed for performance) + return create_highlighted_screenshot(screenshot_b64, selector_map, device_pixel_ratio, viewport_offset_x, viewport_offset_y) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 78747141f..2e28f17a4 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -35,6 +35,7 @@ from browser_use.browser.events import ( from browser_use.browser.profile import BrowserProfile from browser_use.browser.views import BrowserStateSummary, TabInfo from browser_use.dom.views import EnhancedDOMTreeNode, TargetInfo +from browser_use.observability import observe_debug from browser_use.utils import _log_pretty_url, is_new_tab_page DEFAULT_BROWSER_PROFILE = BrowserProfile() @@ -737,7 +738,7 @@ class BrowserSession(BaseModel): return self.agent_focus.session_id if self.agent_focus else None # ========== Helper Methods ========== - + @observe_debug(ignore_input=True, ignore_output=True, name='get_browser_state_summary') async def get_browser_state_summary( self, cache_clickable_elements_hashes: bool = True, From 78a1f70adeec96047a086bafc2db96de1e240536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 17 Aug 2025 16:32:16 -0700 Subject: [PATCH 03/69] Use dash lines for highlights --- browser_use/browser/python_highlights.py | 68 +++++++++++++++++++----- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index a07f59aaa..93744558f 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -51,7 +51,8 @@ def should_show_text_overlay(text: Optional[str]) -> bool: """Determine if text overlay should be shown based on length.""" if not text: return False - return len(text.strip()) <= 10 + # Always show text overlay if we have text (since we now include element indices) + return True def draw_bounding_box_with_text( @@ -64,9 +65,41 @@ def draw_bounding_box_with_text( """Draw a bounding box with optional text overlay.""" x1, y1, x2, y2 = bbox - # Draw bounding box with 2px width - for i in range(2): - draw.rectangle([x1 + i, y1 + i, x2 - i, y2 - i], outline=color, fill=None) + # Draw dashed bounding box + dash_length = 8 + gap_length = 4 + + # Top edge + x = x1 + while x < x2: + end_x = min(x + dash_length, x2) + draw.line([(x, y1), (end_x, y1)], fill=color, width=2) + draw.line([(x, y1 + 1), (end_x, y1 + 1)], fill=color, width=2) + x += dash_length + gap_length + + # Bottom edge + x = x1 + while x < x2: + end_x = min(x + dash_length, x2) + draw.line([(x, y2), (end_x, y2)], fill=color, width=2) + draw.line([(x, y2 - 1), (end_x, y2 - 1)], fill=color, width=2) + x += dash_length + gap_length + + # Left edge + y = y1 + while y < y2: + end_y = min(y + dash_length, y2) + draw.line([(x1, y), (x1, end_y)], fill=color, width=2) + draw.line([(x1 + 1, y), (x1 + 1, end_y)], fill=color, width=2) + y += dash_length + gap_length + + # Right edge + y = y1 + while y < y2: + end_y = min(y + dash_length, y2) + draw.line([(x2, y), (x2, end_y)], fill=color, width=2) + draw.line([(x2 - 1, y), (x2 - 1, end_y)], fill=color, width=2) + y += dash_length + gap_length # Draw text overlay if provided and short enough if text and should_show_text_overlay(text): @@ -127,11 +160,11 @@ def create_highlighted_screenshot( # Try to load a font, fall back to default if not available font = None try: - font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 12) - except: + font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 9) + except (OSError, IOError): try: - font = ImageFont.truetype('arial.ttf', 12) - except: + font = ImageFont.truetype('arial.ttf', 9) + except (OSError, IOError): font = None # Use default font # Process each interactive element @@ -175,10 +208,15 @@ def create_highlighted_screenshot( # Get text for overlay (if short enough) text = None - if hasattr(element, 'node_value') and element.node_value: - text = element.node_value.strip() - elif hasattr(element, 'attributes') and element.attributes: - # Try to get meaningful text from attributes + + # First try to get element text content + if hasattr(element, 'get_all_children_text'): + element_text = element.get_all_children_text() + if element_text and element_text.strip(): + text = element_text.strip() + + # If no text content, try attributes + if not text and hasattr(element, 'attributes') and element.attributes: text = ( element.attributes.get('aria-label') or element.attributes.get('title') @@ -186,6 +224,11 @@ def create_highlighted_screenshot( or element.attributes.get('value', '') ) + # Always show only the element index (no text content) + text = None + if hasattr(element, 'element_index') and element.element_index is not None: + text = str(element.element_index) + # Draw bounding box with optional text draw_bounding_box_with_text(draw, (x1, y1, x2, y2), color, text, font) @@ -240,6 +283,7 @@ async def get_viewport_info_from_cdp(cdp_session) -> Tuple[float, int, int]: return 1.0, 0, 0 +@observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot_async') async def create_highlighted_screenshot_async(screenshot_b64: str, selector_map: DOMSelectorMap, cdp_session=None) -> str: """Async wrapper for creating highlighted screenshots. From 8d7bf5222320c9f3b7f3b6a06ccb6f3c2f7ca067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 17 Aug 2025 17:01:23 -0700 Subject: [PATCH 04/69] bigger highlights and less dashes --- browser_use/browser/python_highlights.py | 69 ++++++++++-------------- 1 file changed, 27 insertions(+), 42 deletions(-) diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index 93744558f..59ccb9b1c 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -47,12 +47,9 @@ def get_element_color(tag_name: str, element_type: Optional[str] = None) -> str: return ELEMENT_COLORS.get(tag_name.lower(), ELEMENT_COLORS['default']) -def should_show_text_overlay(text: Optional[str]) -> bool: - """Determine if text overlay should be shown based on length.""" - if not text: - return False - # Always show text overlay if we have text (since we now include element indices) - return True +def should_show_index_overlay(element_index: Optional[int]) -> bool: + """Determine if index overlay should be shown.""" + return element_index is not None def draw_bounding_box_with_text( @@ -66,8 +63,8 @@ def draw_bounding_box_with_text( x1, y1, x2, y2 = bbox # Draw dashed bounding box - dash_length = 8 - gap_length = 4 + dash_length = 2 + gap_length = 6 # Top edge x = x1 @@ -101,8 +98,8 @@ def draw_bounding_box_with_text( draw.line([(x2 - 1, y), (x2 - 1, end_y)], fill=color, width=2) y += dash_length + gap_length - # Draw text overlay if provided and short enough - if text and should_show_text_overlay(text): + # Draw index overlay if we have index text + if text: try: # Get text size if font: @@ -115,15 +112,22 @@ def draw_bounding_box_with_text( text_width = bbox_text[2] - bbox_text[0] text_height = bbox_text[3] - bbox_text[1] - # Position text at top-left of bounding box + # Position text at top-left of bounding box with padding + padding = 3 text_x = max(0, x1) - text_y = max(0, y1 - text_height - 2) # Above the box + text_y = max(0, y1 - text_height - padding * 2) if y1 > text_height + padding * 2 else y1 + padding - # Draw background rectangle for text - draw.rectangle([text_x - 2, text_y - 2, text_x + text_width + 2, text_y + text_height + 2], fill=color, outline=None) + # Draw background rectangle for maximum contrast + bg_x1 = text_x - padding + bg_y1 = text_y - padding + bg_x2 = text_x + text_width + padding + bg_y2 = text_y + text_height + padding - # Draw text - draw.text((text_x, text_y), text, fill='white', font=font) + # Use white background with thick black border for maximum visibility + draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill='white', outline='black', width=2) + + # Draw bold dark text on light background for best contrast + draw.text((text_x, text_y), text, fill='black', font=font) except Exception as e: logger.debug(f'Failed to draw text overlay: {e}') @@ -160,10 +164,10 @@ def create_highlighted_screenshot( # Try to load a font, fall back to default if not available font = None try: - font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 9) + font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 12) except (OSError, IOError): try: - font = ImageFont.truetype('arial.ttf', 9) + font = ImageFont.truetype('arial.ttf', 12) except (OSError, IOError): font = None # Use default font @@ -206,31 +210,12 @@ def create_highlighted_screenshot( color = get_element_color(tag_name, element_type) - # Get text for overlay (if short enough) - text = None + # Get element index for overlay + element_index = getattr(element, 'element_index', None) + index_text = str(element_index) if element_index is not None else None - # First try to get element text content - if hasattr(element, 'get_all_children_text'): - element_text = element.get_all_children_text() - if element_text and element_text.strip(): - text = element_text.strip() - - # If no text content, try attributes - if not text and hasattr(element, 'attributes') and element.attributes: - text = ( - element.attributes.get('aria-label') - or element.attributes.get('title') - or element.attributes.get('placeholder') - or element.attributes.get('value', '') - ) - - # Always show only the element index (no text content) - text = None - if hasattr(element, 'element_index') and element.element_index is not None: - text = str(element.element_index) - - # Draw bounding box with optional text - draw_bounding_box_with_text(draw, (x1, y1, x2, y2), color, text, font) + # Draw bounding box with index + draw_bounding_box_with_text(draw, (x1, y1, x2, y2), color, index_text, font) except Exception as e: logger.debug(f'Failed to draw highlight for element {element_id}: {e}') From ae585cf2c5366b318fd301a0de13369778f0e19d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 17 Aug 2025 17:14:57 -0700 Subject: [PATCH 05/69] Highlights clever placement --- browser_use/browser/python_highlights.py | 31 +++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index 59ccb9b1c..a1d34f35b 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -112,10 +112,35 @@ def draw_bounding_box_with_text( text_width = bbox_text[2] - bbox_text[0] text_height = bbox_text[3] - bbox_text[1] - # Position text at top-left of bounding box with padding + # Smart positioning based on element size padding = 3 - text_x = max(0, x1) - text_y = max(0, y1 - text_height - padding * 2) if y1 > text_height + padding * 2 else y1 + padding + element_width = x2 - x1 + element_height = y2 - y1 + element_area = element_width * element_height + index_box_area = (text_width + padding * 2) * (text_height + padding * 2) + + # Calculate size ratio to determine positioning strategy + size_ratio = element_area / max(index_box_area, 1) + + if size_ratio < 4: + # Very small elements: place outside in bottom-right corner + text_x = x2 + padding + text_y = y2 - text_height + # Ensure it doesn't go off screen + text_x = min(text_x, 1200 - text_width - padding) + text_y = max(text_y, 0) + elif size_ratio < 16: + # Medium elements: place in bottom-right corner inside + text_x = x2 - text_width - padding + text_y = y2 - text_height - padding + else: + # Large elements: place in center + text_x = x1 + (element_width - text_width) // 2 + text_y = y1 + (element_height - text_height) // 2 + + # Ensure text stays within bounds + text_x = max(0, min(text_x, 1200 - text_width)) + text_y = max(0, min(text_y, 800 - text_height)) # Draw background rectangle for maximum contrast bg_x1 = text_x - padding From 8c32861c4cfee9187f941e9e6aaecb8102804b70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Wed, 27 Aug 2025 11:51:47 -0700 Subject: [PATCH 06/69] Add logging if setup complete --- browser_use/agent/service.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index a518b6fc3..3dc3329f0 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -1272,6 +1272,12 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.debug(f'✅ Added navigation to {initial_url} as initial action') + # Ensure browser focus is properly established before executing initial actions + if self.browser_session and self.browser_session.agent_focus: + self.logger.debug(f'🎯 Browser focus established on target: {self.browser_session.agent_focus.target_id[-4:]}') + else: + self.logger.warning('⚠️ No browser focus established, may cause navigation issues') + # Execute initial actions if provided if self.initial_actions and not self.state.follow_up_task: self.logger.debug(f'⚡ Executing {len(self.initial_actions)} initial actions...') From e07c229104c453ebd0b94825798eaba248e3e65f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Wed, 27 Aug 2025 12:13:08 -0700 Subject: [PATCH 07/69] Skip switch tab --- browser_use/browser/session.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 9c0b180f7..7c6d7555f 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -584,10 +584,18 @@ class BrowserSession(BaseModel): # Use current tab target_id = target_id or self.agent_focus.target_id - # Activate target (bring to foreground) - await self.event_bus.dispatch(SwitchTabEvent(target_id=target_id)) - # which does this for us: - # self.agent_focus = await self.get_or_create_cdp_session(target_id) + # Only switch tab if we're not already on the target tab + if self.agent_focus is None or self.agent_focus.target_id != target_id: + self.logger.debug( + f'[on_NavigateToUrlEvent] Switching to target tab {target_id[-4:]} (current: {self.agent_focus.target_id[-4:] if self.agent_focus else "none"})' + ) + # Activate target (bring to foreground) + await self.event_bus.dispatch(SwitchTabEvent(target_id=target_id)) + # which does this for us: + # self.agent_focus = await self.get_or_create_cdp_session(target_id) + else: + self.logger.debug(f'[on_NavigateToUrlEvent] Already on target tab {target_id[-4:]}, skipping SwitchTabEvent') + assert self.agent_focus is not None and self.agent_focus.target_id == target_id, ( 'Agent focus not updated to new target_id after SwitchTabEvent should have switched to it' ) From 78049eee07941724b0658926c9fbdb90caabeaf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Wed, 27 Aug 2025 12:26:59 -0700 Subject: [PATCH 08/69] Remove 0.5 s waiting in navigation --- browser_use/browser/session.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 7c6d7555f..0f270fbd1 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -613,8 +613,8 @@ class BrowserSession(BaseModel): session_id=self.agent_focus.session_id, ) - # Wait a bit to ensure page starts loading - await asyncio.sleep(0.5) + # # Wait a bit to ensure page starts loading + # await asyncio.sleep(0.5) # Dispatch navigation complete self.logger.debug(f'Dispatching NavigationCompleteEvent for {event.url} (tab #{target_id[-4:]})') From 0dd27ca9760c01fd9007fa1829e59afb3063335e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Wed, 27 Aug 2025 16:37:17 -0700 Subject: [PATCH 09/69] Remove js in downloadwatchdog --- .../browser/watchdogs/downloads_watchdog.py | 184 ++++++++++-------- 1 file changed, 102 insertions(+), 82 deletions(-) diff --git a/browser_use/browser/watchdogs/downloads_watchdog.py b/browser_use/browser/watchdogs/downloads_watchdog.py index f0bd5c986..5004ee097 100644 --- a/browser_use/browser/watchdogs/downloads_watchdog.py +++ b/browser_use/browser/watchdogs/downloads_watchdog.py @@ -114,6 +114,8 @@ class DownloadsWatchdog(BaseWatchdog): self.logger.debug('[DownloadsWatchdog] Skipping PDF check - auto-download disabled') return + # Note: Using network-based PDF detection that doesn't require JavaScript + target_id = event.target_id self.logger.debug(f'[DownloadsWatchdog] Got target_id={target_id} for tab #{event.target_id[-4:]}') @@ -552,8 +554,9 @@ class DownloadsWatchdog(BaseWatchdog): del self._active_downloads[download_id] async def check_for_pdf_viewer(self, target_id: TargetID) -> bool: - """Check if the current target is Chrome's built-in PDF viewer. + """Check if the current target is a PDF using network-based detection. + This method avoids JavaScript execution that can crash WebSocket connections. Returns True if a PDF is detected and should be downloaded. """ self.logger.debug(f'[DownloadsWatchdog] Checking if target {target_id} is PDF viewer...') @@ -575,98 +578,115 @@ class DownloadsWatchdog(BaseWatchdog): return cached_result try: - # Create a temporary CDP session for this target without switching focus - import asyncio - - temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False) - - result = await asyncio.wait_for( - temp_session.cdp_client.send.Runtime.evaluate( - params={ - 'expression': """ - (() => { - // Check for Chrome's built-in PDF viewer (both old and new selectors) - const pdfEmbed = document.querySelector('embed[type="application/x-google-chrome-pdf"]') || - document.querySelector('embed[type="application/pdf"]'); - if (pdfEmbed) { - // For Chrome PDF viewer, use window.location.href not embed.src (which is often about:blank) - return { - isPdf: true, - url: window.location.href, - isChromePdfViewer: true - }; - } - - // Check for direct PDF navigation - if (document.contentType === 'application/pdf') { - return { - isPdf: true, - url: window.location.href, - isDirectPdf: true - }; - } - - // Also check if the URL ends with .pdf or has PDF in it - const url = window.location.href; - const isPdfUrl = url.toLowerCase().includes('.pdf'); - if (isPdfUrl) { - return { - isPdf: true, - url: url, - isPdfUrl: true - }; - } - - // Check for PDF in iframe - const iframes = document.querySelectorAll('iframe'); - for (const iframe of iframes) { - try { - const iframeDoc = iframe.contentDocument || iframe.contentWindow.document; - if (iframeDoc.contentType === 'application/pdf') { - return { - isPdf: true, - url: iframe.src, - isIframePdf: true - }; - } - } catch (e) { - // Cross-origin iframe, skip - } - } - - return { isPdf: false }; - })() - """, - 'returnByValue': True, - }, - session_id=temp_session.session_id, - ), - timeout=5.0, # 5 second timeout to prevent hanging - ) - - # No need to detach - session is cached - is_pdf_viewer = result.get('result', {}).get('value', {}) - - if is_pdf_viewer.get('isPdf', False): - self.logger.debug( - f'[DownloadsWatchdog] PDF detected: {is_pdf_viewer.get("url", "unknown")} ' - f'(type: {"Chrome viewer" if is_pdf_viewer.get("isChromePdfViewer") else "direct PDF" if is_pdf_viewer.get("isDirectPdf") else "PDF URL" if is_pdf_viewer.get("isPdfUrl") else "iframe PDF"})' - ) + # Method 1: Check URL patterns (fastest, most reliable) + url_is_pdf = self._check_url_for_pdf(page_url) + if url_is_pdf: + self.logger.debug(f'[DownloadsWatchdog] PDF detected via URL pattern: {page_url}') self._pdf_viewer_cache[page_url] = True return True + # Method 2: Check network response headers via CDP (safer than JavaScript) + header_is_pdf = await self._check_network_headers_for_pdf(target_id) + if header_is_pdf: + self.logger.debug(f'[DownloadsWatchdog] PDF detected via network headers: {page_url}') + self._pdf_viewer_cache[page_url] = True + return True + + # Method 3: Check Chrome's PDF viewer specific URLs + chrome_pdf_viewer = self._is_chrome_pdf_viewer_url(page_url) + if chrome_pdf_viewer: + self.logger.debug(f'[DownloadsWatchdog] Chrome PDF viewer detected: {page_url}') + self._pdf_viewer_cache[page_url] = True + return True + + # Not a PDF self._pdf_viewer_cache[page_url] = False return False - except TimeoutError: - self.logger.warning(f'[DownloadsWatchdog] ❌ PDF check timed out for target: {page_url}') - self._pdf_viewer_cache[page_url] = False - return False except Exception as e: self.logger.warning(f'[DownloadsWatchdog] ❌ Error checking for PDF viewer: {e}') self._pdf_viewer_cache[page_url] = False return False + def _check_url_for_pdf(self, url: str) -> bool: + """Check if URL indicates a PDF file.""" + if not url: + return False + + url_lower = url.lower() + + # Direct PDF file extensions + if url_lower.endswith('.pdf'): + return True + + # PDF in path + if '.pdf' in url_lower: + return True + + # PDF MIME type in URL parameters + if any( + param in url_lower + for param in [ + 'content-type=application/pdf', + 'content-type=application%2fpdf', + 'mimetype=application/pdf', + 'type=application/pdf', + ] + ): + return True + + return False + + def _is_chrome_pdf_viewer_url(self, url: str) -> bool: + """Check if this is Chrome's internal PDF viewer URL.""" + if not url: + return False + + url_lower = url.lower() + + # Chrome PDF viewer uses chrome-extension:// URLs + if 'chrome-extension://' in url_lower and 'pdf' in url_lower: + return True + + # Chrome PDF viewer internal URLs + if url_lower.startswith('chrome://') and 'pdf' in url_lower: + return True + + return False + + async def _check_network_headers_for_pdf(self, target_id: TargetID) -> bool: + """Check network response headers for PDF content-type.""" + try: + import asyncio + + # Get CDP session + temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False) + + # Get navigation history to find the main resource + history = await asyncio.wait_for( + temp_session.cdp_client.send.Page.getNavigationHistory(session_id=temp_session.session_id), timeout=3.0 + ) + + current_entry = history.get('entries', []) + if current_entry: + current_index = history.get('currentIndex', 0) + if 0 <= current_index < len(current_entry): + current_url = current_entry[current_index].get('url', '') + + # Check if the URL itself suggests PDF + if self._check_url_for_pdf(current_url): + return True + + # Note: CDP doesn't easily expose response headers for completed navigations + # For more complex cases, we'd need to set up Network.responseReceived listeners + # before navigation, but that's overkill for most PDF detection cases + + return False + + except Exception as e: + self.logger.debug(f'[DownloadsWatchdog] Network headers check failed (non-critical): {e}') + return False + async def trigger_pdf_download(self, target_id: TargetID) -> str | None: """Trigger download of a PDF from Chrome's PDF viewer. From 21febf82ad08f1c53a171e3a2c4f53acb3cf2340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Wed, 27 Aug 2025 16:41:13 -0700 Subject: [PATCH 10/69] Check for highlight_elements flag --- browser_use/browser/session.py | 3 +++ browser_use/browser/watchdogs/dom_watchdog.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 0f270fbd1..3af32249c 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -1527,6 +1527,9 @@ class BrowserSession(BaseModel): async def remove_highlights(self) -> None: """Remove highlights from the page using CDP.""" + if not self.browser_profile.highlight_elements: + return + try: # Get cached session cdp_session = await self.get_or_create_cdp_session() diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index bafa37ada..372d1d414 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -452,7 +452,7 @@ class DOMWatchdog(BaseWatchdog): self.logger.debug(f'🔍 DOMWatchdog._build_dom_tree: ✅ Selector maps updated, {len(self.selector_map)} elements') # Inject highlighting for visual feedback if we have elements - if self.selector_map and self._dom_service: + if self.selector_map and self._dom_service and self.browser_session.browser_profile.highlight_elements: try: self.logger.debug('🔍 DOMWatchdog._build_dom_tree: Injecting highlighting script...') from browser_use.dom.debug.highlights import inject_highlighting_script From efb2f8bfaed90cefcd5d0052d32dd9675cbbb5dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:21:35 -0700 Subject: [PATCH 11/69] Autopdf download to false --- browser_use/browser/profile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py index 72a53b33a..e29d910b0 100644 --- a/browser_use/browser/profile.py +++ b/browser_use/browser/profile.py @@ -585,7 +585,9 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro highlight_elements: bool = Field(default=True, description='Highlight interactive elements on the page.') # --- Downloads --- - auto_download_pdfs: bool = Field(default=True, description='Automatically download PDFs when navigating to PDF viewer pages.') + auto_download_pdfs: bool = Field( + default=False, description='Automatically download PDFs when navigating to PDF viewer pages.' + ) profile_directory: str = 'Default' # e.g. 'Profile 1', 'Profile 2', 'Custom Profile', etc. From c969b956d665e2bc0294accdb7a4b5cc6f43d2d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:24:59 -0700 Subject: [PATCH 12/69] Logger statement --- browser_use/browser/session.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 3af32249c..5452247d1 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -853,6 +853,8 @@ class BrowserSession(BaseModel): cdp_url=self.cdp_url if should_use_new_socket else None, ) self._cdp_session_pool[target_id] = session + # log length of _cdp_session_pool + self.logger.debug(f'[get_or_create_cdp_session] new _cdp_session_pool length: {len(self._cdp_session_pool)}') # Only change agent focus if requested if focus: From d6ea63e431ab339bf745f805496a2554a59fb713 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:25:52 -0700 Subject: [PATCH 13/69] reverted docs to API V1 --- docs/docs.json | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/docs/docs.json b/docs/docs.json index f6fb21e77..e7a146b4e 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -129,31 +129,6 @@ { "tab": "Cloud", "versions": [ - { - "version": "v2", - "groups": [ - { - "group": "Get Started", - "pages": [ - "cloud/v2/quickstart", - "cloud/v2/python-quickstart", - "cloud/v2/node-quickstart" - ] - }, - { - "group": "Platform", - "pages": [ - "cloud/v1/pricing", - "cloud/v1/n8n-browser-use-integration", - "cloud/v1/search" - ] - }, - { - "group": "REST API reference", - "openapi": "https://app.stainless.com/api/spec/documented/browser-use/openapi.documented.yml" - } - ] - }, { "version": "v1", "groups": [ @@ -180,6 +155,27 @@ "openapi": "https://api.browser-use.com/api/v1/openapi.json" } ] + }, + { + "version": "v2", + "groups": [ + { + "group": "Get Started", + "pages": [ + "cloud/v2/quickstart", + "cloud/v2/python-quickstart", + "cloud/v2/node-quickstart" + ] + }, + { + "group": "Platform", + "pages": [ + "cloud/v1/pricing", + "cloud/v1/n8n-browser-use-integration", + "cloud/v1/search" + ] + } + ] } ] } From 9915ea41a9d151e25cd7e71ff8ca8e80e6879340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 11:02:50 -0700 Subject: [PATCH 14/69] =?UTF-8?q?Remove=20=F0=9F=93=8D=20from=20logs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- browser_use/agent/service.py | 2 +- browser_use/browser/watchdogs/default_action_watchdog.py | 2 +- browser_use/browser/watchdogs/dom_watchdog.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 3dc3329f0..473361605 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -1160,7 +1160,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): unique_urls = list(set(found_urls)) # If multiple URLs found, skip directly_open_urling if len(unique_urls) > 1: - self.logger.debug(f'📍 Multiple URLs found ({len(found_urls)}), skipping directly_open_url to avoid ambiguity') + self.logger.debug(f'Multiple URLs found ({len(found_urls)}), skipping directly_open_url to avoid ambiguity') return None # If exactly one URL found, return it diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index 476b629a4..3297cbc52 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -708,7 +708,7 @@ class DefaultActionWatchdog(BaseWatchdog): center_x = bounds['x'] + bounds['width'] / 2 center_y = bounds['y'] + bounds['height'] / 2 input_coordinates = {'input_x': center_x, 'input_y': center_y} - self.logger.debug(f'📍 Input coordinates: x={center_x:.1f}, y={center_y:.1f}') + self.logger.debug(f'Input coordinates: x={center_x:.1f}, y={center_y:.1f}') # Provide helpful warnings for common issues if not element_info.get('visible', False): diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index 372d1d414..077ae36d4 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -164,10 +164,10 @@ class DOMWatchdog(BaseWatchdog): self.logger.debug(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Got page URL: {page_url}') if self.browser_session.agent_focus: self.logger.debug( - f'📍 Current page URL: {page_url}, target_id: {self.browser_session.agent_focus.target_id}, session_id: {self.browser_session.agent_focus.session_id}' + f'Current page URL: {page_url}, target_id: {self.browser_session.agent_focus.target_id}, session_id: {self.browser_session.agent_focus.session_id}' ) else: - self.logger.debug(f'📍 Current page URL: {page_url}, no cdp_session attached') + self.logger.debug(f'Current page URL: {page_url}, no cdp_session attached') # check if we should skip DOM tree build for pointless pages not_a_meaningful_website = page_url.lower().split(':', 1)[0] not in ('http', 'https') From 05a98e98b32dcdb88f682e2f1d7177ba060f7d02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 11:25:13 -0700 Subject: [PATCH 15/69] Improve memory msg when using ctrl --- browser_use/tools/service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 20948f908..9ecfb65be 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -285,13 +285,13 @@ class Tools(Generic[Context]): # Wait for handler to complete and get any exception or metadata click_metadata = await event.event_result(raise_if_any=True, raise_if_none=False) memory = f'Clicked element with index {params.index}' + if params.while_holding_ctrl: + memory += ' and opened in new tab' msg = f'🖱️ {memory}' logger.info(msg) # Include click coordinates in metadata if available return ActionResult( - extracted_content=memory, - include_in_memory=True, long_term_memory=memory, metadata=click_metadata if isinstance(click_metadata, dict) else None, ) From 9dd5b27c1f4b9b4949799e8fbbd6e96deeaf712b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 11:25:25 -0700 Subject: [PATCH 16/69] Logs --- browser_use/browser/watchdogs/downloads_watchdog.py | 1 - browser_use/dom/debug/highlights.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/browser_use/browser/watchdogs/downloads_watchdog.py b/browser_use/browser/watchdogs/downloads_watchdog.py index 5004ee097..eeec2555d 100644 --- a/browser_use/browser/watchdogs/downloads_watchdog.py +++ b/browser_use/browser/watchdogs/downloads_watchdog.py @@ -111,7 +111,6 @@ class DownloadsWatchdog(BaseWatchdog): # Check if auto-download is enabled auto_download_enabled = self._is_auto_download_enabled() if not auto_download_enabled: - self.logger.debug('[DownloadsWatchdog] Skipping PDF check - auto-download disabled') return # Note: Using network-based PDF detection that doesn't require JavaScript diff --git a/browser_use/dom/debug/highlights.py b/browser_use/dom/debug/highlights.py index b712d0596..cf8b3d2f1 100644 --- a/browser_use/dom/debug/highlights.py +++ b/browser_use/dom/debug/highlights.py @@ -100,7 +100,7 @@ async def inject_highlighting_script(dom_service: DomService, interactive_elemen # Convert DOMSelectorMap to the format expected by the JavaScript converted_elements = convert_dom_selector_map_to_highlight_format(interactive_elements) - logger.debug(f'📍 Creating CSP-safe highlighting for {len(converted_elements)} elements') + logger.debug(f'Creating CSP-safe highlighting for {len(converted_elements)} elements') # ALWAYS remove any existing highlights first to prevent double-highlighting await remove_highlighting_script(dom_service) From c66936193cc93f8948defe7f1a70f57768285c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 11:29:31 -0700 Subject: [PATCH 17/69] Remove include_in_memory parameter --- browser_use/tools/service.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 9ecfb65be..bd9e73b09 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -177,7 +177,7 @@ class Tools(Generic[Context]): memory = f"Searched Google for '{params.query}'" msg = f'🔍 {memory}' logger.info(msg) - return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory) + return ActionResult(extracted_content=memory, long_term_memory=memory) except Exception as e: logger.error(f'Failed to search Google: {e}') clean_msg = extract_llm_error_message(e) @@ -201,7 +201,7 @@ class Tools(Generic[Context]): msg = f'🔗 {memory}' logger.info(msg) - return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=memory) + return ActionResult(extracted_content=msg, long_term_memory=memory) except Exception as e: error_msg = str(e) # Always log the actual error first for debugging @@ -336,7 +336,6 @@ class Tools(Generic[Context]): # Include input coordinates in metadata if available return ActionResult( extracted_content=msg, - include_in_memory=True, long_term_memory=f"Input '{params.text}' into element {params.index}.", metadata=input_metadata if isinstance(input_metadata, dict) else None, ) @@ -486,7 +485,6 @@ class Tools(Generic[Context]): logger.info(f'📁 {msg}') return ActionResult( extracted_content=msg, - include_in_memory=True, long_term_memory=f'Uploaded file {params.path} to element {params.index}', ) except Exception as e: @@ -512,7 +510,7 @@ class Tools(Generic[Context]): assert new_target_id, 'SwitchTabEvent did not return a TargetID for the new tab that was switched to' memory = f'Switched to Tab with ID {new_target_id[-4:]}' logger.info(f'🔄 {memory}') - return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory) + return ActionResult(extracted_content=memory, long_term_memory=memory) except Exception as e: logger.error(f'Failed to switch tab: {type(e).__name__}: {e}') clean_msg = extract_llm_error_message(e) @@ -535,7 +533,6 @@ class Tools(Generic[Context]): logger.info(f'🗑️ {memory}') return ActionResult( extracted_content=memory, - include_in_memory=True, long_term_memory=memory, ) except Exception as e: @@ -697,7 +694,7 @@ Provide the extracted information in a clear, structured format.""" msg = f'🔍 {long_term_memory}' logger.info(msg) - return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=long_term_memory) + return ActionResult(extracted_content=msg, long_term_memory=long_term_memory) except Exception as e: logger.error(f'Failed to dispatch ScrollEvent: {type(e).__name__}: {e}') clean_msg = extract_llm_error_message(e) @@ -717,7 +714,7 @@ Provide the extracted information in a clear, structured format.""" memory = f'Sent keys: {params.keys}' msg = f'⌨️ {memory}' logger.info(msg) - return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory) + return ActionResult(extracted_content=memory, long_term_memory=memory) except Exception as e: logger.error(f'Failed to dispatch SendKeysEvent: {type(e).__name__}: {e}') clean_msg = extract_llm_error_message(e) @@ -737,14 +734,13 @@ Provide the extracted information in a clear, structured format.""" memory = f'Scrolled to text: {text}' msg = f'🔍 {memory}' logger.info(msg) - return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory) + return ActionResult(extracted_content=memory, long_term_memory=memory) except Exception as e: # Text not found msg = f"Text '{text}' not found or not visible on page" logger.info(msg) return ActionResult( extracted_content=msg, - include_in_memory=True, long_term_memory=f"Tried scrolling to text '{text}' but it was not found", ) @@ -776,7 +772,6 @@ Provide the extracted information in a clear, structured format.""" return ActionResult( extracted_content=msg, - include_in_memory=True, long_term_memory=f'Found {options_count} dropdown options for index {params.index}', include_extracted_content_only_once=True, ) @@ -806,7 +801,6 @@ Provide the extracted information in a clear, structured format.""" return ActionResult( extracted_content=msg, - include_in_memory=True, long_term_memory=f"Selected dropdown option '{params.text}' at index {params.index}", ) @@ -831,7 +825,7 @@ Provide the extracted information in a clear, structured format.""" else: result = await file_system.write_file(file_name, content) logger.info(f'💾 {result}') - return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result) + return ActionResult(extracted_content=result, long_term_memory=result) @self.registry.action( 'Replace old_str with new_str in file_name. old_str must exactly match the string to replace in original text. Recommended tool to mark completed items in todo.md or change specific contents in a file.' @@ -839,7 +833,7 @@ Provide the extracted information in a clear, structured format.""" async def replace_file_str(file_name: str, old_str: str, new_str: str, file_system: FileSystem): result = await file_system.replace_file_str(file_name, old_str, new_str) logger.info(f'💾 {result}') - return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result) + return ActionResult(extracted_content=result, long_term_memory=result) @self.registry.action('Read file_name from file system') async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem): @@ -866,7 +860,6 @@ Provide the extracted information in a clear, structured format.""" logger.info(f'💾 {memory}') return ActionResult( extracted_content=result, - include_in_memory=True, long_term_memory=memory, include_extracted_content_only_once=True, ) From 4c10628ff2549d97234cd8e46645ac69cf41fc22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 11:35:02 -0700 Subject: [PATCH 18/69] Dont open new tab, if we are already inside a new tab --- browser_use/browser/session.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 5452247d1..e8c24acad 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -536,6 +536,18 @@ class BrowserSession(BaseModel): target_id = None + # If new_tab=True but we're already in a new tab, set new_tab=False + if event.new_tab: + try: + current_url = await self.get_current_page_url() + from browser_use.utils import is_new_tab_page + + if is_new_tab_page(current_url): + self.logger.debug(f'[on_NavigateToUrlEvent] Already in new tab ({current_url}), setting new_tab=False') + event.new_tab = False + except Exception as e: + self.logger.debug(f'[on_NavigateToUrlEvent] Could not check current URL: {e}') + # check if the url is already open in a tab somewhere that we're not currently on, if so, short-circuit and just switch to it targets = await self._cdp_get_all_pages() for target in targets: From 34a69a0acb950a47ce0fa7e9e54f6f5b64ad938e Mon Sep 17 00:00:00 2001 From: Saurav Panda Date: Fri, 29 Aug 2025 11:58:51 -0700 Subject: [PATCH 19/69] fix: add highlighting condition check for disable_security browser profile setting --- browser_use/browser/watchdogs/dom_watchdog.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index bafa37ada..e39365005 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -451,8 +451,8 @@ class DOMWatchdog(BaseWatchdog): self.browser_session.update_cached_selector_map(self.selector_map) self.logger.debug(f'🔍 DOMWatchdog._build_dom_tree: ✅ Selector maps updated, {len(self.selector_map)} elements') - # Inject highlighting for visual feedback if we have elements - if self.selector_map and self._dom_service: + # Inject highlighting for visual feedback if we have elements and highlighting is enabled + if self.selector_map and self._dom_service and self.browser_session.browser_profile.highlight_elements: try: self.logger.debug('🔍 DOMWatchdog._build_dom_tree: Injecting highlighting script...') from browser_use.dom.debug.highlights import inject_highlighting_script @@ -463,6 +463,8 @@ class DOMWatchdog(BaseWatchdog): ) except Exception as e: self.logger.debug(f'🔍 DOMWatchdog._build_dom_tree: Failed to inject highlighting: {e}') + elif self.selector_map and self._dom_service and not self.browser_session.browser_profile.highlight_elements: + self.logger.debug('🔍 DOMWatchdog._build_dom_tree: Skipping highlighting injection - highlight_elements=False') self.logger.debug('🔍 DOMWatchdog._build_dom_tree: ✅ COMPLETED DOM tree build') return self.current_dom_state From 01da257d0471fcfa436206a92c2a201847fcab71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 12:01:11 -0700 Subject: [PATCH 20/69] Simplify initial_actions & url preloading --- browser_use/agent/service.py | 46 +++++++++--------------------------- 1 file changed, 11 insertions(+), 35 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 473361605..31a6f53bb 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -270,8 +270,17 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Action setup self._setup_action_models() self._set_browser_use_version_and_source(source) - self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None + initial_url = None + + # only load url if no initial actions are provided + if self.directly_open_url and not self.state.follow_up_task and not initial_actions: + initial_url = self._extract_url_from_task(self.task) + if initial_url: + self.logger.info(f'🔗 Found URL in task: {initial_url}, adding as initial action...') + initial_actions = [{'go_to_url': {'url': initial_url, 'new_tab': False}}] + + self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None # Verify we can connect to the model self._verify_and_setup_llm() @@ -1239,39 +1248,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.debug('🔧 Browser session started with watchdogs attached') - # Check if task contains a URL and add it as an initial action (only if directly_open_url is enabled) - if self.directly_open_url and not self.state.follow_up_task: - initial_url = self._extract_url_from_task(self.task) - if initial_url: - self.logger.info(f'🔗 Found URL in task: {initial_url}, adding as initial action...') - - # Create a go_to_url action for the initial URL - go_to_url_action = { - 'go_to_url': { - 'url': initial_url, - 'new_tab': False, # Navigate in current tab - } - } - - # Add to initial_actions or create new list if none exist - if self.initial_actions: - # Convert back to dict format, prepend URL navigation, then convert back - initial_actions_dicts = [] - for action in self.initial_actions: - action_data = action.model_dump(exclude_unset=True) - initial_actions_dicts.append(action_data) - - # Prepend the go_to_url action - initial_actions_dicts = [go_to_url_action] + initial_actions_dicts - - # Convert back to ActionModel instances - self.initial_actions = self._convert_initial_actions(initial_actions_dicts) - else: - # Create new initial_actions with just the go_to_url - self.initial_actions = self._convert_initial_actions([go_to_url_action]) - - self.logger.debug(f'✅ Added navigation to {initial_url} as initial action') - # Ensure browser focus is properly established before executing initial actions if self.browser_session and self.browser_session.agent_focus: self.logger.debug(f'🎯 Browser focus established on target: {self.browser_session.agent_focus.target_id[-4:]}') @@ -1283,7 +1259,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.debug(f'⚡ Executing {len(self.initial_actions)} initial actions...') result = await self.multi_act(self.initial_actions, check_for_new_elements=False) self.state.last_result = result - self.logger.debug('✅ Initial actions completed') + self.logger.debug('Initial actions completed') self.logger.debug(f'🔄 Starting main execution loop with max {max_steps} steps...') for step in range(max_steps): From 84cdd347437c43d5d274c4c8981d5c7fd006a601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 12:33:51 -0700 Subject: [PATCH 21/69] Return the agent the result from init actions --- browser_use/agent/message_manager/service.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index ffe426ac6..99e48aff8 100644 --- a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -212,10 +212,16 @@ class MessageManager: # Build the history item if model_output is None: - # Only add error history item if we have a valid step number - if step_number is not None and step_number > 0: - history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.') - self.state.agent_history_items.append(history_item) + # Add history item for initial actions (step 0) or errors (step > 0) + if step_number is not None: + if step_number == 0 and action_results: + # Step 0 with initial action results + history_item = HistoryItem(step_number=step_number, action_results=action_results) + self.state.agent_history_items.append(history_item) + elif step_number > 0: + # Error case for steps > 0 + history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.') + self.state.agent_history_items.append(history_item) else: history_item = HistoryItem( step_number=step_number, From fbba5ba035e62d77a69106a3b7fb909e672fb23d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 12:34:04 -0700 Subject: [PATCH 22/69] Dont cache the first step --- browser_use/agent/service.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 31a6f53bb..ec4038f15 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -644,14 +644,10 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.debug(f'🌐 Step {self.state.n_steps}: Getting browser state...') # Always take screenshots for all steps - # Use caching based on directly_open_url setting - if directly_open_url is False, don't use cached state - is_first_step = self.state.n_steps in (0, 1) - use_cache = is_first_step and self.directly_open_url - self.logger.debug(f'📸 Requesting browser state with include_screenshot=True, cached={use_cache}') + self.logger.debug('📸 Requesting browser state with include_screenshot=True') browser_state_summary = await self.browser_session.get_browser_state_summary( cache_clickable_elements_hashes=True, include_screenshot=True, # always capture even if use_vision=False so that cloud sync is useful (it's fast now anyway) - cached=use_cache, include_recent_events=self.include_recent_events, ) if browser_state_summary.screenshot: From c70abf6cf6a92d5636c6e22b6a2d8cda77d58f00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 12:52:19 -0700 Subject: [PATCH 23/69] update result 1 to mention that its was automatically loaded --- browser_use/agent/service.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index ec4038f15..144d946a7 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -280,6 +280,8 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.info(f'🔗 Found URL in task: {initial_url}, adding as initial action...') initial_actions = [{'go_to_url': {'url': initial_url, 'new_tab': False}}] + self.initial_url = initial_url + self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None # Verify we can connect to the model self._verify_and_setup_llm() @@ -1254,6 +1256,9 @@ class Agent(Generic[Context, AgentStructuredOutput]): if self.initial_actions and not self.state.follow_up_task: self.logger.debug(f'⚡ Executing {len(self.initial_actions)} initial actions...') result = await self.multi_act(self.initial_actions, check_for_new_elements=False) + # update result 1 to mention that its was automatically loaded + if result and self.initial_url and result[0].long_term_memory: + result[0].long_term_memory = f'Found initial url and automatically loaded it. {result[0].long_term_memory}' self.state.last_result = result self.logger.debug('Initial actions completed') From 35fe73cba1b3ab476d8e5e48452d5e4341f87fc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 13:09:51 -0700 Subject: [PATCH 24/69] Only switch tab based on target it --- browser_use/tools/service.py | 9 ++------- browser_use/tools/views.py | 9 ++------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index bd9e73b09..492c8c2c5 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -497,12 +497,7 @@ class Tools(Generic[Context]): async def switch_tab(params: SwitchTabAction, browser_session: BrowserSession): # Dispatch switch tab event try: - if params.tab_id: - target_id = await browser_session.get_target_id_from_tab_id(params.tab_id) - elif params.url: - target_id = await browser_session.get_target_id_from_url(params.url) - else: - target_id = await browser_session.get_most_recently_opened_target_id() + target_id = await browser_session.get_target_id_from_tab_id(params.tab_id) event = browser_session.event_bus.dispatch(SwitchTabEvent(target_id=target_id)) await event @@ -514,7 +509,7 @@ class Tools(Generic[Context]): except Exception as e: logger.error(f'Failed to switch tab: {type(e).__name__}: {e}') clean_msg = extract_llm_error_message(e) - return ActionResult(error=f'Failed to switch to tab {params.tab_id or params.url}: {clean_msg}') + return ActionResult(error=f'Failed to switch to tab {params.tab_id}: {clean_msg}') @self.registry.action('Close an existing tab', param_model=CloseTabAction) async def close_tab(params: CloseTabAction, browser_session: BrowserSession): diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py index a4d7a678b..c25c47c2a 100644 --- a/browser_use/tools/views.py +++ b/browser_use/tools/views.py @@ -43,15 +43,10 @@ class StructuredOutputAction(BaseModel, Generic[T]): class SwitchTabAction(BaseModel): - url: str | None = Field( - default=None, - description='URL or URL substring of the tab to switch to, if not provided, the tab_id or most recently opened tab will be used', - ) - tab_id: str | None = Field( - default=None, + tab_id: str = Field( min_length=4, max_length=4, - description='exact 4 character Tab ID to match instead of URL, prefer using this if known', + description='Last 4 chars of TargetID', ) # last 4 chars of TargetID From babe510c30bd4d574fdb3022c9f0fb22a85f915f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 13:41:25 -0700 Subject: [PATCH 25/69] When closing tabs, first check if we need to open a new one --- browser_use/browser/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index e8c24acad..527992530 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -698,8 +698,8 @@ class BrowserSession(BaseModel): """Handle tab closure - update focus if needed.""" cdp_session = await self.get_or_create_cdp_session(target_id=None, focus=False) + event = await self.event_bus.dispatch(TabClosedEvent(target_id=event.target_id)) await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': event.target_id}) - await self.event_bus.dispatch(TabClosedEvent(target_id=event.target_id)) async def on_TabClosedEvent(self, event: TabClosedEvent) -> None: """Handle tab closure - update focus if needed.""" From 9246ca1621d561c6199c4b577a8ab1d3279e9116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 14:39:12 -0700 Subject: [PATCH 26/69] Remove init scrpt because it was not used --- browser_use/browser/watchdogs/dom_watchdog.py | 65 +------------------ 1 file changed, 1 insertion(+), 64 deletions(-) diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index 077ae36d4..70d0420dc 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -42,70 +42,7 @@ class DOMWatchdog(BaseWatchdog): async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None: # self.logger.debug('Setting up init scripts in browser') - - self.logger.debug('💉 Injecting DOM Service init script to track event listeners added to DOM elements by JS...') - - init_script = """ - // check to make sure we're not inside the PDF viewer - window.isPdfViewer = !!document?.body?.querySelector('body > embed[type="application/pdf"][width="100%"]') - if (!window.isPdfViewer) { - - // Permissions - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - (() => { - if (window._eventListenerTrackerInitialized) return; - window._eventListenerTrackerInitialized = true; - - const originalAddEventListener = EventTarget.prototype.addEventListener; - const eventListenersMap = new WeakMap(); - - EventTarget.prototype.addEventListener = function(type, listener, options) { - if (typeof listener === "function") { - let listeners = eventListenersMap.get(this); - if (!listeners) { - listeners = []; - eventListenersMap.set(this, listeners); - } - - listeners.push({ - type, - listener, - listenerPreview: listener.toString().slice(0, 100), - options - }); - } - - return originalAddEventListener.call(this, type, listener, options); - }; - - window.getEventListenersForNode = (node) => { - const listeners = eventListenersMap.get(node) || []; - return listeners.map(({ type, listenerPreview, options }) => ({ - type, - listenerPreview, - options - })); - }; - })(); - } - """ - - # Try to inject the script, but don't fail if the Page domain isn't ready yet - # This can happen when a new tab is created and the CDP session isn't fully attached - try: - await self.browser_session._cdp_add_init_script(init_script) - except Exception as e: - if "'Page.addScriptToEvaluateOnNewDocument' wasn't found" in str(e): - self.logger.debug(f'Page domain not ready for new tab, skipping init script injection: {e}') - # The script will be injected when the page actually navigates - else: - # Re-raise other errors - raise + return None def _get_recent_events_str(self, limit: int = 10) -> str | None: """Get the most recent events from the event bus as JSON. From e1f95ca574e548ac7e35b289195db4bc5e8f05ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 14:52:43 -0700 Subject: [PATCH 27/69] create _execute_initial_actions --- browser_use/agent/service.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 144d946a7..b2007c552 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -1252,15 +1252,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): else: self.logger.warning('⚠️ No browser focus established, may cause navigation issues') - # Execute initial actions if provided - if self.initial_actions and not self.state.follow_up_task: - self.logger.debug(f'⚡ Executing {len(self.initial_actions)} initial actions...') - result = await self.multi_act(self.initial_actions, check_for_new_elements=False) - # update result 1 to mention that its was automatically loaded - if result and self.initial_url and result[0].long_term_memory: - result[0].long_term_memory = f'Found initial url and automatically loaded it. {result[0].long_term_memory}' - self.state.last_result = result - self.logger.debug('Initial actions completed') + await self._execute_initial_actions() self.logger.debug(f'🔄 Starting main execution loop with max {max_steps} steps...') for step in range(max_steps): @@ -1636,6 +1628,17 @@ class Agent(Generic[Context, AgentStructuredOutput]): return results + async def _execute_initial_actions(self) -> None: + # Execute initial actions if provided + if self.initial_actions and not self.state.follow_up_task: + self.logger.debug(f'⚡ Executing {len(self.initial_actions)} initial actions...') + result = await self.multi_act(self.initial_actions, check_for_new_elements=False) + # update result 1 to mention that its was automatically loaded + if result and self.initial_url and result[0].long_term_memory: + result[0].long_term_memory = f'Found initial url and automatically loaded it. {result[0].long_term_memory}' + self.state.last_result = result + self.logger.debug('Initial actions completed') + async def _execute_history_step(self, history_item: AgentHistory, delay: float) -> list[ActionResult]: """Execute a single step from history with element validation""" assert self.browser_session is not None, 'BrowserSession is not set up' From 8be1a17774b48742e5ed69f05c4e52fd42c82b33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 14:57:58 -0700 Subject: [PATCH 28/69] Fix dont reasign event in on_CloseTabEvent --- browser_use/browser/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 527992530..4f83bfa92 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -698,7 +698,7 @@ class BrowserSession(BaseModel): """Handle tab closure - update focus if needed.""" cdp_session = await self.get_or_create_cdp_session(target_id=None, focus=False) - event = await self.event_bus.dispatch(TabClosedEvent(target_id=event.target_id)) + await self.event_bus.dispatch(TabClosedEvent(target_id=event.target_id)) await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': event.target_id}) async def on_TabClosedEvent(self, event: TabClosedEvent) -> None: From e72d305a244a8d888690718602bd4cb12146b294 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 15:02:19 -0700 Subject: [PATCH 29/69] Fix test --- tests/ci/test_browser_event_ClickElementEvent.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/test_browser_event_ClickElementEvent.py b/tests/ci/test_browser_event_ClickElementEvent.py index 97553e580..435c0ac7b 100644 --- a/tests/ci/test_browser_event_ClickElementEvent.py +++ b/tests/ci/test_browser_event_ClickElementEvent.py @@ -185,11 +185,11 @@ class TestClickElementEvent: # Verify the result structure assert isinstance(result, ActionResult), 'Result should be an ActionResult instance' assert result.error is None, f'Expected no error but got: {result.error}' - + result_text = result.extracted_content or result.long_term_memory # Core logic validation: Verify click was successful - assert result.extracted_content is not None - assert f'Clicked element with index {button_index}' in result.extracted_content, ( - f'Expected click confirmation in result content, got: {result.extracted_content}' + assert result_text is not None + assert f'Clicked element with index {button_index}' in result_text, ( + f'Expected click confirmation in result content, got: {result_text}' ) # Note: The click action doesn't include button text in the result, only the index From 7eb30f68426b0c21e998fe72e456afe6aaff8de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 15:05:10 -0700 Subject: [PATCH 30/69] Update browser_use/browser/watchdogs/downloads_watchdog.py Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com> --- browser_use/browser/watchdogs/downloads_watchdog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/browser/watchdogs/downloads_watchdog.py b/browser_use/browser/watchdogs/downloads_watchdog.py index eeec2555d..a5a5ba53c 100644 --- a/browser_use/browser/watchdogs/downloads_watchdog.py +++ b/browser_use/browser/watchdogs/downloads_watchdog.py @@ -654,7 +654,7 @@ class DownloadsWatchdog(BaseWatchdog): return False async def _check_network_headers_for_pdf(self, target_id: TargetID) -> bool: - """Check network response headers for PDF content-type.""" + """Infer PDF via navigation history/URL; headers are not available post-navigation in this context.""" try: import asyncio From bb3055e7bb53e065b809ba1e30c42ea55ca06d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 16:51:04 -0700 Subject: [PATCH 31/69] Emulate real human typing --- .../watchdogs/default_action_watchdog.py | 90 +++++++++++++++---- 1 file changed, 75 insertions(+), 15 deletions(-) diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index 3297cbc52..836ef2456 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -583,6 +583,57 @@ class DefaultActionWatchdog(BaseWatchdog): except Exception as e: raise Exception(f'Failed to type to page: {str(e)}') + def _get_key_code_for_char(self, char: str) -> str: + """Get the proper key code for a character (like Playwright does).""" + # Key code mapping for common characters + key_codes = { + ' ': 'Space', + '.': 'Period', + ',': 'Comma', + '-': 'Minus', + '_': 'Underscore', + '@': 'At', + '!': 'Exclamation', + '?': 'Question', + ':': 'Colon', + ';': 'Semicolon', + '(': 'ParenLeft', + ')': 'ParenRight', + '[': 'BracketLeft', + ']': 'BracketRight', + '{': 'BraceLeft', + '}': 'BraceRight', + '/': 'Slash', + '\\': 'Backslash', + '=': 'Equal', + '+': 'Plus', + '*': 'Asterisk', + '&': 'Ampersand', + '%': 'Percent', + '$': 'Dollar', + '#': 'Hash', + '^': 'Caret', + '~': 'Tilde', + '`': 'Backquote', + "'": 'Quote', + '"': 'DoubleQuote', + } + + # Numbers + if char.isdigit(): + return f'Digit{char}' + + # Letters + if char.isalpha(): + return f'Key{char.upper()}' + + # Special characters + if char in key_codes: + return key_codes[char] + + # Fallback for unknown characters + return f'Key{char.upper()}' + async def _check_element_focusability(self, element_node, object_id: str, session_id: str) -> dict[str, Any]: """ Check if an element is likely to be focusable and visible. @@ -733,7 +784,7 @@ class DefaultActionWatchdog(BaseWatchdog): # Strategy 1: Try CDP DOM.focus (original method) try: - await cdp_session.cdp_client.send.DOM.focus( + result = await cdp_session.cdp_client.send.DOM.focus( params={'backendNodeId': backend_node_id}, session_id=cdp_session.session_id, ) @@ -808,35 +859,44 @@ class DefaultActionWatchdog(BaseWatchdog): if not focused_successfully: self.logger.warning('⚠️ All focus strategies failed, typing without explicit focus') - # Type the text character by character - for char in text: - # Send keydown (without text to avoid duplication) + # Type the text character by character using proper human-like key events + # This emulates exactly how a human would type, which modern websites expect + self.logger.debug(f'🎯 Typing text character by character: "{text}"') + + for i, char in enumerate(text): + # Get proper key code for the character + key_code = self._get_key_code_for_char(char) + + # self.logger.debug(f'🎯 Typing character {i + 1}/{len(text)}: "{char}" (code: {key_code})') + + # Send keyDown event (this is what humans do when pressing a key) await cdp_session.cdp_client.send.Input.dispatchKeyEvent( params={ 'type': 'keyDown', - 'key': char, - }, - session_id=cdp_session.session_id, - ) - # Send char (for actual text input) - await cdp_session.cdp_client.send.Input.dispatchKeyEvent( - params={ - 'type': 'char', 'text': char, 'key': char, + 'code': key_code, + 'windowsVirtualKeyCode': ord(char.upper()) if char.isalpha() else ord(char), }, session_id=cdp_session.session_id, ) - # Send keyup (without text to avoid duplication) + + # Small delay to emulate human typing speed + await asyncio.sleep(0.001) # 50ms between key down and key up + + # Send keyUp event (this is what humans do when releasing a key) await cdp_session.cdp_client.send.Input.dispatchKeyEvent( params={ 'type': 'keyUp', 'key': char, + 'code': key_code, + 'windowsVirtualKeyCode': ord(char.upper()) if char.isalpha() else ord(char), }, session_id=cdp_session.session_id, ) - # Small delay between characters - await asyncio.sleep(0.01) + + # Small delay between characters to look human (realistic typing speed) + await asyncio.sleep(0.001) # 80ms between characters = ~150 WPM typing speed # Return coordinates metadata if available return input_coordinates From d80549038b257a8a2eb13923ccb4d1f4975fea1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:01:34 -0700 Subject: [PATCH 32/69] Enhance element interaction handling - Added logging for new elements detected during actions in the Agent class. - Implemented a human-like text field clearing method in DefaultActionWatchdog, utilizing Ctrl+A and Backspace. - Improved focus handling for label elements, ensuring they are only interactive if they do not have a 'for' attribute. - Updated clickable element detection logic to account for labels pointing to inputs. These changes improve the robustness of user interactions and enhance debugging capabilities. --- browser_use/agent/service.py | 2 + .../watchdogs/default_action_watchdog.py | 349 ++++++++++-------- .../dom/serializer/clickable_elements.py | 11 +- 3 files changed, 199 insertions(+), 163 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index b2007c552..c75663c58 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -1494,6 +1494,8 @@ class Agent(Generic[Context, AgentStructuredOutput]): new_element_hashes = {e.parent_branch_hash() for e in new_selector_map.values()} if check_for_new_elements and not new_element_hashes.issubset(cached_element_hashes): # next action requires index but there are new elements on the page + # log difference in len debug + self.logger.debug(f'New elements: {abs(len(new_element_hashes) - len(cached_element_hashes))}') remaining_actions_str = get_remaining_actions_str(actions, i) msg = f'Something new appeared after action {i} / {total_actions}: actions {remaining_actions_str} were not executed' logger.info(msg) diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index 836ef2456..132ecb3ea 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -3,7 +3,6 @@ import asyncio import json import platform -from typing import Any from browser_use.browser.events import ( ClickElementEvent, @@ -634,78 +633,139 @@ class DefaultActionWatchdog(BaseWatchdog): # Fallback for unknown characters return f'Key{char.upper()}' - async def _check_element_focusability(self, element_node, object_id: str, session_id: str) -> dict[str, Any]: - """ - Check if an element is likely to be focusable and visible. - - Returns: - Dict with keys: 'visible', 'focusable', 'interactive', 'disabled' - """ + async def _clear_text_field(self, object_id: str, cdp_session) -> None: + """Clear text field using human-like Ctrl+A + Backspace approach.""" try: - cdp_client = self.browser_session.cdp_client + self.logger.debug('🧹 Clearing text field using Ctrl+A + Backspace') - # Run comprehensive element checks via JavaScript - check_result = await cdp_client.send.Runtime.callFunctionOn( + # Select all text (Ctrl+A) + await cdp_session.cdp_client.send.Input.dispatchKeyEvent( params={ - 'functionDeclaration': """ - function() { - const element = this; - const computedStyle = window.getComputedStyle(element); - const rect = element.getBoundingClientRect(); - - // Check basic visibility - const isVisible = rect.width > 0 && rect.height > 0 && - computedStyle.visibility !== 'hidden' && - computedStyle.display !== 'none' && - computedStyle.opacity !== '0'; - - // Check if element is disabled - const isDisabled = element.disabled || element.hasAttribute('disabled') || - element.getAttribute('aria-disabled') === 'true'; - - // Check if element is focusable by tag and attributes - const focusableTags = ['input', 'textarea', 'select', 'button', 'a']; - const hasFocusableTag = focusableTags.includes(element.tagName.toLowerCase()); - const hasTabIndex = element.hasAttribute('tabindex') && element.tabIndex >= 0; - const isContentEditable = element.contentEditable === 'true'; - - const isFocusable = !isDisabled && (hasFocusableTag || hasTabIndex || isContentEditable); - - // Check if element is interactive (clickable/editable) - const isInteractive = isFocusable || element.onclick !== null || - element.getAttribute('role') === 'button' || - element.classList.contains('clickable'); - - return { - visible: isVisible, - focusable: isFocusable, - interactive: isInteractive, - disabled: isDisabled, - bounds: { - x: rect.left, - y: rect.top, - width: rect.width, - height: rect.height - }, - tagName: element.tagName.toLowerCase(), - type: element.type || null - }; - } - """, - 'objectId': object_id, - 'returnByValue': True, + 'type': 'keyDown', + 'key': 'a', + 'code': 'KeyA', + 'modifiers': 2, # Ctrl modifier + 'windowsVirtualKeyCode': 65, }, - session_id=session_id, + session_id=cdp_session.session_id, + ) + await cdp_session.cdp_client.send.Input.dispatchKeyEvent( + params={ + 'type': 'keyUp', + 'key': 'a', + 'code': 'KeyA', + 'modifiers': 2, # Ctrl modifier + 'windowsVirtualKeyCode': 65, + }, + session_id=cdp_session.session_id, ) - if 'result' in check_result and 'value' in check_result['result']: - return check_result['result']['value'] - else: - self.logger.debug('Element focusability check returned no results') - return {'visible': False, 'focusable': False, 'interactive': False, 'disabled': True} + # Small delay + await asyncio.sleep(0.01) + + # Delete selected text (Backspace) + await cdp_session.cdp_client.send.Input.dispatchKeyEvent( + params={ + 'type': 'keyDown', + 'key': 'Backspace', + 'code': 'Backspace', + 'windowsVirtualKeyCode': 8, + }, + session_id=cdp_session.session_id, + ) + await cdp_session.cdp_client.send.Input.dispatchKeyEvent( + params={ + 'type': 'keyUp', + 'key': 'Backspace', + 'code': 'Backspace', + 'windowsVirtualKeyCode': 8, + }, + session_id=cdp_session.session_id, + ) + + self.logger.debug('✅ Text field cleared successfully') + except Exception as e: - self.logger.debug(f'Element focusability check failed: {e}') - return {'visible': False, 'focusable': False, 'interactive': False, 'disabled': True} + self.logger.debug(f'Failed to clear text field: {e}') + # Try JavaScript fallback + try: + await cdp_session.cdp_client.send.Runtime.callFunctionOn( + params={ + 'functionDeclaration': 'function() { if (this.value !== undefined) this.value = ""; }', + 'objectId': object_id, + }, + session_id=cdp_session.session_id, + ) + self.logger.debug('✅ Text field cleared using JavaScript fallback') + except Exception as js_e: + self.logger.debug(f'JavaScript clear also failed: {js_e}') + + async def _focus_element_simple( + self, backend_node_id: int, object_id: str, cdp_session, input_coordinates: dict | None = None + ) -> bool: + """Simple focus strategy: CDP first, then click if failed.""" + + # Strategy 1: Try CDP DOM.focus first + try: + result = await cdp_session.cdp_client.send.DOM.focus( + params={'backendNodeId': backend_node_id}, + session_id=cdp_session.session_id, + ) + self.logger.debug(f'CDP DOM.focus result: {result}') + self.logger.debug('✅ Element focused using CDP DOM.focus') + return True + except Exception as e: + # Check for specific CDP "Element is not focusable" error + error_str = str(e).lower() + if ( + 'element is not focusable' in error_str + or "code': -32000" in error_str + or '-32000' in error_str + or 'not focusable' in error_str + ): + self.logger.debug('❌ CDP DOM.focus failed - element not focusable') + else: + self.logger.debug(f'CDP DOM.focus failed: {e}') + + # Strategy 2: Try click to focus if CDP failed + if input_coordinates and 'input_x' in input_coordinates and 'input_y' in input_coordinates: + try: + click_x = input_coordinates['input_x'] + click_y = input_coordinates['input_y'] + + self.logger.debug(f'🎯 Attempting click-to-focus at ({click_x:.1f}, {click_y:.1f})') + + # Click to focus + await cdp_session.cdp_client.send.Input.dispatchMouseEvent( + params={ + 'type': 'mousePressed', + 'x': click_x, + 'y': click_y, + 'button': 'left', + 'clickCount': 1, + }, + session_id=cdp_session.session_id, + ) + await cdp_session.cdp_client.send.Input.dispatchMouseEvent( + params={ + 'type': 'mouseReleased', + 'x': click_x, + 'y': click_y, + 'button': 'left', + 'clickCount': 1, + }, + session_id=cdp_session.session_id, + ) + + self.logger.debug('✅ Element focused using click method') + return True + + except Exception as e: + self.logger.debug(f'Click focus failed: {e}') + + # Both strategies failed + self.logger.warning('⚠️ All focus strategies failed') + return False async def _input_text_element_node_impl(self, element_node, text: str, clear_existing: bool = True) -> dict | None: """ @@ -749,117 +809,82 @@ class DefaultActionWatchdog(BaseWatchdog): ) object_id = result['object']['objectId'] - # Check element focusability before attempting focus - element_info = await self._check_element_focusability(element_node, object_id, cdp_session.session_id) - self.logger.debug(f'Element focusability check: {element_info}') + # Check if this is a label with 'for' attribute - if so, find the actual input + actual_element = element_node + if element_node.tag_name.lower() == 'label' and element_node.attributes and element_node.attributes.get('for'): + input_id = element_node.attributes['for'] + self.logger.debug(f'🏷️ Label detected with for="{input_id}", searching for associated input element') - # Extract coordinates from element bounds for metadata - bounds = element_info.get('bounds', {}) - if bounds.get('width', 0) > 0 and bounds.get('height', 0) > 0: - center_x = bounds['x'] + bounds['width'] / 2 - center_y = bounds['y'] + bounds['height'] / 2 - input_coordinates = {'input_x': center_x, 'input_y': center_y} - self.logger.debug(f'Input coordinates: x={center_x:.1f}, y={center_y:.1f}') - - # Provide helpful warnings for common issues - if not element_info.get('visible', False): - self.logger.warning('⚠️ Target element appears to be invisible or has zero dimensions') - if element_info.get('disabled', False): - self.logger.warning('⚠️ Target element appears to be disabled') - if not element_info.get('focusable', False): - self.logger.warning('⚠️ Target element may not be focusable by standard criteria') - - # Clear existing text if requested - if clear_existing: - await cdp_session.cdp_client.send.Runtime.callFunctionOn( - params={ - 'functionDeclaration': 'function() { if (this.value !== undefined) this.value = ""; if (this.textContent !== undefined) this.textContent = ""; }', - 'objectId': object_id, - }, - session_id=cdp_session.session_id, - ) - - # Try multiple focus strategies - focused_successfully = False - - # Strategy 1: Try CDP DOM.focus (original method) - try: - result = await cdp_session.cdp_client.send.DOM.focus( - params={'backendNodeId': backend_node_id}, - session_id=cdp_session.session_id, - ) - focused_successfully = True - self.logger.debug('✅ Element focused using CDP DOM.focus') - except Exception as e: - self.logger.debug(f'CDP DOM.focus failed: {e}') - - # Strategy 2: Try JavaScript focus as fallback + # Find the input element by ID using JavaScript try: - await cdp_session.cdp_client.send.Runtime.callFunctionOn( + input_result = await cdp_session.cdp_client.send.Runtime.evaluate( params={ - 'functionDeclaration': 'function() { this.focus(); }', - 'objectId': object_id, + 'expression': f'document.getElementById({json.dumps(input_id)})', + 'returnByValue': False, # Return object reference }, session_id=cdp_session.session_id, ) - focused_successfully = True - self.logger.debug('✅ Element focused using JavaScript focus()') - except Exception as js_e: - self.logger.debug(f'JavaScript focus failed: {js_e}') - # Strategy 3: Try click-to-focus for stubborn elements - try: - await cdp_session.cdp_client.send.Runtime.callFunctionOn( + input_object_id = input_result.get('result', {}).get('objectId') + if input_object_id: + self.logger.debug(f'✅ Found associated input element with id="{input_id}"') + # Use the input element instead of the label + object_id = input_object_id + + # Get the input element's bounding box for coordinates + bbox_result = await cdp_session.cdp_client.send.Runtime.callFunctionOn( params={ - 'functionDeclaration': 'function() { this.click(); this.focus(); }', - 'objectId': object_id, + 'functionDeclaration': 'function() { return this.getBoundingClientRect(); }', + 'objectId': input_object_id, + 'returnByValue': True, }, session_id=cdp_session.session_id, ) - focused_successfully = True - self.logger.debug('✅ Element focused using click + focus combination') - except Exception as click_e: - self.logger.debug(f'Click + focus failed: {click_e}') - # Strategy 4: Try simulated mouse click for maximum compatibility - try: - # Use coordinates already calculated from element bounds - if input_coordinates and 'input_x' in input_coordinates and 'input_y' in input_coordinates: - click_x = input_coordinates['input_x'] - click_y = input_coordinates['input_y'] + bbox_value = bbox_result.get('result', {}).get('value') + if bbox_value: + center_x = bbox_value['x'] + bbox_value['width'] / 2 + center_y = bbox_value['y'] + bbox_value['height'] / 2 + input_coordinates = {'input_x': center_x, 'input_y': center_y} + self.logger.debug(f'✅ Using input coordinates: x={center_x:.1f}, y={center_y:.1f}') + else: + self.logger.warning(f'❌ Could not find input element with id="{input_id}", using label coordinates') + except Exception as e: + self.logger.warning(f'❌ Error finding input element: {e}, using label coordinates') - await cdp_session.cdp_client.send.Input.dispatchMouseEvent( - params={ - 'type': 'mousePressed', - 'x': click_x, - 'y': click_y, - 'button': 'left', - 'clickCount': 1, - }, - session_id=cdp_session.session_id, - ) - await cdp_session.cdp_client.send.Input.dispatchMouseEvent( - params={ - 'type': 'mouseReleased', - 'x': click_x, - 'y': click_y, - 'button': 'left', - 'clickCount': 1, - }, - session_id=cdp_session.session_id, - ) - focused_successfully = True - self.logger.debug('✅ Element focused using simulated mouse click') - else: - self.logger.debug('Element bounds not available for mouse click') - except Exception as mouse_e: - self.logger.debug(f'Simulated mouse click failed: {mouse_e}') + # Use actual_element coordinates directly (it already has them) if not already set by label-to-input mapping + if input_coordinates is None: + if hasattr(actual_element, 'center_x') and hasattr(actual_element, 'center_y'): + center_x = actual_element.center_x + center_y = actual_element.center_y + input_coordinates = {'input_x': center_x, 'input_y': center_y} + self.logger.debug(f'Using element coordinates: x={center_x:.1f}, y={center_y:.1f}') + else: + # Fallback: calculate from bounding box if available + if hasattr(actual_element, 'bounding_box') and actual_element.bounding_box: + bbox = actual_element.bounding_box + center_x = bbox.x + bbox.width / 2 + center_y = bbox.y + bbox.height / 2 + input_coordinates = {'input_x': center_x, 'input_y': center_y} + self.logger.debug(f'Calculated coordinates from bbox: x={center_x:.1f}, y={center_y:.1f}') + else: + input_coordinates = None + self.logger.warning('⚠️ No coordinates available for element') - # Log focus result - if not focused_successfully: - self.logger.warning('⚠️ All focus strategies failed, typing without explicit focus') + # Ensure we have a valid object_id before proceeding + if not object_id: + raise ValueError('Could not get object_id for element') - # Type the text character by character using proper human-like key events + # Step 1: Focus the element using simple strategy + focused_successfully = await self._focus_element_simple( + backend_node_id=backend_node_id, object_id=object_id, cdp_session=cdp_session, input_coordinates=input_coordinates + ) + + # Step 2: Clear existing text if requested + if clear_existing: + await self._clear_text_field(object_id=object_id, cdp_session=cdp_session) + + # Step 3: Type the text character by character using proper human-like key events # This emulates exactly how a human would type, which modern websites expect self.logger.debug(f'🎯 Typing text character by character: "{text}"') diff --git a/browser_use/dom/serializer/clickable_elements.py b/browser_use/dom/serializer/clickable_elements.py index 836dba8ca..4e33dafbb 100644 --- a/browser_use/dom/serializer/clickable_elements.py +++ b/browser_use/dom/serializer/clickable_elements.py @@ -101,7 +101,6 @@ class ClickableElementDetector: 'select', 'textarea', 'a', - 'label', 'details', 'summary', 'option', @@ -110,6 +109,16 @@ class ClickableElementDetector: if node.tag_name in interactive_tags: return True + # Special handling for labels: only interactive if they DON'T have a 'for' attribute + # Labels with 'for' attribute should not be interactive - their associated input should be + if node.tag_name == 'label': + if node.attributes and node.attributes.get('for'): + # This label points to an input - it should NOT be interactive + return False + else: + # This label doesn't point to an input - it might be clickable itself + return True + # SVG elements need special handling - only interactive if they have explicit handlers # svg_tags = {'svg', 'path', 'circle', 'rect', 'polygon', 'ellipse', 'line', 'polyline', 'g'} # if node.tag_name in svg_tags: From b012d21de3b416afa83e8cdf6d1b54a668cb37fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:05:32 -0700 Subject: [PATCH 33/69] Refine clickable element detection logic - Enhanced the tag check to include truly interactive elements. - Removed special handling for 'label' elements, as they are now managed by other attribute checks to prevent interference with clickable elements. These updates improve the accuracy of interactive element detection in the DOM serializer. --- browser_use/dom/serializer/clickable_elements.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/browser_use/dom/serializer/clickable_elements.py b/browser_use/dom/serializer/clickable_elements.py index 4e33dafbb..d16c07e07 100644 --- a/browser_use/dom/serializer/clickable_elements.py +++ b/browser_use/dom/serializer/clickable_elements.py @@ -94,7 +94,8 @@ class ClickableElementDetector: # Skip properties we can't process continue - # ENHANCED TAG CHECK: Include truly interactive elements + # ENHANCED TAG CHECK: Include truly interactive elements + # Note: 'label' removed - labels are handled by other attribute checks below - other wise labels with "for" attribute can destry the real clickable element on appartments.com interactive_tags = { 'button', 'input', @@ -109,16 +110,6 @@ class ClickableElementDetector: if node.tag_name in interactive_tags: return True - # Special handling for labels: only interactive if they DON'T have a 'for' attribute - # Labels with 'for' attribute should not be interactive - their associated input should be - if node.tag_name == 'label': - if node.attributes and node.attributes.get('for'): - # This label points to an input - it should NOT be interactive - return False - else: - # This label doesn't point to an input - it might be clickable itself - return True - # SVG elements need special handling - only interactive if they have explicit handlers # svg_tags = {'svg', 'path', 'circle', 'rect', 'polygon', 'ellipse', 'line', 'polyline', 'g'} # if node.tag_name in svg_tags: From 662a39742eb2b20c9f80c297578d2cf8633d43d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:08:37 -0700 Subject: [PATCH 34/69] Use existing method to get coordinates --- .../watchdogs/default_action_watchdog.py | 74 +++---------------- 1 file changed, 12 insertions(+), 62 deletions(-) diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index 132ecb3ea..4f71c3fb1 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -767,7 +767,9 @@ class DefaultActionWatchdog(BaseWatchdog): self.logger.warning('⚠️ All focus strategies failed') return False - async def _input_text_element_node_impl(self, element_node, text: str, clear_existing: bool = True) -> dict | None: + async def _input_text_element_node_impl( + self, element_node: EnhancedDOMTreeNode, text: str, clear_existing: bool = True + ) -> dict | None: """ Input text into an element using pure CDP with improved focus fallbacks. """ @@ -809,67 +811,15 @@ class DefaultActionWatchdog(BaseWatchdog): ) object_id = result['object']['objectId'] - # Check if this is a label with 'for' attribute - if so, find the actual input - actual_element = element_node - if element_node.tag_name.lower() == 'label' and element_node.attributes and element_node.attributes.get('for'): - input_id = element_node.attributes['for'] - self.logger.debug(f'🏷️ Label detected with for="{input_id}", searching for associated input element') - - # Find the input element by ID using JavaScript - try: - input_result = await cdp_session.cdp_client.send.Runtime.evaluate( - params={ - 'expression': f'document.getElementById({json.dumps(input_id)})', - 'returnByValue': False, # Return object reference - }, - session_id=cdp_session.session_id, - ) - - input_object_id = input_result.get('result', {}).get('objectId') - if input_object_id: - self.logger.debug(f'✅ Found associated input element with id="{input_id}"') - # Use the input element instead of the label - object_id = input_object_id - - # Get the input element's bounding box for coordinates - bbox_result = await cdp_session.cdp_client.send.Runtime.callFunctionOn( - params={ - 'functionDeclaration': 'function() { return this.getBoundingClientRect(); }', - 'objectId': input_object_id, - 'returnByValue': True, - }, - session_id=cdp_session.session_id, - ) - - bbox_value = bbox_result.get('result', {}).get('value') - if bbox_value: - center_x = bbox_value['x'] + bbox_value['width'] / 2 - center_y = bbox_value['y'] + bbox_value['height'] / 2 - input_coordinates = {'input_x': center_x, 'input_y': center_y} - self.logger.debug(f'✅ Using input coordinates: x={center_x:.1f}, y={center_y:.1f}') - else: - self.logger.warning(f'❌ Could not find input element with id="{input_id}", using label coordinates') - except Exception as e: - self.logger.warning(f'❌ Error finding input element: {e}, using label coordinates') - - # Use actual_element coordinates directly (it already has them) if not already set by label-to-input mapping - if input_coordinates is None: - if hasattr(actual_element, 'center_x') and hasattr(actual_element, 'center_y'): - center_x = actual_element.center_x - center_y = actual_element.center_y - input_coordinates = {'input_x': center_x, 'input_y': center_y} - self.logger.debug(f'Using element coordinates: x={center_x:.1f}, y={center_y:.1f}') - else: - # Fallback: calculate from bounding box if available - if hasattr(actual_element, 'bounding_box') and actual_element.bounding_box: - bbox = actual_element.bounding_box - center_x = bbox.x + bbox.width / 2 - center_y = bbox.y + bbox.height / 2 - input_coordinates = {'input_x': center_x, 'input_y': center_y} - self.logger.debug(f'Calculated coordinates from bbox: x={center_x:.1f}, y={center_y:.1f}') - else: - input_coordinates = None - self.logger.warning('⚠️ No coordinates available for element') + # Use element_node absolute_position coordinates (correct coordinates including iframe offsets) + if element_node.absolute_position: + center_x = element_node.absolute_position.x + element_node.absolute_position.width / 2 + center_y = element_node.absolute_position.y + element_node.absolute_position.height / 2 + input_coordinates = {'input_x': center_x, 'input_y': center_y} + self.logger.debug(f'Using absolute_position coordinates: x={center_x:.1f}, y={center_y:.1f}') + else: + input_coordinates = None + self.logger.warning('⚠️ No absolute_position available for element') # Ensure we have a valid object_id before proceeding if not object_id: From 4e80786015a2771fe4ba44c38af8aa5c1c68da52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:22:43 -0700 Subject: [PATCH 35/69] Refactor DefaultActionWatchdog for improved focus handling - Added type hint for CDPSession in the _focus_element_simple method. - Enhanced logging for focus attempts, including exception details. - Reduced sleep duration in scrollIntoViewIfNeeded for better performance. - Updated text clearing logic to ensure it only occurs after successful focus. These changes enhance the robustness of element interaction and improve debugging capabilities. --- .../watchdogs/default_action_watchdog.py | 28 +++++++------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index 4f71c3fb1..5fb8a72d4 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -4,6 +4,8 @@ import asyncio import json import platform +from browser.session import CDPSession + from browser_use.browser.events import ( ClickElementEvent, GetDropdownOptionsEvent, @@ -701,7 +703,7 @@ class DefaultActionWatchdog(BaseWatchdog): self.logger.debug(f'JavaScript clear also failed: {js_e}') async def _focus_element_simple( - self, backend_node_id: int, object_id: str, cdp_session, input_coordinates: dict | None = None + self, backend_node_id: int, object_id: str, cdp_session: CDPSession, input_coordinates: dict | None = None ) -> bool: """Simple focus strategy: CDP first, then click if failed.""" @@ -711,21 +713,11 @@ class DefaultActionWatchdog(BaseWatchdog): params={'backendNodeId': backend_node_id}, session_id=cdp_session.session_id, ) - self.logger.debug(f'CDP DOM.focus result: {result}') - self.logger.debug('✅ Element focused using CDP DOM.focus') + self.logger.debug(f'Element focused using CDP DOM.focus (result: {result})') return True + except Exception as e: - # Check for specific CDP "Element is not focusable" error - error_str = str(e).lower() - if ( - 'element is not focusable' in error_str - or "code': -32000" in error_str - or '-32000' in error_str - or 'not focusable' in error_str - ): - self.logger.debug('❌ CDP DOM.focus failed - element not focusable') - else: - self.logger.debug(f'CDP DOM.focus failed: {e}') + self.logger.debug(f'❌ CDP DOM.focus threw exception: {type(e).__name__}: {e}') # Strategy 2: Try click to focus if CDP failed if input_coordinates and 'input_x' in input_coordinates and 'input_y' in input_coordinates: @@ -795,7 +787,7 @@ class DefaultActionWatchdog(BaseWatchdog): await cdp_session.cdp_client.send.DOM.scrollIntoViewIfNeeded( params={'backendNodeId': backend_node_id}, session_id=cdp_session.session_id ) - await asyncio.sleep(0.1) + await asyncio.sleep(0.01) except Exception as e: self.logger.warning( f'⚠️ Failed to focus the page {cdp_session} and scroll element {element_node} into view before typing in text: {type(e).__name__}: {e}' @@ -831,7 +823,7 @@ class DefaultActionWatchdog(BaseWatchdog): ) # Step 2: Clear existing text if requested - if clear_existing: + if clear_existing and focused_successfully: await self._clear_text_field(object_id=object_id, cdp_session=cdp_session) # Step 3: Type the text character by character using proper human-like key events @@ -857,7 +849,7 @@ class DefaultActionWatchdog(BaseWatchdog): ) # Small delay to emulate human typing speed - await asyncio.sleep(0.001) # 50ms between key down and key up + await asyncio.sleep(0.001) # Send keyUp event (this is what humans do when releasing a key) await cdp_session.cdp_client.send.Input.dispatchKeyEvent( @@ -871,7 +863,7 @@ class DefaultActionWatchdog(BaseWatchdog): ) # Small delay between characters to look human (realistic typing speed) - await asyncio.sleep(0.001) # 80ms between characters = ~150 WPM typing speed + await asyncio.sleep(0.001) # Return coordinates metadata if available return input_coordinates From 1ac519e98fdfa9c4432f294bda686fd23b40ccf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:25:16 -0700 Subject: [PATCH 36/69] Typos --- browser_use/dom/serializer/clickable_elements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/dom/serializer/clickable_elements.py b/browser_use/dom/serializer/clickable_elements.py index d16c07e07..807bbcc01 100644 --- a/browser_use/dom/serializer/clickable_elements.py +++ b/browser_use/dom/serializer/clickable_elements.py @@ -95,7 +95,7 @@ class ClickableElementDetector: continue # ENHANCED TAG CHECK: Include truly interactive elements - # Note: 'label' removed - labels are handled by other attribute checks below - other wise labels with "for" attribute can destry the real clickable element on appartments.com + # Note: 'label' removed - labels are handled by other attribute checks below - other wise labels with "for" attribute can destroy the real clickable element on apartments.com interactive_tags = { 'button', 'input', From 4a8d1e883bf25ccc9668cef02b08efa211ab3915 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:29:14 -0700 Subject: [PATCH 37/69] Fix test --- tests/ci/evaluate_tasks.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/ci/evaluate_tasks.py b/tests/ci/evaluate_tasks.py index c02e15506..435ba38dd 100644 --- a/tests/ci/evaluate_tasks.py +++ b/tests/ci/evaluate_tasks.py @@ -17,11 +17,7 @@ import aiofiles import yaml from pydantic import BaseModel -from browser_use.agent.service import Agent -from browser_use.agent.views import AgentHistoryList -from browser_use.browser.profile import BrowserProfile -from browser_use.browser.session import BrowserSession -from browser_use.llm import ChatOpenAI +from browser_use import Agent, AgentHistoryList, BrowserProfile, BrowserSession, ChatOpenAI from browser_use.llm.messages import UserMessage # --- CONFIG --- From 42e6cc80e0f7cd7e76f72153499a8ecb120f97f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:34:03 -0700 Subject: [PATCH 38/69] Remove CDPSession in default watchdog --- browser_use/browser/watchdogs/default_action_watchdog.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index 5fb8a72d4..607e5e03f 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -4,8 +4,6 @@ import asyncio import json import platform -from browser.session import CDPSession - from browser_use.browser.events import ( ClickElementEvent, GetDropdownOptionsEvent, @@ -703,7 +701,7 @@ class DefaultActionWatchdog(BaseWatchdog): self.logger.debug(f'JavaScript clear also failed: {js_e}') async def _focus_element_simple( - self, backend_node_id: int, object_id: str, cdp_session: CDPSession, input_coordinates: dict | None = None + self, backend_node_id: int, object_id: str, cdp_session, input_coordinates: dict | None = None ) -> bool: """Simple focus strategy: CDP first, then click if failed.""" From 723c68c20c0fd3d1f25af7735a08b7ffbbd9690f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:38:24 -0700 Subject: [PATCH 39/69] Test fails if 0% --- .github/workflows/test.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b4be461fb..1d8a1f6ac 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -172,6 +172,11 @@ jobs: const score = `${passed}/${total}`; const percentage = Math.round((passed / total) * 100); + // Fail the workflow if 0% pass rate + if (percentage === 0) { + core.setFailed(`Evaluation failed: 0% pass rate (${passed}/${total})`); + } + // Create detailed table let tableRows = ''; detailedResults.forEach(result => { From 46c0dc4fe24edceba18c660b853d42675623f600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:45:20 -0700 Subject: [PATCH 40/69] Refactor key code mapping and text field clearing logic in DefaultActionWatchdog - Updated key code mappings for special characters to reflect correct usage with modifiers. - Enhanced text field clearing method to use platform-specific modifiers (Cmd for macOS, Ctrl for others) for a more human-like interaction. - Removed unnecessary `windowsVirtualKeyCode` assignments for printable characters to prevent incorrect virtual key code usage. These changes improve the accuracy of character input handling and enhance the robustness of text field interactions. --- .../watchdogs/default_action_watchdog.py | 59 ++++++++++--------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index 607e5e03f..8f75d7e21 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -584,38 +584,38 @@ class DefaultActionWatchdog(BaseWatchdog): def _get_key_code_for_char(self, char: str) -> str: """Get the proper key code for a character (like Playwright does).""" - # Key code mapping for common characters + # Key code mapping for common characters (using proper base keys + modifiers) key_codes = { ' ': 'Space', '.': 'Period', ',': 'Comma', '-': 'Minus', - '_': 'Underscore', - '@': 'At', - '!': 'Exclamation', - '?': 'Question', - ':': 'Colon', + '_': 'Minus', # Underscore uses Minus with Shift + '@': 'Digit2', # @ uses Digit2 with Shift + '!': 'Digit1', # ! uses Digit1 with Shift (not 'Exclamation') + '?': 'Slash', # ? uses Slash with Shift + ':': 'Semicolon', # : uses Semicolon with Shift ';': 'Semicolon', - '(': 'ParenLeft', - ')': 'ParenRight', + '(': 'Digit9', # ( uses Digit9 with Shift + ')': 'Digit0', # ) uses Digit0 with Shift '[': 'BracketLeft', ']': 'BracketRight', - '{': 'BraceLeft', - '}': 'BraceRight', + '{': 'BracketLeft', # { uses BracketLeft with Shift + '}': 'BracketRight', # } uses BracketRight with Shift '/': 'Slash', '\\': 'Backslash', '=': 'Equal', - '+': 'Plus', - '*': 'Asterisk', - '&': 'Ampersand', - '%': 'Percent', - '$': 'Dollar', - '#': 'Hash', - '^': 'Caret', - '~': 'Tilde', + '+': 'Equal', # + uses Equal with Shift + '*': 'Digit8', # * uses Digit8 with Shift + '&': 'Digit7', # & uses Digit7 with Shift + '%': 'Digit5', # % uses Digit5 with Shift + '$': 'Digit4', # $ uses Digit4 with Shift + '#': 'Digit3', # # uses Digit3 with Shift + '^': 'Digit6', # ^ uses Digit6 with Shift + '~': 'Backquote', # ~ uses Backquote with Shift '`': 'Backquote', "'": 'Quote', - '"': 'DoubleQuote', + '"': 'Quote', # " uses Quote with Shift } # Numbers @@ -636,16 +636,22 @@ class DefaultActionWatchdog(BaseWatchdog): async def _clear_text_field(self, object_id: str, cdp_session) -> None: """Clear text field using human-like Ctrl+A + Backspace approach.""" try: - self.logger.debug('🧹 Clearing text field using Ctrl+A + Backspace') + # Use Meta (Cmd) on macOS, Ctrl on other platforms + import platform - # Select all text (Ctrl+A) + is_macos = platform.system() == 'Darwin' + select_all_modifier = 4 if is_macos else 2 # Meta=4 (Cmd), Ctrl=2 + modifier_name = 'Cmd' if is_macos else 'Ctrl' + + self.logger.debug(f'🧹 Clearing text field using {modifier_name}+A + Backspace') + + # Select all text (Ctrl/Cmd+A) await cdp_session.cdp_client.send.Input.dispatchKeyEvent( params={ 'type': 'keyDown', 'key': 'a', 'code': 'KeyA', - 'modifiers': 2, # Ctrl modifier - 'windowsVirtualKeyCode': 65, + 'modifiers': select_all_modifier, }, session_id=cdp_session.session_id, ) @@ -654,8 +660,7 @@ class DefaultActionWatchdog(BaseWatchdog): 'type': 'keyUp', 'key': 'a', 'code': 'KeyA', - 'modifiers': 2, # Ctrl modifier - 'windowsVirtualKeyCode': 65, + 'modifiers': select_all_modifier, }, session_id=cdp_session.session_id, ) @@ -841,7 +846,7 @@ class DefaultActionWatchdog(BaseWatchdog): 'text': char, 'key': char, 'code': key_code, - 'windowsVirtualKeyCode': ord(char.upper()) if char.isalpha() else ord(char), + # Omit windowsVirtualKeyCode for printable chars to avoid wrong VK codes }, session_id=cdp_session.session_id, ) @@ -855,7 +860,7 @@ class DefaultActionWatchdog(BaseWatchdog): 'type': 'keyUp', 'key': char, 'code': key_code, - 'windowsVirtualKeyCode': ord(char.upper()) if char.isalpha() else ord(char), + # Omit windowsVirtualKeyCode for printable chars to avoid wrong VK codes }, session_id=cdp_session.session_id, ) From edbfcd2cdbc13395b276336e06455ed2ad528e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:50:10 -0700 Subject: [PATCH 41/69] Fix test --- tests/ci/test_browser_event_ClickElementEvent.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/ci/test_browser_event_ClickElementEvent.py b/tests/ci/test_browser_event_ClickElementEvent.py index 435c0ac7b..08265ac98 100644 --- a/tests/ci/test_browser_event_ClickElementEvent.py +++ b/tests/ci/test_browser_event_ClickElementEvent.py @@ -260,7 +260,11 @@ class TestClickElementEvent: # Verify the result assert isinstance(result, ActionResult) - assert result.extracted_content is not None + result_text = result.extracted_content or result.long_term_memory + assert result_text is not None + assert f'Clicked element with index {link_index}' in result_text, ( + f'Expected click confirmation in result content, got: {result_text}' + ) # Verify that a new tab was opened tabs = await browser_session.get_tabs() From f4a34bc1d77ae1556191709932e63f571daf1ac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 19:24:47 -0700 Subject: [PATCH 42/69] fix-clear-text --- .../watchdogs/default_action_watchdog.py | 141 ++++++++++++++---- 1 file changed, 116 insertions(+), 25 deletions(-) diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index 8f75d7e21..e4de555a7 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -633,17 +633,121 @@ class DefaultActionWatchdog(BaseWatchdog): # Fallback for unknown characters return f'Key{char.upper()}' - async def _clear_text_field(self, object_id: str, cdp_session) -> None: - """Clear text field using human-like Ctrl+A + Backspace approach.""" + async def _clear_text_field(self, object_id: str, cdp_session) -> bool: + """Clear text field using multiple strategies, starting with the most reliable.""" + try: + # Strategy 1: Direct JavaScript value setting (most reliable for modern web apps) + self.logger.debug('🧹 Clearing text field using JavaScript value setting') + + await cdp_session.cdp_client.send.Runtime.callFunctionOn( + params={ + 'functionDeclaration': ''' + function() { + this.value = ""; + this.dispatchEvent(new Event("input", { bubbles: true })); + this.dispatchEvent(new Event("change", { bubbles: true })); + return this.value; + } + ''', + 'objectId': object_id, + 'returnByValue': True, + }, + session_id=cdp_session.session_id, + ) + + # Verify clearing worked by checking the value + verify_result = await cdp_session.cdp_client.send.Runtime.callFunctionOn( + params={ + 'functionDeclaration': 'function() { return this.value; }', + 'objectId': object_id, + 'returnByValue': True, + }, + session_id=cdp_session.session_id, + ) + + current_value = verify_result.get('result', {}).get('value', '') + if not current_value: + self.logger.debug('✅ Text field cleared successfully using JavaScript') + return True + else: + self.logger.debug(f'⚠️ JavaScript clear partially failed, field still contains: "{current_value}"') + + except Exception as e: + self.logger.debug(f'JavaScript clear failed: {e}') + + # Strategy 2: Triple-click + Delete (fallback for stubborn fields) + try: + self.logger.debug('🧹 Fallback: Clearing using triple-click + Delete') + + # Get element center coordinates for triple-click + bounds_result = await cdp_session.cdp_client.send.Runtime.callFunctionOn( + params={ + 'functionDeclaration': 'function() { return this.getBoundingClientRect(); }', + 'objectId': object_id, + 'returnByValue': True, + }, + session_id=cdp_session.session_id, + ) + + if bounds_result.get('result', {}).get('value'): + bounds = bounds_result['result']['value'] + center_x = bounds['x'] + bounds['width'] / 2 + center_y = bounds['y'] + bounds['height'] / 2 + + # Triple-click to select all text + await cdp_session.cdp_client.send.Input.dispatchMouseEvent( + params={ + 'type': 'mousePressed', + 'x': center_x, + 'y': center_y, + 'button': 'left', + 'clickCount': 3, + }, + session_id=cdp_session.session_id, + ) + await cdp_session.cdp_client.send.Input.dispatchMouseEvent( + params={ + 'type': 'mouseReleased', + 'x': center_x, + 'y': center_y, + 'button': 'left', + 'clickCount': 3, + }, + session_id=cdp_session.session_id, + ) + + # Delete selected text + await cdp_session.cdp_client.send.Input.dispatchKeyEvent( + params={ + 'type': 'keyDown', + 'key': 'Delete', + 'code': 'Delete', + }, + session_id=cdp_session.session_id, + ) + await cdp_session.cdp_client.send.Input.dispatchKeyEvent( + params={ + 'type': 'keyUp', + 'key': 'Delete', + 'code': 'Delete', + }, + session_id=cdp_session.session_id, + ) + + self.logger.debug('✅ Text field cleared using triple-click + Delete') + return True + + except Exception as e: + self.logger.debug(f'Triple-click clear failed: {e}') + + # Strategy 3: Keyboard shortcuts (last resort) try: - # Use Meta (Cmd) on macOS, Ctrl on other platforms import platform - is_macos = platform.system() == 'Darwin' select_all_modifier = 4 if is_macos else 2 # Meta=4 (Cmd), Ctrl=2 modifier_name = 'Cmd' if is_macos else 'Ctrl' - self.logger.debug(f'🧹 Clearing text field using {modifier_name}+A + Backspace') + self.logger.debug(f'🧹 Last resort: Clearing using {modifier_name}+A + Backspace') # Select all text (Ctrl/Cmd+A) await cdp_session.cdp_client.send.Input.dispatchKeyEvent( @@ -665,16 +769,12 @@ class DefaultActionWatchdog(BaseWatchdog): session_id=cdp_session.session_id, ) - # Small delay - await asyncio.sleep(0.01) - # Delete selected text (Backspace) await cdp_session.cdp_client.send.Input.dispatchKeyEvent( params={ 'type': 'keyDown', 'key': 'Backspace', 'code': 'Backspace', - 'windowsVirtualKeyCode': 8, }, session_id=cdp_session.session_id, ) @@ -683,27 +783,16 @@ class DefaultActionWatchdog(BaseWatchdog): 'type': 'keyUp', 'key': 'Backspace', 'code': 'Backspace', - 'windowsVirtualKeyCode': 8, }, session_id=cdp_session.session_id, ) - self.logger.debug('✅ Text field cleared successfully') + self.logger.debug('✅ Text field cleared using keyboard shortcuts') + return True except Exception as e: - self.logger.debug(f'Failed to clear text field: {e}') - # Try JavaScript fallback - try: - await cdp_session.cdp_client.send.Runtime.callFunctionOn( - params={ - 'functionDeclaration': 'function() { if (this.value !== undefined) this.value = ""; }', - 'objectId': object_id, - }, - session_id=cdp_session.session_id, - ) - self.logger.debug('✅ Text field cleared using JavaScript fallback') - except Exception as js_e: - self.logger.debug(f'JavaScript clear also failed: {js_e}') + self.logger.debug(f'All clearing strategies failed: {e}') + return False async def _focus_element_simple( self, backend_node_id: int, object_id: str, cdp_session, input_coordinates: dict | None = None @@ -827,7 +916,9 @@ class DefaultActionWatchdog(BaseWatchdog): # Step 2: Clear existing text if requested if clear_existing and focused_successfully: - await self._clear_text_field(object_id=object_id, cdp_session=cdp_session) + cleared_successfully = await self._clear_text_field(object_id=object_id, cdp_session=cdp_session) + if not cleared_successfully: + self.logger.warning('⚠️ Text field clearing failed, typing may append to existing text') # Step 3: Type the text character by character using proper human-like key events # This emulates exactly how a human would type, which modern websites expect From 7ad12bbfdee1d1dab5c03a954530a5753ad42df0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 19:49:34 -0700 Subject: [PATCH 43/69] fix-windowsVirtualKeyCode --- .../watchdogs/default_action_watchdog.py | 131 ++++++++++++++---- 1 file changed, 107 insertions(+), 24 deletions(-) diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index e4de555a7..166962a7b 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -582,6 +582,76 @@ class DefaultActionWatchdog(BaseWatchdog): except Exception as e: raise Exception(f'Failed to type to page: {str(e)}') + def _get_char_modifiers_and_vk(self, char: str) -> tuple[int, int, str]: + """Get modifiers, virtual key code, and base key for a character. + + Returns: + (modifiers, windowsVirtualKeyCode, base_key) + """ + # Characters that require Shift modifier + shift_chars = { + '!': ('1', 49), + '@': ('2', 50), + '#': ('3', 51), + '$': ('4', 52), + '%': ('5', 53), + '^': ('6', 54), + '&': ('7', 55), + '*': ('8', 56), + '(': ('9', 57), + ')': ('0', 48), + '_': ('-', 189), + '+': ('=', 187), + '{': ('[', 219), + '}': (']', 221), + '|': ('\\', 220), + ':': (';', 186), + '"': ("'", 222), + '<': (',', 188), + '>': ('.', 190), + '?': ('/', 191), + '~': ('`', 192), + } + + # Check if character requires Shift + if char in shift_chars: + base_key, vk_code = shift_chars[char] + return (8, vk_code, base_key) # Shift=8 + + # Uppercase letters require Shift + if char.isupper(): + return (8, ord(char), char.lower()) # Shift=8 + + # Lowercase letters + if char.islower(): + return (0, ord(char.upper()), char) + + # Numbers + if char.isdigit(): + return (0, ord(char), char) + + # Special characters without Shift + no_shift_chars = { + ' ': 32, + '-': 189, + '=': 187, + '[': 219, + ']': 221, + '\\': 220, + ';': 186, + "'": 222, + ',': 188, + '.': 190, + '/': 191, + '`': 192, + } + + if char in no_shift_chars: + return (0, no_shift_chars[char], char) + + # Fallback + return (0, ord(char.upper()) if char.isalpha() else ord(char), char) + def _get_key_code_for_char(self, char: str) -> str: """Get the proper key code for a character (like Playwright does).""" # Key code mapping for common characters (using proper base keys + modifiers) @@ -638,23 +708,23 @@ class DefaultActionWatchdog(BaseWatchdog): try: # Strategy 1: Direct JavaScript value setting (most reliable for modern web apps) self.logger.debug('🧹 Clearing text field using JavaScript value setting') - + await cdp_session.cdp_client.send.Runtime.callFunctionOn( params={ - 'functionDeclaration': ''' + 'functionDeclaration': """ function() { this.value = ""; this.dispatchEvent(new Event("input", { bubbles: true })); this.dispatchEvent(new Event("change", { bubbles: true })); return this.value; } - ''', + """, 'objectId': object_id, 'returnByValue': True, }, session_id=cdp_session.session_id, ) - + # Verify clearing worked by checking the value verify_result = await cdp_session.cdp_client.send.Runtime.callFunctionOn( params={ @@ -664,21 +734,21 @@ class DefaultActionWatchdog(BaseWatchdog): }, session_id=cdp_session.session_id, ) - + current_value = verify_result.get('result', {}).get('value', '') if not current_value: self.logger.debug('✅ Text field cleared successfully using JavaScript') return True else: self.logger.debug(f'⚠️ JavaScript clear partially failed, field still contains: "{current_value}"') - + except Exception as e: self.logger.debug(f'JavaScript clear failed: {e}') - + # Strategy 2: Triple-click + Delete (fallback for stubborn fields) try: self.logger.debug('🧹 Fallback: Clearing using triple-click + Delete') - + # Get element center coordinates for triple-click bounds_result = await cdp_session.cdp_client.send.Runtime.callFunctionOn( params={ @@ -688,12 +758,12 @@ class DefaultActionWatchdog(BaseWatchdog): }, session_id=cdp_session.session_id, ) - + if bounds_result.get('result', {}).get('value'): bounds = bounds_result['result']['value'] center_x = bounds['x'] + bounds['width'] / 2 center_y = bounds['y'] + bounds['height'] / 2 - + # Triple-click to select all text await cdp_session.cdp_client.send.Input.dispatchMouseEvent( params={ @@ -715,7 +785,7 @@ class DefaultActionWatchdog(BaseWatchdog): }, session_id=cdp_session.session_id, ) - + # Delete selected text await cdp_session.cdp_client.send.Input.dispatchKeyEvent( params={ @@ -733,16 +803,17 @@ class DefaultActionWatchdog(BaseWatchdog): }, session_id=cdp_session.session_id, ) - + self.logger.debug('✅ Text field cleared using triple-click + Delete') return True - + except Exception as e: self.logger.debug(f'Triple-click clear failed: {e}') - + # Strategy 3: Keyboard shortcuts (last resort) try: import platform + is_macos = platform.system() == 'Darwin' select_all_modifier = 4 if is_macos else 2 # Meta=4 (Cmd), Ctrl=2 modifier_name = 'Cmd' if is_macos else 'Ctrl' @@ -925,19 +996,20 @@ class DefaultActionWatchdog(BaseWatchdog): self.logger.debug(f'🎯 Typing text character by character: "{text}"') for i, char in enumerate(text): - # Get proper key code for the character - key_code = self._get_key_code_for_char(char) + # Get proper modifiers, VK code, and base key for the character + modifiers, vk_code, base_key = self._get_char_modifiers_and_vk(char) + key_code = self._get_key_code_for_char(base_key) - # self.logger.debug(f'🎯 Typing character {i + 1}/{len(text)}: "{char}" (code: {key_code})') + # self.logger.debug(f'🎯 Typing character {i + 1}/{len(text)}: "{char}" (base_key: {base_key}, code: {key_code}, modifiers: {modifiers}, vk: {vk_code})') - # Send keyDown event (this is what humans do when pressing a key) + # Step 1: Send keyDown event (NO text parameter) await cdp_session.cdp_client.send.Input.dispatchKeyEvent( params={ 'type': 'keyDown', - 'text': char, - 'key': char, + 'key': base_key, 'code': key_code, - # Omit windowsVirtualKeyCode for printable chars to avoid wrong VK codes + 'modifiers': modifiers, + 'windowsVirtualKeyCode': vk_code, }, session_id=cdp_session.session_id, ) @@ -945,13 +1017,24 @@ class DefaultActionWatchdog(BaseWatchdog): # Small delay to emulate human typing speed await asyncio.sleep(0.001) - # Send keyUp event (this is what humans do when releasing a key) + # Step 2: Send char event (WITH text parameter) - this is crucial for text input + await cdp_session.cdp_client.send.Input.dispatchKeyEvent( + params={ + 'type': 'char', + 'text': char, + 'key': char, + }, + session_id=cdp_session.session_id, + ) + + # Step 3: Send keyUp event (NO text parameter) await cdp_session.cdp_client.send.Input.dispatchKeyEvent( params={ 'type': 'keyUp', - 'key': char, + 'key': base_key, 'code': key_code, - # Omit windowsVirtualKeyCode for printable chars to avoid wrong VK codes + 'modifiers': modifiers, + 'windowsVirtualKeyCode': vk_code, }, session_id=cdp_session.session_id, ) From 97add31aec7059a5dc8d8b16680fa74dc299c859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 19:55:21 -0700 Subject: [PATCH 44/69] fix test --- tests/ci/test_browser_event_ScrollEvent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ci/test_browser_event_ScrollEvent.py b/tests/ci/test_browser_event_ScrollEvent.py index b716969ca..88caa02f0 100644 --- a/tests/ci/test_browser_event_ScrollEvent.py +++ b/tests/ci/test_browser_event_ScrollEvent.py @@ -104,7 +104,6 @@ class TestScrollActions: assert result.extracted_content is not None assert 'Scrolled down' in result.extracted_content assert 'the page' in result.extracted_content - assert result.include_in_memory is True # Test 2: Basic page scroll up scroll_up_action = {'scroll': ScrollAction(down=False, num_pages=0.5)} From 634dd20e370795bf2d71b7a904e06cb566b3409f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 20:19:26 -0700 Subject: [PATCH 45/69] Add display highlights on screen feature and enhance bounding box drawing - Introduced `display_highlights_on_screen` option in `BrowserProfile` and `BrowserSession` to control the visibility of highlights directly on the browser screen. - Implemented `draw_enhanced_bounding_box_with_text` function to improve the visual representation of highlighted elements with larger indices and solid borders. - Updated screenshot overlay logic in `DOMWatchdog` to utilize the new highlighting feature, ensuring better visibility of interactive elements during browser sessions. --- browser_use/browser/profile.py | 3 + browser_use/browser/python_highlights.py | 136 +++++++++++++++--- browser_use/browser/session.py | 1 + browser_use/browser/watchdogs/dom_watchdog.py | 88 ++++++++++++ 4 files changed, 212 insertions(+), 16 deletions(-) diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py index e29d910b0..c9170b8b6 100644 --- a/browser_use/browser/profile.py +++ b/browser_use/browser/profile.py @@ -583,6 +583,9 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro # --- UI/viewport/DOM --- highlight_elements: bool = Field(default=True, description='Highlight interactive elements on the page.') + display_highlights_on_screen: bool = Field( + default=True, description='Display highlights directly on the browser screen in addition to screenshots.' + ) # --- Downloads --- auto_download_pdfs: bool = Field( diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index a1d34f35b..688d2def8 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -52,6 +52,114 @@ def should_show_index_overlay(element_index: Optional[int]) -> bool: return element_index is not None +def draw_enhanced_bounding_box_with_text( + draw, # ImageDraw.Draw - avoiding type annotation due to PIL typing issues + bbox: Tuple[int, int, int, int], + color: str, + text: Optional[str] = None, + font: Optional[ImageFont.FreeTypeFont] = None, + element_type: str = 'div', +) -> None: + """Draw an enhanced bounding box with bigger index and solid borders for better visibility.""" + x1, y1, x2, y2 = bbox + + # Draw solid bounding box (not dashed) with thicker lines for better visibility + line_width = 3 + + # Draw the main bounding box + draw.rectangle([x1, y1, x2, y2], outline=color, width=line_width) + + # Add a subtle inner highlight for better visibility + if x2 - x1 > 6 and y2 - y1 > 6: + draw.rectangle([x1 + 1, y1 + 1, x2 - 1, y2 - 1], outline=color, width=1) + + # Draw bigger index overlay if we have index text + if text: + try: + # Use bigger font size for index + big_font = None + try: + big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 16) + except (OSError, IOError): + try: + big_font = ImageFont.truetype('arial.ttf', 16) + except (OSError, IOError): + big_font = font # Fallback to original font + + # Get text size with bigger font + if big_font: + bbox_text = draw.textbbox((0, 0), text, font=big_font) + text_width = bbox_text[2] - bbox_text[0] + text_height = bbox_text[3] - bbox_text[1] + else: + # Fallback for default font + bbox_text = draw.textbbox((0, 0), text) + text_width = bbox_text[2] - bbox_text[0] + text_height = bbox_text[3] - bbox_text[1] + + # Bigger padding for more prominent index + padding = 8 + element_width = x2 - x1 + element_height = y2 - y1 + + # Always try to place inside the element first, then outside if too small + if element_width >= text_width + padding * 2 and element_height >= text_height + padding * 2: + # Place in top-left corner inside the element + text_x = x1 + padding + text_y = y1 + padding + else: + # Place outside above the element + text_x = x1 + text_y = max(0, y1 - text_height - padding) + + # Ensure text stays within image bounds + img_width = 1200 # Default assumption, could be passed as parameter + img_height = 800 + text_x = max(0, min(text_x, img_width - text_width - padding)) + text_y = max(0, min(text_y, img_height - text_height - padding)) + + # Draw bigger background rectangle with element-type-specific styling + bg_x1 = text_x - padding + bg_y1 = text_y - padding + bg_x2 = text_x + text_width + padding + bg_y2 = text_y + text_height + padding + + # Use element color as background with white text for high contrast + draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill=color, outline='white', width=2) + + # Draw white text on colored background for maximum visibility + draw.text((text_x, text_y), text, fill='white', font=big_font or font) + + # Add element type indicator if space allows + if element_width >= 60 and element_height >= 40: + type_text = element_type.upper()[:3] # Show first 3 chars of element type + try: + small_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 10) + except (OSError, IOError): + small_font = font + + if small_font: + type_bbox = draw.textbbox((0, 0), type_text, font=small_font) + type_width = type_bbox[2] - type_bbox[0] + type_height = type_bbox[3] - type_bbox[1] + + # Place type text in bottom-right corner + type_x = x2 - type_width - 4 + type_y = y2 - type_height - 4 + + # Small background for type text + draw.rectangle( + [type_x - 2, type_y - 1, type_x + type_width + 2, type_y + type_height + 1], + fill='rgba(0,0,0,128)', + outline=color, + width=1, + ) + draw.text((type_x, type_y), type_text, fill='white', font=small_font) + + except Exception as e: + logger.debug(f'Failed to draw enhanced text overlay: {e}') + + def draw_bounding_box_with_text( draw, # ImageDraw.Draw - avoiding type annotation due to PIL typing issues bbox: Tuple[int, int, int, int], @@ -199,22 +307,18 @@ def create_highlighted_screenshot( # Process each interactive element for element_id, element in selector_map.items(): try: - # Use snapshot bounds (document coordinates) if available, otherwise absolute_position - bounds = None - if element.snapshot_node and element.snapshot_node.bounds: - bounds = element.snapshot_node.bounds - elif element.absolute_position: - bounds = element.absolute_position - - if not bounds: + # Use absolute_position coordinates directly + if not element.absolute_position: continue - # Convert from CSS pixels to device pixels for screenshot coordinates - # Note: bounds are already in CSS pixels, screenshot is in device pixels - x1 = int((bounds.x - viewport_offset_x) * device_pixel_ratio) - y1 = int((bounds.y - viewport_offset_y) * device_pixel_ratio) - x2 = int((bounds.x + bounds.width - viewport_offset_x) * device_pixel_ratio) - y2 = int((bounds.y + bounds.height - viewport_offset_y) * device_pixel_ratio) + bounds = element.absolute_position + + # Scale coordinates from CSS pixels to device pixels for screenshot + # The screenshot is captured at device pixel resolution, but coordinates are in CSS pixels + x1 = int(bounds.x * device_pixel_ratio) + y1 = int(bounds.y * device_pixel_ratio) + x2 = int((bounds.x + bounds.width) * device_pixel_ratio) + y2 = int((bounds.y + bounds.height) * device_pixel_ratio) # Ensure coordinates are within image bounds img_width, img_height = image.size @@ -239,8 +343,8 @@ def create_highlighted_screenshot( element_index = getattr(element, 'element_index', None) index_text = str(element_index) if element_index is not None else None - # Draw bounding box with index - draw_bounding_box_with_text(draw, (x1, y1, x2, y2), color, index_text, font) + # Draw enhanced bounding box with bigger index + draw_enhanced_bounding_box_with_text(draw, (x1, y1, x2, y2), color, index_text, font, tag_name) except Exception as e: logger.debug(f'Failed to draw highlight for element {element_id}: {e}') diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index b4b01f62c..910324cf9 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -265,6 +265,7 @@ class BrowserSession(BaseModel): wait_for_network_idle_page_load_time: float | None = None, wait_between_actions: float | None = None, highlight_elements: bool | None = None, + display_highlights_on_screen: bool | None = None, auto_download_pdfs: bool | None = None, profile_directory: str | None = None, ): diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index 0db217d8b..72dee72cc 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -239,6 +239,94 @@ class DOMWatchdog(BaseWatchdog): except Exception as e: self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Python highlighting failed: {e}') + # Display highlighted screenshot overlay on screen if requested + if ( + screenshot_b64 + and content + and content.selector_map + and self.browser_session.browser_profile.display_highlights_on_screen + ): + try: + self.logger.debug( + '🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🖥️ Displaying highlighted screenshot overlay...' + ) + + # Use the existing screenshot (which has correct scaling) and add highlights to it + # Get CDP session for viewport info + cdp_session = await self.browser_session.get_or_create_cdp_session() + + # Create highlighted version of the existing screenshot + from browser_use.browser.python_highlights import create_highlighted_screenshot_async + + highlighted_screenshot_b64 = await create_highlighted_screenshot_async( + screenshot_b64, content.selector_map, cdp_session + ) + + # Get viewport dimensions and device pixel ratio for proper scaling + viewport_info = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id) + visual_viewport = viewport_info.get('visualViewport', {}) + css_visual_viewport = viewport_info.get('cssVisualViewport', {}) + + # Get CSS pixel dimensions (what the browser shows) + css_width = css_visual_viewport.get('clientWidth', 1280) + css_height = css_visual_viewport.get('clientHeight', 720) + + # Get device pixel dimensions (what the screenshot is) + device_width = visual_viewport.get('clientWidth', css_width) + device_height = visual_viewport.get('clientHeight', css_height) + + # Calculate device pixel ratio + device_pixel_ratio = device_width / css_width if css_width > 0 else 1.0 + + # Inject the highlighted screenshot as an overlay with correct DPR scaling + overlay_script = f""" + (function() {{ + // Remove any existing browser-use overlay + const existingOverlay = document.getElementById('browser-use-highlight-overlay'); + if (existingOverlay) {{ + existingOverlay.remove(); + }} + + // Get actual viewport dimensions in CSS pixels + const cssWidth = window.innerWidth; + const cssHeight = window.innerHeight; + const devicePixelRatio = window.devicePixelRatio || 1; + + // Create overlay container + const overlay = document.createElement('div'); + overlay.id = 'browser-use-highlight-overlay'; + overlay.style.cssText = ` + position: fixed; + top: 0; + left: 0; + width: ${{cssWidth}}px; + height: ${{cssHeight}}px; + pointer-events: none; + z-index: 2147483647; + opacity: 0.4; + background-image: url(data:image/png;base64,{highlighted_screenshot_b64}); + background-size: ${{cssWidth}}px ${{cssHeight}}px; + background-position: top left; + background-repeat: no-repeat; + `; + + document.body.appendChild(overlay); + console.log('Browser-use highlighted screenshot overlay injected with', '{len(content.selector_map)}', 'elements'); + console.log('CSS viewport:', cssWidth, 'x', cssHeight, 'DPR:', devicePixelRatio); + }})(); + """ + + # Execute the overlay script + await cdp_session.cdp_client.send.Runtime.evaluate( + params={'expression': overlay_script, 'returnByValue': True}, session_id=cdp_session.session_id + ) + + self.logger.debug( + f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Displayed highlighted screenshot overlay with {len(content.selector_map)} elements' + ) + except Exception as e: + self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Screenshot overlay failed: {e}') + # Ensure we have valid content if not content: content = SerializedDOMState(_root=None, selector_map={}) From 3b6c5d11ba361ba5d8e108655393e9956919491c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 20:24:13 -0700 Subject: [PATCH 46/69] Enhance bounding box drawing with dashed borders and larger index containers - Updated `draw_enhanced_bounding_box_with_text` function to draw dashed bounding boxes for better visibility. - Increased the size of index text and padding for improved prominence. - Adjusted text positioning logic to ensure proper placement within or outside the bounding box based on available space. - Enhanced background rectangle drawing for better contrast and visibility. --- browser_use/browser/python_highlights.py | 114 +++++++++++------------ 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index 688d2def8..5644f1eac 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -60,35 +60,56 @@ def draw_enhanced_bounding_box_with_text( font: Optional[ImageFont.FreeTypeFont] = None, element_type: str = 'div', ) -> None: - """Draw an enhanced bounding box with bigger index and solid borders for better visibility.""" + """Draw an enhanced bounding box with much bigger index containers and dashed borders.""" x1, y1, x2, y2 = bbox - # Draw solid bounding box (not dashed) with thicker lines for better visibility - line_width = 3 + # Draw dashed bounding box with pattern: 1 line, 2 spaces, 1 line, 2 spaces... + dash_length = 4 + gap_length = 8 + line_width = 2 - # Draw the main bounding box - draw.rectangle([x1, y1, x2, y2], outline=color, width=line_width) + # Helper function to draw dashed line + def draw_dashed_line(start_x, start_y, end_x, end_y): + if start_x == end_x: # Vertical line + y = start_y + while y < end_y: + dash_end = min(y + dash_length, end_y) + draw.line([(start_x, y), (start_x, dash_end)], fill=color, width=line_width) + y += dash_length + gap_length + else: # Horizontal line + x = start_x + while x < end_x: + dash_end = min(x + dash_length, end_x) + draw.line([(x, start_y), (dash_end, start_y)], fill=color, width=line_width) + x += dash_length + gap_length - # Add a subtle inner highlight for better visibility - if x2 - x1 > 6 and y2 - y1 > 6: - draw.rectangle([x1 + 1, y1 + 1, x2 - 1, y2 - 1], outline=color, width=1) + # Draw dashed rectangle + draw_dashed_line(x1, y1, x2, y1) # Top + draw_dashed_line(x2, y1, x2, y2) # Right + draw_dashed_line(x2, y2, x1, y2) # Bottom + draw_dashed_line(x1, y2, x1, y1) # Left - # Draw bigger index overlay if we have index text + # Draw much bigger index overlay if we have index text if text: try: - # Use bigger font size for index - big_font = None + # Use much bigger font size for index (5x bigger base) + huge_font = None + font_size = 32 # Much bigger than the original 16 try: - big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 16) + huge_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', font_size) except (OSError, IOError): try: - big_font = ImageFont.truetype('arial.ttf', 16) + huge_font = ImageFont.truetype('arial.ttf', font_size) except (OSError, IOError): - big_font = font # Fallback to original font + # Try system fonts on different platforms + try: + huge_font = ImageFont.truetype('Arial Bold.ttf', font_size) + except (OSError, IOError): + huge_font = font # Fallback to original font - # Get text size with bigger font - if big_font: - bbox_text = draw.textbbox((0, 0), text, font=big_font) + # Get text size with much bigger font + if huge_font: + bbox_text = draw.textbbox((0, 0), text, font=huge_font) text_width = bbox_text[2] - bbox_text[0] text_height = bbox_text[3] - bbox_text[1] else: @@ -97,64 +118,41 @@ def draw_enhanced_bounding_box_with_text( text_width = bbox_text[2] - bbox_text[0] text_height = bbox_text[3] - bbox_text[1] - # Bigger padding for more prominent index - padding = 8 + # Much bigger padding (5x bigger) + padding = 20 element_width = x2 - x1 element_height = y2 - y1 - # Always try to place inside the element first, then outside if too small - if element_width >= text_width + padding * 2 and element_height >= text_height + padding * 2: - # Place in top-left corner inside the element + # Simple positioning logic: always top-left + # Inside if element is big enough, outside if too small + min_container_width = text_width + padding * 2 + min_container_height = text_height + padding * 2 + + if element_width >= min_container_width and element_height >= min_container_height: + # Place inside top-left corner text_x = x1 + padding text_y = y1 + padding else: - # Place outside above the element + # Place outside top-left corner text_x = x1 - text_y = max(0, y1 - text_height - padding) + text_y = max(0, y1 - min_container_height) - # Ensure text stays within image bounds - img_width = 1200 # Default assumption, could be passed as parameter - img_height = 800 - text_x = max(0, min(text_x, img_width - text_width - padding)) - text_y = max(0, min(text_y, img_height - text_height - padding)) + # Ensure text stays within image bounds (use actual image size if available) + img_width, img_height = draw.im.size if hasattr(draw, 'im') else (2000, 1500) # Larger default + text_x = max(0, min(text_x, img_width - min_container_width)) + text_y = max(0, min(text_y, img_height - min_container_height)) - # Draw bigger background rectangle with element-type-specific styling + # Draw much bigger background rectangle (5x bigger) bg_x1 = text_x - padding bg_y1 = text_y - padding bg_x2 = text_x + text_width + padding bg_y2 = text_y + text_height + padding # Use element color as background with white text for high contrast - draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill=color, outline='white', width=2) + draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill=color, outline='white', width=3) # Draw white text on colored background for maximum visibility - draw.text((text_x, text_y), text, fill='white', font=big_font or font) - - # Add element type indicator if space allows - if element_width >= 60 and element_height >= 40: - type_text = element_type.upper()[:3] # Show first 3 chars of element type - try: - small_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 10) - except (OSError, IOError): - small_font = font - - if small_font: - type_bbox = draw.textbbox((0, 0), type_text, font=small_font) - type_width = type_bbox[2] - type_bbox[0] - type_height = type_bbox[3] - type_bbox[1] - - # Place type text in bottom-right corner - type_x = x2 - type_width - 4 - type_y = y2 - type_height - 4 - - # Small background for type text - draw.rectangle( - [type_x - 2, type_y - 1, type_x + type_width + 2, type_y + type_height + 1], - fill='rgba(0,0,0,128)', - outline=color, - width=1, - ) - draw.text((type_x, type_y), type_text, fill='white', font=small_font) + draw.text((text_x, text_y), text, fill='white', font=huge_font or font) except Exception as e: logger.debug(f'Failed to draw enhanced text overlay: {e}') From 8511c2d3980eb7faef4244ea80ba8db9666dd4b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 20:38:57 -0700 Subject: [PATCH 47/69] Remove display highlights on screen feature and adjust bounding box drawing parameters - Eliminated the `display_highlights_on_screen` option from `BrowserProfile` and `BrowserSession` to streamline the highlighting functionality. - Updated the `draw_enhanced_bounding_box_with_text` function to increase font size for better visibility and reduced padding for a more compact layout. - Removed associated screenshot overlay logic in `DOMWatchdog` to reflect the removal of the display highlights feature. --- browser_use/browser/profile.py | 3 - browser_use/browser/python_highlights.py | 6 +- browser_use/browser/session.py | 1 - browser_use/browser/watchdogs/dom_watchdog.py | 88 ------------------- 4 files changed, 3 insertions(+), 95 deletions(-) diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py index c9170b8b6..e29d910b0 100644 --- a/browser_use/browser/profile.py +++ b/browser_use/browser/profile.py @@ -583,9 +583,6 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro # --- UI/viewport/DOM --- highlight_elements: bool = Field(default=True, description='Highlight interactive elements on the page.') - display_highlights_on_screen: bool = Field( - default=True, description='Display highlights directly on the browser screen in addition to screenshots.' - ) # --- Downloads --- auto_download_pdfs: bool = Field( diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index 5644f1eac..cf958e8f4 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -94,7 +94,7 @@ def draw_enhanced_bounding_box_with_text( try: # Use much bigger font size for index (5x bigger base) huge_font = None - font_size = 32 # Much bigger than the original 16 + font_size = 35 # Much bigger than the original 16 try: huge_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', font_size) except (OSError, IOError): @@ -118,8 +118,8 @@ def draw_enhanced_bounding_box_with_text( text_width = bbox_text[2] - bbox_text[0] text_height = bbox_text[3] - bbox_text[1] - # Much bigger padding (5x bigger) - padding = 20 + # No padding - container fits exactly around the number + padding = 10 element_width = x2 - x1 element_height = y2 - y1 diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 910324cf9..b4b01f62c 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -265,7 +265,6 @@ class BrowserSession(BaseModel): wait_for_network_idle_page_load_time: float | None = None, wait_between_actions: float | None = None, highlight_elements: bool | None = None, - display_highlights_on_screen: bool | None = None, auto_download_pdfs: bool | None = None, profile_directory: str | None = None, ): diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index 72dee72cc..0db217d8b 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -239,94 +239,6 @@ class DOMWatchdog(BaseWatchdog): except Exception as e: self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Python highlighting failed: {e}') - # Display highlighted screenshot overlay on screen if requested - if ( - screenshot_b64 - and content - and content.selector_map - and self.browser_session.browser_profile.display_highlights_on_screen - ): - try: - self.logger.debug( - '🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🖥️ Displaying highlighted screenshot overlay...' - ) - - # Use the existing screenshot (which has correct scaling) and add highlights to it - # Get CDP session for viewport info - cdp_session = await self.browser_session.get_or_create_cdp_session() - - # Create highlighted version of the existing screenshot - from browser_use.browser.python_highlights import create_highlighted_screenshot_async - - highlighted_screenshot_b64 = await create_highlighted_screenshot_async( - screenshot_b64, content.selector_map, cdp_session - ) - - # Get viewport dimensions and device pixel ratio for proper scaling - viewport_info = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id) - visual_viewport = viewport_info.get('visualViewport', {}) - css_visual_viewport = viewport_info.get('cssVisualViewport', {}) - - # Get CSS pixel dimensions (what the browser shows) - css_width = css_visual_viewport.get('clientWidth', 1280) - css_height = css_visual_viewport.get('clientHeight', 720) - - # Get device pixel dimensions (what the screenshot is) - device_width = visual_viewport.get('clientWidth', css_width) - device_height = visual_viewport.get('clientHeight', css_height) - - # Calculate device pixel ratio - device_pixel_ratio = device_width / css_width if css_width > 0 else 1.0 - - # Inject the highlighted screenshot as an overlay with correct DPR scaling - overlay_script = f""" - (function() {{ - // Remove any existing browser-use overlay - const existingOverlay = document.getElementById('browser-use-highlight-overlay'); - if (existingOverlay) {{ - existingOverlay.remove(); - }} - - // Get actual viewport dimensions in CSS pixels - const cssWidth = window.innerWidth; - const cssHeight = window.innerHeight; - const devicePixelRatio = window.devicePixelRatio || 1; - - // Create overlay container - const overlay = document.createElement('div'); - overlay.id = 'browser-use-highlight-overlay'; - overlay.style.cssText = ` - position: fixed; - top: 0; - left: 0; - width: ${{cssWidth}}px; - height: ${{cssHeight}}px; - pointer-events: none; - z-index: 2147483647; - opacity: 0.4; - background-image: url(data:image/png;base64,{highlighted_screenshot_b64}); - background-size: ${{cssWidth}}px ${{cssHeight}}px; - background-position: top left; - background-repeat: no-repeat; - `; - - document.body.appendChild(overlay); - console.log('Browser-use highlighted screenshot overlay injected with', '{len(content.selector_map)}', 'elements'); - console.log('CSS viewport:', cssWidth, 'x', cssHeight, 'DPR:', devicePixelRatio); - }})(); - """ - - # Execute the overlay script - await cdp_session.cdp_client.send.Runtime.evaluate( - params={'expression': overlay_script, 'returnByValue': True}, session_id=cdp_session.session_id - ) - - self.logger.debug( - f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Displayed highlighted screenshot overlay with {len(content.selector_map)} elements' - ) - except Exception as e: - self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Screenshot overlay failed: {e}') - # Ensure we have valid content if not content: content = SerializedDOMState(_root=None, selector_map={}) From 66822d60906c4db7073b120e0c0b7818f58789ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 20:45:54 -0700 Subject: [PATCH 48/69] Update font size and padding in bounding box drawing functions for improved visibility - Increased font size in `draw_enhanced_bounding_box_with_text` from 35 to 40 for better readability. - Adjusted padding in `draw_bounding_box_with_text` from 3 to 5 to enhance spacing around text elements. --- browser_use/browser/python_highlights.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index cf958e8f4..9a91445f5 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -94,7 +94,7 @@ def draw_enhanced_bounding_box_with_text( try: # Use much bigger font size for index (5x bigger base) huge_font = None - font_size = 35 # Much bigger than the original 16 + font_size = 40 # Much bigger than the original 16 try: huge_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', font_size) except (OSError, IOError): @@ -219,7 +219,7 @@ def draw_bounding_box_with_text( text_height = bbox_text[3] - bbox_text[1] # Smart positioning based on element size - padding = 3 + padding = 5 element_width = x2 - x1 element_height = y2 - y1 element_area = element_width * element_height From 0913a8a345744fcaa78f2ef7fa96be24232093aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Fri, 29 Aug 2025 20:51:39 -0700 Subject: [PATCH 49/69] fixed `FRAME` elements --- browser_use/dom/serializer/serializer.py | 6 +++--- browser_use/dom/views.py | 20 +++++++++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/browser_use/dom/serializer/serializer.py b/browser_use/dom/serializer/serializer.py index 26ef9b62a..d660a648b 100644 --- a/browser_use/dom/serializer/serializer.py +++ b/browser_use/dom/serializer/serializer.py @@ -143,10 +143,10 @@ class DOMTreeSerializer: if node.node_name.lower() in DISABLED_ELEMENTS: return None - if node.node_name == 'IFRAME': + if node.node_name == 'IFRAME' or node.node_name == 'FRAME': if node.content_document: simplified = SimplifiedNode(original_node=node, children=[]) - for child in node.content_document.children: + for child in node.content_document.children_nodes or []: simplified_child = self._create_simplified_tree(child) if simplified_child: simplified.children.append(simplified_child) @@ -159,7 +159,7 @@ class DOMTreeSerializer: is_scrollable = node.is_actually_scrollable # Include if interactive (regardless of visibility), or scrollable, or has children to process - should_include = (is_interactive and is_visible) or is_scrollable or node.children_and_shadow_roots + should_include = (is_interactive and is_visible) or is_scrollable or bool(node.children_and_shadow_roots) if should_include: simplified = SimplifiedNode(original_node=node, children=[]) diff --git a/browser_use/dom/views.py b/browser_use/dom/views.py index 0f182888f..e07918e7f 100644 --- a/browser_use/dom/views.py +++ b/browser_use/dom/views.py @@ -91,14 +91,28 @@ class SimplifiedNode: is_new: bool = False excluded_by_parent: bool = False # New field for bbox filtering + def _clean_original_node_json(self, node_json: dict) -> dict: + """Recursively remove children_nodes and shadow_roots from original_node JSON.""" + # Remove the fields we don't want in SimplifiedNode serialization + if 'children_nodes' in node_json: + del node_json['children_nodes'] + if 'shadow_roots' in node_json: + del node_json['shadow_roots'] + + # Clean nested content_document if it exists + if node_json.get('content_document'): + node_json['content_document'] = self._clean_original_node_json(node_json['content_document']) + + return node_json + def __json__(self) -> dict: original_node_json = self.original_node.__json__() - del original_node_json['children_nodes'] - del original_node_json['shadow_roots'] + # Remove children_nodes and shadow_roots to avoid duplication with SimplifiedNode.children + cleaned_original_node_json = self._clean_original_node_json(original_node_json) return { 'should_display': self.should_display, 'interactive_index': self.interactive_index, - 'original_node': original_node_json, + 'original_node': cleaned_original_node_json, 'children': [c.__json__() for c in self.children], } From e8bd85865d907e8800a98026aa25baef409c0955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 20:56:03 -0700 Subject: [PATCH 50/69] Add filter for highlight IDs in screenshot generation and update element colors - Introduced `filter_highlight_ids` option in `BrowserProfile`, `BrowserSession`, and `create_highlighted_screenshot` to control the visibility of element IDs based on text length. - Updated `draw_enhanced_bounding_box_with_text` to utilize the new filtering logic for enhanced visual clarity. - Changed the color of text areas from yellow to orange for improved visibility in the UI. --- browser_use/browser/profile.py | 3 ++ browser_use/browser/python_highlights.py | 32 ++++++++++++++----- browser_use/browser/session.py | 1 + browser_use/browser/watchdogs/dom_watchdog.py | 7 +++- 4 files changed, 34 insertions(+), 9 deletions(-) diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py index e29d910b0..86e4eada5 100644 --- a/browser_use/browser/profile.py +++ b/browser_use/browser/profile.py @@ -583,6 +583,9 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro # --- UI/viewport/DOM --- highlight_elements: bool = Field(default=True, description='Highlight interactive elements on the page.') + filter_highlight_ids: bool = Field( + default=True, description='Only show element IDs in highlights if llm_representation is less than 10 characters.' + ) # --- Downloads --- auto_download_pdfs: bool = Field( diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index 9a91445f5..6a64540e2 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -22,7 +22,7 @@ ELEMENT_COLORS = { 'input': '#4ECDC4', # Teal for inputs 'select': '#45B7D1', # Blue for dropdowns 'a': '#96CEB4', # Green for links - 'textarea': '#FFEAA7', # Yellow for text areas + 'textarea': '#FF8C42', # Orange for text areas (was yellow, now more visible) 'default': '#DDA0DD', # Light purple for other interactive elements } @@ -59,6 +59,7 @@ def draw_enhanced_bounding_box_with_text( text: Optional[str] = None, font: Optional[ImageFont.FreeTypeFont] = None, element_type: str = 'div', + image_size: Tuple[int, int] = (2000, 1500), ) -> None: """Draw an enhanced bounding box with much bigger index containers and dashed borders.""" x1, y1, x2, y2 = bbox @@ -137,8 +138,8 @@ def draw_enhanced_bounding_box_with_text( text_x = x1 text_y = max(0, y1 - min_container_height) - # Ensure text stays within image bounds (use actual image size if available) - img_width, img_height = draw.im.size if hasattr(draw, 'im') else (2000, 1500) # Larger default + # Ensure text stays within image bounds using actual image dimensions + img_width, img_height = image_size text_x = max(0, min(text_x, img_width - min_container_width)) text_y = max(0, min(text_y, img_height - min_container_height)) @@ -271,6 +272,7 @@ def create_highlighted_screenshot( device_pixel_ratio: float = 1.0, viewport_offset_x: int = 0, viewport_offset_y: int = 0, + filter_highlight_ids: bool = True, ) -> str: """Create a highlighted screenshot with bounding boxes around interactive elements. @@ -337,12 +339,22 @@ def create_highlighted_screenshot( color = get_element_color(tag_name, element_type) - # Get element index for overlay + # Get element index for overlay and apply filtering element_index = getattr(element, 'element_index', None) - index_text = str(element_index) if element_index is not None else None + index_text = None + + if element_index is not None: + if filter_highlight_ids: + # Only show ID if llm_representation is less than 10 characters (elements with little text need visual ID) + element_text = element.get_all_children_text() + if len(element_text) < 5: + index_text = str(element_index) + else: + # Always show ID when filter is disabled + index_text = str(element_index) # Draw enhanced bounding box with bigger index - draw_enhanced_bounding_box_with_text(draw, (x1, y1, x2, y2), color, index_text, font, tag_name) + draw_enhanced_bounding_box_with_text(draw, (x1, y1, x2, y2), color, index_text, font, tag_name, image.size) except Exception as e: logger.debug(f'Failed to draw highlight for element {element_id}: {e}') @@ -396,7 +408,9 @@ async def get_viewport_info_from_cdp(cdp_session) -> Tuple[float, int, int]: @observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot_async') -async def create_highlighted_screenshot_async(screenshot_b64: str, selector_map: DOMSelectorMap, cdp_session=None) -> str: +async def create_highlighted_screenshot_async( + screenshot_b64: str, selector_map: DOMSelectorMap, cdp_session=None, filter_highlight_ids: bool = True +) -> str: """Async wrapper for creating highlighted screenshots. Args: @@ -419,4 +433,6 @@ async def create_highlighted_screenshot_async(screenshot_b64: str, selector_map: logger.debug(f'Failed to get viewport info from CDP: {e}') # Create highlighted screenshot (run in thread pool if needed for performance) - return create_highlighted_screenshot(screenshot_b64, selector_map, device_pixel_ratio, viewport_offset_x, viewport_offset_y) + return create_highlighted_screenshot( + screenshot_b64, selector_map, device_pixel_ratio, viewport_offset_x, viewport_offset_y, filter_highlight_ids + ) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index b4b01f62c..b6ccb06f1 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -265,6 +265,7 @@ class BrowserSession(BaseModel): wait_for_network_idle_page_load_time: float | None = None, wait_between_actions: float | None = None, highlight_elements: bool | None = None, + filter_highlight_ids: bool | None = None, auto_download_pdfs: bool | None = None, profile_directory: str | None = None, ): diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index 0db217d8b..2558dc777 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -232,7 +232,12 @@ class DOMWatchdog(BaseWatchdog): # Get CDP session for viewport info cdp_session = await self.browser_session.get_or_create_cdp_session() - screenshot_b64 = await create_highlighted_screenshot_async(screenshot_b64, content.selector_map, cdp_session) + screenshot_b64 = await create_highlighted_screenshot_async( + screenshot_b64, + content.selector_map, + cdp_session, + self.browser_session.browser_profile.filter_highlight_ids, + ) self.logger.debug( f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Applied highlights to {len(content.selector_map)} elements' ) From 3d8e00f0b02e0b147cf0761398d5beee47866a49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20=C5=BDuni=C4=8D?= <36313686+gregpr07@users.noreply.github.com> Date: Fri, 29 Aug 2025 21:04:50 -0700 Subject: [PATCH 51/69] added FRAME to everywhere where we have IFRAME --- browser_use/dom/serializer/clickable_elements.py | 2 +- browser_use/dom/serializer/serializer.py | 10 +++++++++- browser_use/dom/service.py | 8 ++++++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/browser_use/dom/serializer/clickable_elements.py b/browser_use/dom/serializer/clickable_elements.py index 807bbcc01..e1d514629 100644 --- a/browser_use/dom/serializer/clickable_elements.py +++ b/browser_use/dom/serializer/clickable_elements.py @@ -20,7 +20,7 @@ class ClickableElementDetector: # IFRAME elements should be interactive if they're large enough to potentially need scrolling # Small iframes (< 100px width or height) are unlikely to have scrollable content - if node.tag_name and node.tag_name.upper() == 'IFRAME': + if node.tag_name and node.tag_name.upper() == 'IFRAME' or node.tag_name.upper() == 'FRAME': if node.snapshot_node and node.snapshot_node.bounds: width = node.snapshot_node.bounds.width height = node.snapshot_node.bounds.height diff --git a/browser_use/dom/serializer/serializer.py b/browser_use/dom/serializer/serializer.py index d660a648b..ec74e0702 100644 --- a/browser_use/dom/serializer/serializer.py +++ b/browser_use/dom/serializer/serializer.py @@ -435,7 +435,12 @@ class DOMTreeSerializer: # Add element with interactive_index if clickable, scrollable, or iframe is_any_scrollable = node.original_node.is_actually_scrollable or node.original_node.is_scrollable should_show_scroll = node.original_node.should_show_scroll_info - if node.interactive_index is not None or is_any_scrollable or node.original_node.tag_name.upper() == 'IFRAME': + if ( + node.interactive_index is not None + or is_any_scrollable + or node.original_node.tag_name.upper() == 'IFRAME' + or node.original_node.tag_name.upper() == 'FRAME' + ): next_depth += 1 # Build attributes string @@ -453,6 +458,9 @@ class DOMTreeSerializer: elif node.original_node.tag_name.upper() == 'IFRAME': # Iframe element (not interactive) line = f'{depth_str}|IFRAME|<{node.original_node.tag_name}' + elif node.original_node.tag_name.upper() == 'FRAME': + # Frame element (not interactive) + line = f'{depth_str}|FRAME|<{node.original_node.tag_name}' else: line = f'{depth_str}<{node.original_node.tag_name}' diff --git a/browser_use/dom/service.py b/browser_use/dom/service.py index 9076ece75..155c2d498 100644 --- a/browser_use/dom/service.py +++ b/browser_use/dom/service.py @@ -187,7 +187,7 @@ class DomService: for frame in reversed(html_frames): if ( frame.node_type == NodeType.ELEMENT_NODE - and frame.node_name.upper() == 'IFRAME' + and (frame.node_name.upper() == 'IFRAME' or frame.node_name.upper() == 'FRAME') and frame.snapshot_node and frame.snapshot_node.bounds ): @@ -561,7 +561,11 @@ class DomService: ) # Calculate new iframe offset for content documents, accounting for iframe scroll - if node['nodeName'].upper() == 'IFRAME' and snapshot_data and snapshot_data.bounds: + if ( + (node['nodeName'].upper() == 'IFRAME' or node['nodeName'].upper() == 'FRAME') + and snapshot_data + and snapshot_data.bounds + ): if snapshot_data.bounds: updated_html_frames.append(dom_tree_node) From a31ae9dffb835078bfde5bdfeceea1273c277a71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 21:30:11 -0700 Subject: [PATCH 52/69] Refactor bounding box drawing and enhance LLM text representation - Removed the `_raise_if_stopped_or_paused` method from the `Agent` class to streamline functionality. - Updated `draw_enhanced_bounding_box_with_text` to improve font size and padding for better visibility of index boxes. - Introduced `get_meaningful_text_for_llm` method in `EnhancedDOMTreeNode` to provide more relevant text for LLM processing. - Enhanced `llm_representation` method with observability features for better debugging. --- browser_use/agent/service.py | 1 - browser_use/browser/python_highlights.py | 94 ++++++++++++++---------- browser_use/dom/views.py | 21 ++++++ 3 files changed, 78 insertions(+), 38 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index c75663c58..98b01e8c3 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -599,7 +599,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): if hasattr(self, 'cloud_sync') and self.cloud_sync and self.enable_cloud_sync: self.eventbus.on('*', self.cloud_sync.handle_event) - @observe_debug(ignore_input=True, ignore_output=True, name='_raise_if_stopped_or_paused') async def _raise_if_stopped_or_paused(self) -> None: """Utility function that raises an InterruptedError if the agent is stopped or paused.""" diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index 6a64540e2..d22dc303d 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -93,24 +93,24 @@ def draw_enhanced_bounding_box_with_text( # Draw much bigger index overlay if we have index text if text: try: - # Use much bigger font size for index (5x bigger base) - huge_font = None - font_size = 40 # Much bigger than the original 16 + # Use much bigger font size for visible index boxes + big_font = None + font_size = 30 # Much bigger, more visible size try: - huge_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', font_size) + big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', font_size) except (OSError, IOError): try: - huge_font = ImageFont.truetype('arial.ttf', font_size) + big_font = ImageFont.truetype('arial.ttf', font_size) except (OSError, IOError): # Try system fonts on different platforms try: - huge_font = ImageFont.truetype('Arial Bold.ttf', font_size) + big_font = ImageFont.truetype('Arial Bold.ttf', font_size) except (OSError, IOError): - huge_font = font # Fallback to original font + big_font = font # Fallback to original font - # Get text size with much bigger font - if huge_font: - bbox_text = draw.textbbox((0, 0), text, font=huge_font) + # Get text size with bigger font + if big_font: + bbox_text = draw.textbbox((0, 0), text, font=big_font) text_width = bbox_text[2] - bbox_text[0] text_height = bbox_text[3] - bbox_text[1] else: @@ -119,41 +119,60 @@ def draw_enhanced_bounding_box_with_text( text_width = bbox_text[2] - bbox_text[0] text_height = bbox_text[3] - bbox_text[1] - # No padding - container fits exactly around the number - padding = 10 + # Bigger padding for more prominent index boxes + padding = 8 element_width = x2 - x1 element_height = y2 - y1 - # Simple positioning logic: always top-left - # Inside if element is big enough, outside if too small - min_container_width = text_width + padding * 2 - min_container_height = text_height + padding * 2 + # Container dimensions + container_width = text_width + padding * 2 + container_height = text_height + padding * 2 - if element_width >= min_container_width and element_height >= min_container_height: + # Position in top-left corner (inside if fits, outside if too small) + if element_width >= container_width and element_height >= container_height: # Place inside top-left corner - text_x = x1 + padding - text_y = y1 + padding + bg_x1 = x1 + 2 # Small offset from edge + bg_y1 = y1 + 2 else: # Place outside top-left corner - text_x = x1 - text_y = max(0, y1 - min_container_height) + bg_x1 = x1 + bg_y1 = max(0, y1 - container_height) - # Ensure text stays within image bounds using actual image dimensions + bg_x2 = bg_x1 + container_width + bg_y2 = bg_y1 + container_height + + # Center the number within the index box + text_x = bg_x1 + (container_width - text_width) // 2 + text_y = bg_y1 + (container_height - text_height) // 2 + + # Ensure container stays within image bounds img_width, img_height = image_size - text_x = max(0, min(text_x, img_width - min_container_width)) - text_y = max(0, min(text_y, img_height - min_container_height)) + if bg_x1 < 0: + offset = -bg_x1 + bg_x1 += offset + bg_x2 += offset + text_x += offset + if bg_y1 < 0: + offset = -bg_y1 + bg_y1 += offset + bg_y2 += offset + text_y += offset + if bg_x2 > img_width: + offset = bg_x2 - img_width + bg_x1 -= offset + bg_x2 -= offset + text_x -= offset + if bg_y2 > img_height: + offset = bg_y2 - img_height + bg_y1 -= offset + bg_y2 -= offset + text_y -= offset - # Draw much bigger background rectangle (5x bigger) - bg_x1 = text_x - padding - bg_y1 = text_y - padding - bg_x2 = text_x + text_width + padding - bg_y2 = text_y + text_height + padding + # Draw bigger background rectangle with thicker border + draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill=color, outline='white', width=2) - # Use element color as background with white text for high contrast - draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill=color, outline='white', width=3) - - # Draw white text on colored background for maximum visibility - draw.text((text_x, text_y), text, fill='white', font=huge_font or font) + # Draw white text centered in the index box + draw.text((text_x, text_y), text, fill='white', font=big_font or font) except Exception as e: logger.debug(f'Failed to draw enhanced text overlay: {e}') @@ -345,9 +364,10 @@ def create_highlighted_screenshot( if element_index is not None: if filter_highlight_ids: - # Only show ID if llm_representation is less than 10 characters (elements with little text need visual ID) - element_text = element.get_all_children_text() - if len(element_text) < 5: + # Use the meaningful text that matches what the LLM sees + meaningful_text = element.get_meaningful_text_for_llm() + # Show ID only if meaningful text is less than 5 characters + if len(meaningful_text) < 5: index_text = str(element_index) else: # Always show ID when filter is disabled diff --git a/browser_use/dom/views.py b/browser_use/dom/views.py index 0f182888f..92d3eaea1 100644 --- a/browser_use/dom/views.py +++ b/browser_use/dom/views.py @@ -12,6 +12,7 @@ from cdp_use.cdp.target.types import SessionID, TargetID, TargetInfo from uuid_extensions import uuid7str from browser_use.dom.utils import cap_text_length +from browser_use.observability import observe_debug # Serializer types DEFAULT_INCLUDE_ATTRIBUTES = [ @@ -412,6 +413,25 @@ class EnhancedDOMTreeNode: return f'<{self.tag_name}>{cap_text_length(self.get_all_children_text(), max_text_length) or ""}' + def get_meaningful_text_for_llm(self) -> str: + """ + Get the meaningful text content that the LLM actually sees for this element. + This matches exactly what goes into the DOMTreeSerializer output. + """ + meaningful_text = '' + if hasattr(self, 'attributes') and self.attributes: + # Priority order: value, aria-label, title, placeholder, alt, text content + for attr in ['value', 'aria-label', 'title', 'placeholder', 'alt']: + if attr in self.attributes and self.attributes[attr]: + meaningful_text = self.attributes[attr] + break + + # Fallback to text content if no meaningful attributes + if not meaningful_text: + meaningful_text = self.get_all_children_text() + + return meaningful_text.strip() + @property def is_actually_scrollable(self) -> bool: """ @@ -677,6 +697,7 @@ class SerializedDOMState: selector_map: DOMSelectorMap + @observe_debug(ignore_input=True, ignore_output=True, name='llm_representation') def llm_representation( self, include_attributes: list[str] | None = None, From 7b0e216ff015f79435cc775f08b4383ec3f97d3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 21:36:25 -0700 Subject: [PATCH 53/69] Update font size and padding adjustments in bounding box drawing for enhanced visibility - Increased font size in `draw_enhanced_bounding_box_with_text` from 30 to 36 for improved readability. - Adjusted padding from 8 to 6 to accommodate the larger font while maintaining visual clarity. - Enhanced text positioning logic to prevent clipping and ensure proper alignment within the bounding box. --- browser_use/browser/python_highlights.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index d22dc303d..1b6ee2917 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -95,7 +95,7 @@ def draw_enhanced_bounding_box_with_text( try: # Use much bigger font size for visible index boxes big_font = None - font_size = 30 # Much bigger, more visible size + font_size = 36 # Much bigger, more visible size try: big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', font_size) except (OSError, IOError): @@ -119,8 +119,8 @@ def draw_enhanced_bounding_box_with_text( text_width = bbox_text[2] - bbox_text[0] text_height = bbox_text[3] - bbox_text[1] - # Bigger padding for more prominent index boxes - padding = 8 + # Bigger padding for more prominent index boxes - extra space for large font + padding = 6 element_width = x2 - x1 element_height = y2 - y1 @@ -141,9 +141,10 @@ def draw_enhanced_bounding_box_with_text( bg_x2 = bg_x1 + container_width bg_y2 = bg_y1 + container_height - # Center the number within the index box + # Center the number within the index box with proper baseline handling text_x = bg_x1 + (container_width - text_width) // 2 - text_y = bg_y1 + (container_height - text_height) // 2 + # Add extra vertical space to prevent clipping + text_y = bg_y1 + (container_height - text_height) // 2 - bbox_text[1] # Subtract top offset # Ensure container stays within image bounds img_width, img_height = image_size From 4a62fe11f916f3e57def1f6dd208d1a3e0778609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 21:42:14 -0700 Subject: [PATCH 54/69] More logging --- browser_use/browser/session.py | 3 +++ browser_use/browser/watchdogs/dom_watchdog.py | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index b6ccb06f1..60e8dbd64 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -813,6 +813,7 @@ class BrowserSession(BaseModel): assert self._cdp_client_root is not None, 'CDP client not initialized - browser may not be connected yet' return self._cdp_client_root + @observe_debug(ignore_input=True, ignore_output=True, name='get_or_create_cdp_session') async def get_or_create_cdp_session( self, target_id: TargetID | None = None, focus: bool = True, new_socket: bool | None = None ) -> CDPSession: @@ -1345,6 +1346,7 @@ class BrowserSession(BaseModel): except Exception as e: self.logger.debug(f'Skipping proxy auth setup: {type(e).__name__}: {e}') + @observe_debug(ignore_input=True, ignore_output=True, name='get_tabs') async def get_tabs(self) -> list[TabInfo]: """Get information about all open tabs using CDP Target.getTargetInfo for speed.""" tabs = [] @@ -1423,6 +1425,7 @@ class BrowserSession(BaseModel): return target return None + @observe_debug(ignore_input=True, ignore_output=True, name='get_current_page_url') async def get_current_page_url(self) -> str: """Get the URL of the current page using CDP.""" target = await self.get_current_target_info() diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index 2558dc777..10ed2c1fc 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -16,6 +16,7 @@ from browser_use.dom.views import ( EnhancedDOMTreeNode, SerializedDOMState, ) +from browser_use.observability import observe_debug if TYPE_CHECKING: from browser_use.browser.views import BrowserStateSummary, PageInfo @@ -424,6 +425,7 @@ class DOMWatchdog(BaseWatchdog): ) raise + @observe_debug(ignore_input=True, ignore_output=True, name='build_dom_tree_without_highlights') async def _build_dom_tree_without_highlights(self, previous_state: SerializedDOMState | None = None) -> SerializedDOMState: """Build DOM tree without injecting JavaScript highlights (for parallel execution).""" try: @@ -471,6 +473,7 @@ class DOMWatchdog(BaseWatchdog): ) raise + @observe_debug(ignore_input=True, ignore_output=True, name='capture_clean_screenshot') async def _capture_clean_screenshot(self) -> str: """Capture a clean screenshot without JavaScript highlights.""" try: @@ -524,6 +527,7 @@ class DOMWatchdog(BaseWatchdog): elapsed = time.time() - start_time self.logger.debug(f'✅ Page stability wait completed in {elapsed:.2f}s') + @observe_debug(ignore_input=True, ignore_output=True, name='get_page_info') async def _get_page_info(self) -> 'PageInfo': """Get comprehensive page information using a single CDP call. From d8b23062b961f753eef31098776e20fc7274eaea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 21:46:45 -0700 Subject: [PATCH 55/69] Formatter --- browser_use/browser/python_highlights.py | 31 ++++++++++++------------ 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index 1b6ee2917..5be9f1bc9 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -7,7 +7,6 @@ to draw bounding boxes around interactive elements directly on screenshots. import base64 import io import logging -from typing import Optional, Tuple from PIL import Image, ImageDraw, ImageFont @@ -36,7 +35,7 @@ ELEMENT_TYPE_MAP = { } -def get_element_color(tag_name: str, element_type: Optional[str] = None) -> str: +def get_element_color(tag_name: str, element_type: str | None = None) -> str: """Get color for element based on tag name and type.""" # Check input type first if tag_name == 'input' and element_type: @@ -47,19 +46,19 @@ def get_element_color(tag_name: str, element_type: Optional[str] = None) -> str: return ELEMENT_COLORS.get(tag_name.lower(), ELEMENT_COLORS['default']) -def should_show_index_overlay(element_index: Optional[int]) -> bool: +def should_show_index_overlay(element_index: int | None) -> bool: """Determine if index overlay should be shown.""" return element_index is not None def draw_enhanced_bounding_box_with_text( draw, # ImageDraw.Draw - avoiding type annotation due to PIL typing issues - bbox: Tuple[int, int, int, int], + bbox: tuple[int, int, int, int], color: str, - text: Optional[str] = None, - font: Optional[ImageFont.FreeTypeFont] = None, + text: str | None = None, + font: ImageFont.FreeTypeFont | None = None, element_type: str = 'div', - image_size: Tuple[int, int] = (2000, 1500), + image_size: tuple[int, int] = (2000, 1500), ) -> None: """Draw an enhanced bounding box with much bigger index containers and dashed borders.""" x1, y1, x2, y2 = bbox @@ -98,14 +97,14 @@ def draw_enhanced_bounding_box_with_text( font_size = 36 # Much bigger, more visible size try: big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', font_size) - except (OSError, IOError): + except OSError: try: big_font = ImageFont.truetype('arial.ttf', font_size) - except (OSError, IOError): + except OSError: # Try system fonts on different platforms try: big_font = ImageFont.truetype('Arial Bold.ttf', font_size) - except (OSError, IOError): + except OSError: big_font = font # Fallback to original font # Get text size with bigger font @@ -181,10 +180,10 @@ def draw_enhanced_bounding_box_with_text( def draw_bounding_box_with_text( draw, # ImageDraw.Draw - avoiding type annotation due to PIL typing issues - bbox: Tuple[int, int, int, int], + bbox: tuple[int, int, int, int], color: str, - text: Optional[str] = None, - font: Optional[ImageFont.FreeTypeFont] = None, + text: str | None = None, + font: ImageFont.FreeTypeFont | None = None, ) -> None: """Draw a bounding box with optional text overlay.""" x1, y1, x2, y2 = bbox @@ -318,10 +317,10 @@ def create_highlighted_screenshot( font = None try: font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 12) - except (OSError, IOError): + except OSError: try: font = ImageFont.truetype('arial.ttf', 12) - except (OSError, IOError): + except OSError: font = None # Use default font # Process each interactive element @@ -397,7 +396,7 @@ def create_highlighted_screenshot( return screenshot_b64 -async def get_viewport_info_from_cdp(cdp_session) -> Tuple[float, int, int]: +async def get_viewport_info_from_cdp(cdp_session) -> tuple[float, int, int]: """Get viewport information from CDP session. Returns: From e570acfefc576ceae3c927232da70b8d2a2a1dba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 21:54:16 -0700 Subject: [PATCH 56/69] Enhance DOM service initialization in DOMWatchdog - Updated the instantiation of `DomService` to include `cross_origin_iframes` parameter from the browser session profile, improving its configuration for handling cross-origin iframes. --- browser_use/browser/watchdogs/dom_watchdog.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index 10ed2c1fc..e3605d5c5 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -433,7 +433,11 @@ class DOMWatchdog(BaseWatchdog): # Create or reuse DOM service if self._dom_service is None: - self._dom_service = DomService(browser_session=self.browser_session, logger=self.logger) + self._dom_service = DomService( + browser_session=self.browser_session, + logger=self.logger, + cross_origin_iframes=self.browser_session.browser_profile.cross_origin_iframes, + ) # Get serialized DOM tree using the service self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: Calling DomService.get_serialized_dom_tree...') From ce0876adb7fbde43ca2aae1741bb72f98f47dc60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 22:17:50 -0700 Subject: [PATCH 57/69] highlight in parallel --- browser_use/browser/python_highlights.py | 138 ++++++++++-------- browser_use/browser/session.py | 3 +- browser_use/browser/watchdogs/dom_watchdog.py | 8 +- 3 files changed, 89 insertions(+), 60 deletions(-) diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index 5be9f1bc9..47a7a1ddb 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -4,6 +4,7 @@ This module replaces JavaScript-based highlighting with fast Python image proces to draw bounding boxes around interactive elements directly on screenshots. """ +import asyncio import base64 import io import logging @@ -12,6 +13,7 @@ from PIL import Image, ImageDraw, ImageFont from browser_use.dom.views import DOMSelectorMap from browser_use.observability import observe_debug +from browser_use.utils import time_execution_async logger = logging.getLogger(__name__) @@ -284,8 +286,74 @@ def draw_bounding_box_with_text( logger.debug(f'Failed to draw text overlay: {e}') +async def process_element_highlight( + element_id: int, + element, + draw, + device_pixel_ratio: float, + font, + filter_highlight_ids: bool, + image_size: tuple[int, int], +) -> None: + """Process a single element for highlighting in parallel.""" + try: + # Use absolute_position coordinates directly + if not element.absolute_position: + return + + bounds = element.absolute_position + + # Scale coordinates from CSS pixels to device pixels for screenshot + # The screenshot is captured at device pixel resolution, but coordinates are in CSS pixels + x1 = int(bounds.x * device_pixel_ratio) + y1 = int(bounds.y * device_pixel_ratio) + x2 = int((bounds.x + bounds.width) * device_pixel_ratio) + y2 = int((bounds.y + bounds.height) * device_pixel_ratio) + + # Ensure coordinates are within image bounds + img_width, img_height = image_size + x1 = max(0, min(x1, img_width)) + y1 = max(0, min(y1, img_height)) + x2 = max(x1, min(x2, img_width)) + y2 = max(y1, min(y2, img_height)) + + # Skip if bounding box is too small or invalid + if x2 - x1 < 2 or y2 - y1 < 2: + return + + # Get element color based on type + tag_name = element.tag_name if hasattr(element, 'tag_name') else 'div' + element_type = None + if hasattr(element, 'attributes') and element.attributes: + element_type = element.attributes.get('type') + + color = get_element_color(tag_name, element_type) + + # Get element index for overlay and apply filtering + element_index = getattr(element, 'element_index', None) + index_text = None + + if element_index is not None: + if filter_highlight_ids: + # Use the meaningful text that matches what the LLM sees + meaningful_text = element.get_meaningful_text_for_llm() + # Show ID only if meaningful text is less than 5 characters + if len(meaningful_text) < 5: + index_text = str(element_index) + else: + # Always show ID when filter is disabled + index_text = str(element_index) + + # Draw enhanced bounding box with bigger index + draw_enhanced_bounding_box_with_text(draw, (x1, y1, x2, y2), color, index_text, font, tag_name, image_size) + + except Exception as e: + logger.debug(f'Failed to draw highlight for element {element_id}: {e}') + + @observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot') -def create_highlighted_screenshot( +@time_execution_async('create_highlighted_screenshot') +async def create_highlighted_screenshot( screenshot_b64: str, selector_map: DOMSelectorMap, device_pixel_ratio: float = 1.0, @@ -323,62 +391,17 @@ def create_highlighted_screenshot( except OSError: font = None # Use default font - # Process each interactive element + # Process elements in parallel for better performance + tasks = [] for element_id, element in selector_map.items(): - try: - # Use absolute_position coordinates directly - if not element.absolute_position: - continue + task = process_element_highlight( + element_id, element, draw, device_pixel_ratio, font, filter_highlight_ids, image.size + ) + tasks.append(task) - bounds = element.absolute_position - - # Scale coordinates from CSS pixels to device pixels for screenshot - # The screenshot is captured at device pixel resolution, but coordinates are in CSS pixels - x1 = int(bounds.x * device_pixel_ratio) - y1 = int(bounds.y * device_pixel_ratio) - x2 = int((bounds.x + bounds.width) * device_pixel_ratio) - y2 = int((bounds.y + bounds.height) * device_pixel_ratio) - - # Ensure coordinates are within image bounds - img_width, img_height = image.size - x1 = max(0, min(x1, img_width)) - y1 = max(0, min(y1, img_height)) - x2 = max(x1, min(x2, img_width)) - y2 = max(y1, min(y2, img_height)) - - # Skip if bounding box is too small or invalid - if x2 - x1 < 2 or y2 - y1 < 2: - continue - - # Get element color based on type - tag_name = element.tag_name if hasattr(element, 'tag_name') else 'div' - element_type = None - if hasattr(element, 'attributes') and element.attributes: - element_type = element.attributes.get('type') - - color = get_element_color(tag_name, element_type) - - # Get element index for overlay and apply filtering - element_index = getattr(element, 'element_index', None) - index_text = None - - if element_index is not None: - if filter_highlight_ids: - # Use the meaningful text that matches what the LLM sees - meaningful_text = element.get_meaningful_text_for_llm() - # Show ID only if meaningful text is less than 5 characters - if len(meaningful_text) < 5: - index_text = str(element_index) - else: - # Always show ID when filter is disabled - index_text = str(element_index) - - # Draw enhanced bounding box with bigger index - draw_enhanced_bounding_box_with_text(draw, (x1, y1, x2, y2), color, index_text, font, tag_name, image.size) - - except Exception as e: - logger.debug(f'Failed to draw highlight for element {element_id}: {e}') - continue + # Execute all element processing tasks in parallel + if tasks: + await asyncio.gather(*tasks, return_exceptions=True) # Convert back to base64 output_buffer = io.BytesIO() @@ -428,6 +451,7 @@ async def get_viewport_info_from_cdp(cdp_session) -> tuple[float, int, int]: @observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot_async') +@time_execution_async('create_highlighted_screenshot_async') async def create_highlighted_screenshot_async( screenshot_b64: str, selector_map: DOMSelectorMap, cdp_session=None, filter_highlight_ids: bool = True ) -> str: @@ -452,7 +476,7 @@ async def create_highlighted_screenshot_async( except Exception as e: logger.debug(f'Failed to get viewport info from CDP: {e}') - # Create highlighted screenshot (run in thread pool if needed for performance) - return create_highlighted_screenshot( + # Create highlighted screenshot with async processing + return await create_highlighted_screenshot( screenshot_b64, selector_map, device_pixel_ratio, viewport_offset_x, viewport_offset_y, filter_highlight_ids ) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 60e8dbd64..5e6e9d578 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -40,7 +40,7 @@ from browser_use.browser.profile import BrowserProfile, ProxySettings from browser_use.browser.views import BrowserStateSummary, TabInfo from browser_use.dom.views import EnhancedDOMTreeNode, TargetInfo from browser_use.observability import observe_debug -from browser_use.utils import _log_pretty_url, is_new_tab_page +from browser_use.utils import _log_pretty_url, is_new_tab_page, time_execution_async DEFAULT_BROWSER_PROFILE = BrowserProfile() @@ -813,6 +813,7 @@ class BrowserSession(BaseModel): assert self._cdp_client_root is not None, 'CDP client not initialized - browser may not be connected yet' return self._cdp_client_root + @time_execution_async('get_or_create_cdp_session') @observe_debug(ignore_input=True, ignore_output=True, name='get_or_create_cdp_session') async def get_or_create_cdp_session( self, target_id: TargetID | None = None, focus: bool = True, new_socket: bool | None = None diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index e3605d5c5..af6205211 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -4,6 +4,8 @@ import asyncio import time from typing import TYPE_CHECKING +from utils import time_execution_async + from browser_use.browser.events import ( BrowserErrorEvent, BrowserStateRequestEvent, @@ -232,7 +234,7 @@ class DOMWatchdog(BaseWatchdog): # Get CDP session for viewport info cdp_session = await self.browser_session.get_or_create_cdp_session() - + start = time.time() screenshot_b64 = await create_highlighted_screenshot_async( screenshot_b64, content.selector_map, @@ -240,7 +242,7 @@ class DOMWatchdog(BaseWatchdog): self.browser_session.browser_profile.filter_highlight_ids, ) self.logger.debug( - f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Applied highlights to {len(content.selector_map)} elements' + f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Applied highlights to {len(content.selector_map)} elements in {time.time() - start:.2f}s' ) except Exception as e: self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Python highlighting failed: {e}') @@ -425,6 +427,7 @@ class DOMWatchdog(BaseWatchdog): ) raise + @time_execution_async('build_dom_tree_without_highlights') @observe_debug(ignore_input=True, ignore_output=True, name='build_dom_tree_without_highlights') async def _build_dom_tree_without_highlights(self, previous_state: SerializedDOMState | None = None) -> SerializedDOMState: """Build DOM tree without injecting JavaScript highlights (for parallel execution).""" @@ -477,6 +480,7 @@ class DOMWatchdog(BaseWatchdog): ) raise + @time_execution_async('capture_clean_screenshot') @observe_debug(ignore_input=True, ignore_output=True, name='capture_clean_screenshot') async def _capture_clean_screenshot(self) -> str: """Capture a clean screenshot without JavaScript highlights.""" From 25823ce67085ccd6cc00e0fb68ba2fe22ffc6a5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 22:25:32 -0700 Subject: [PATCH 58/69] make highlights sync --- browser_use/browser/python_highlights.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index 47a7a1ddb..a27a0f80e 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -4,7 +4,6 @@ This module replaces JavaScript-based highlighting with fast Python image proces to draw bounding boxes around interactive elements directly on screenshots. """ -import asyncio import base64 import io import logging @@ -286,7 +285,7 @@ def draw_bounding_box_with_text( logger.debug(f'Failed to draw text overlay: {e}') -async def process_element_highlight( +def process_element_highlight( element_id: int, element, draw, @@ -295,7 +294,7 @@ async def process_element_highlight( filter_highlight_ids: bool, image_size: tuple[int, int], ) -> None: - """Process a single element for highlighting in parallel.""" + """Process a single element for highlighting.""" try: # Use absolute_position coordinates directly if not element.absolute_position: @@ -391,17 +390,10 @@ async def create_highlighted_screenshot( except OSError: font = None # Use default font - # Process elements in parallel for better performance - tasks = [] + # Process elements sequentially to avoid ImageDraw thread safety issues + # PIL ImageDraw is not thread-safe, so we process elements one by one for element_id, element in selector_map.items(): - task = process_element_highlight( - element_id, element, draw, device_pixel_ratio, font, filter_highlight_ids, image.size - ) - tasks.append(task) - - # Execute all element processing tasks in parallel - if tasks: - await asyncio.gather(*tasks, return_exceptions=True) + process_element_highlight(element_id, element, draw, device_pixel_ratio, font, filter_highlight_ids, image.size) # Convert back to base64 output_buffer = io.BytesIO() From aad4b932bfce2f690fdedcf7874ee0835f64b7cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 29 Aug 2025 22:32:05 -0700 Subject: [PATCH 59/69] time_execution_async import --- browser_use/browser/python_highlights.py | 16 +++++++++------- browser_use/browser/watchdogs/dom_watchdog.py | 3 +-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py index a27a0f80e..541313e0e 100644 --- a/browser_use/browser/python_highlights.py +++ b/browser_use/browser/python_highlights.py @@ -93,18 +93,20 @@ def draw_enhanced_bounding_box_with_text( # Draw much bigger index overlay if we have index text if text: try: - # Use much bigger font size for visible index boxes + # Scale font size based on image dimensions for consistent appearance across viewports + img_width, img_height = image_size + # Base font size scales with viewport width (36px for 1200px viewport) + base_font_size = max(16, min(48, int(img_width * 0.03))) # 3% of viewport width big_font = None - font_size = 36 # Much bigger, more visible size try: - big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', font_size) + big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', base_font_size) except OSError: try: - big_font = ImageFont.truetype('arial.ttf', font_size) + big_font = ImageFont.truetype('arial.ttf', base_font_size) except OSError: # Try system fonts on different platforms try: - big_font = ImageFont.truetype('Arial Bold.ttf', font_size) + big_font = ImageFont.truetype('Arial Bold.ttf', base_font_size) except OSError: big_font = font # Fallback to original font @@ -119,8 +121,8 @@ def draw_enhanced_bounding_box_with_text( text_width = bbox_text[2] - bbox_text[0] text_height = bbox_text[3] - bbox_text[1] - # Bigger padding for more prominent index boxes - extra space for large font - padding = 6 + # Scale padding based on viewport size for consistent appearance + padding = max(4, int(img_width * 0.005)) # 0.5% of viewport width element_width = x2 - x1 element_height = y2 - y1 diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py index af6205211..33cad6257 100644 --- a/browser_use/browser/watchdogs/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -4,8 +4,6 @@ import asyncio import time from typing import TYPE_CHECKING -from utils import time_execution_async - from browser_use.browser.events import ( BrowserErrorEvent, BrowserStateRequestEvent, @@ -19,6 +17,7 @@ from browser_use.dom.views import ( SerializedDOMState, ) from browser_use.observability import observe_debug +from browser_use.utils import time_execution_async if TYPE_CHECKING: from browser_use.browser.views import BrowserStateSummary, PageInfo From b177f3166ea78dbc5498f98f055202c22cc0ae7d Mon Sep 17 00:00:00 2001 From: mertunsall Date: Sat, 30 Aug 2025 20:50:42 +0200 Subject: [PATCH 60/69] better exception handling from browser events -> LLM --- browser_use/browser/views.py | 31 +- .../watchdogs/default_action_watchdog.py | 427 ++++++++++-------- browser_use/tools/registry/service.py | 2 + browser_use/tools/service.py | 153 +++---- pyproject.toml | 2 +- 5 files changed, 352 insertions(+), 263 deletions(-) diff --git a/browser_use/browser/views.py b/browser_use/browser/views.py index 355c542ff..dcb5606db 100644 --- a/browser_use/browser/views.py +++ b/browser_use/browser/views.py @@ -126,17 +126,42 @@ class BrowserStateHistory: class BrowserError(Exception): - """Base class for all browser errors""" + """Browser error with structured memory for LLM context management. + + This exception class provides separate memory contexts for browser actions: + - short_term_memory: Immediate context shown once to the LLM for the next action + - long_term_memory: Persistent error information stored across steps + """ message: str + short_term_memory: str | None = None + long_term_memory: str | None = None details: dict[str, Any] | None = None while_handling_event: BaseEvent[Any] | None = None - def __init__(self, message: str, details: dict[str, Any] | None = None, event: BaseEvent[Any] | None = None): + def __init__( + self, + message: str, + short_term_memory: str | None = None, + long_term_memory: str | None = None, + details: dict[str, Any] | None = None, + event: BaseEvent[Any] | None = None, + ): + """Initialize a BrowserError with structured memory contexts. + + Args: + message: Technical error message for logging and debugging + short_term_memory: Context shown once to LLM (e.g., available actions, options) + long_term_memory: Persistent error info stored in agent memory + details: Additional metadata for debugging + event: The browser event that triggered this error + """ self.message = message - super().__init__(message) + self.short_term_memory = short_term_memory + self.long_term_memory = long_term_memory or message # Fallback to message if not provided self.details = details self.while_handling_event = event + super().__init__(message) def __str__(self) -> str: if self.details: diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index 476b629a4..9cfc8c19e 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -58,7 +58,8 @@ class DefaultActionWatchdog(BaseWatchdog): msg = f'Index {index_for_logging} - has an element which opens file upload dialog. To upload files please use a specific function to upload files' self.logger.info(msg) raise BrowserError( - 'Click triggered a file input element which could not be handled, use the dedicated file upload function instead' + message=msg, + long_term_memory=msg, ) # Perform the actual click using internal implementation @@ -230,13 +231,18 @@ class DefaultActionWatchdog(BaseWatchdog): self.logger.warning( f'Cannot click on elements. Use get_dropdown_options(index={element_node.element_index}) action instead.' + msg = f'Cannot click on elements. Use get_dropdown_options(index={element_node.element_index}) action instead.' - ) msg = f'Cannot click on elements.' in str(e): try: return await get_dropdown_options( params=GetDropdownOptionsAction(index=params.index), browser_session=browser_session @@ -301,6 +296,9 @@ class Tools(Generic[Context]): f'Failed to get dropdown options as shortcut during click_element_by_index on dropdown: {type(dropdown_error).__name__}: {dropdown_error}' ) + return handle_browser_error(e) + except Exception as e: + error_msg = f'Failed to click element {params.index}: {str(e)}' return ActionResult(error=error_msg) @self.registry.action(