diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b4be461fb..1d8a1f6ac 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -172,6 +172,11 @@ jobs: const score = `${passed}/${total}`; const percentage = Math.round((passed / total) * 100); + // Fail the workflow if 0% pass rate + if (percentage === 0) { + core.setFailed(`Evaluation failed: 0% pass rate (${passed}/${total})`); + } + // Create detailed table let tableRows = ''; detailedResults.forEach(result => { diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index 9e68ff31e..e4455186a 100644 --- a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -212,10 +212,16 @@ class MessageManager: # Build the history item if model_output is None: - # Only add error history item if we have a valid step number - if step_number is not None and step_number > 0: - history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.') - self.state.agent_history_items.append(history_item) + # Add history item for initial actions (step 0) or errors (step > 0) + if step_number is not None: + if step_number == 0 and action_results: + # Step 0 with initial action results + history_item = HistoryItem(step_number=step_number, action_results=action_results) + self.state.agent_history_items.append(history_item) + elif step_number > 0: + # Error case for steps > 0 + history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.') + self.state.agent_history_items.append(history_item) else: history_item = HistoryItem( step_number=step_number, diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index a518b6fc3..98b01e8c3 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -270,8 +270,19 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Action setup self._setup_action_models() self._set_browser_use_version_and_source(source) - self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None + initial_url = None + + # only load url if no initial actions are provided + if self.directly_open_url and not self.state.follow_up_task and not initial_actions: + initial_url = self._extract_url_from_task(self.task) + if initial_url: + self.logger.info(f'๐Ÿ”— Found URL in task: {initial_url}, adding as initial action...') + initial_actions = [{'go_to_url': {'url': initial_url, 'new_tab': False}}] + + self.initial_url = initial_url + + self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None # Verify we can connect to the model self._verify_and_setup_llm() @@ -588,7 +599,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): if hasattr(self, 'cloud_sync') and self.cloud_sync and self.enable_cloud_sync: self.eventbus.on('*', self.cloud_sync.handle_event) - @observe_debug(ignore_input=True, ignore_output=True, name='_raise_if_stopped_or_paused') async def _raise_if_stopped_or_paused(self) -> None: """Utility function that raises an InterruptedError if the agent is stopped or paused.""" @@ -635,14 +645,10 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.debug(f'๐ŸŒ Step {self.state.n_steps}: Getting browser state...') # Always take screenshots for all steps - # Use caching based on directly_open_url setting - if directly_open_url is False, don't use cached state - is_first_step = self.state.n_steps in (0, 1) - use_cache = is_first_step and self.directly_open_url - self.logger.debug(f'๐Ÿ“ธ Requesting browser state with include_screenshot=True, cached={use_cache}') + self.logger.debug('๐Ÿ“ธ Requesting browser state with include_screenshot=True') browser_state_summary = await self.browser_session.get_browser_state_summary( cache_clickable_elements_hashes=True, include_screenshot=True, # always capture even if use_vision=False so that cloud sync is useful (it's fast now anyway) - cached=use_cache, include_recent_events=self.include_recent_events, ) if browser_state_summary.screenshot: @@ -1160,7 +1166,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): unique_urls = list(set(found_urls)) # If multiple URLs found, skip directly_open_urling if len(unique_urls) > 1: - self.logger.debug(f'๐Ÿ“ Multiple URLs found ({len(found_urls)}), skipping directly_open_url to avoid ambiguity') + self.logger.debug(f'Multiple URLs found ({len(found_urls)}), skipping directly_open_url to avoid ambiguity') return None # If exactly one URL found, return it @@ -1239,45 +1245,13 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.debug('๐Ÿ”ง Browser session started with watchdogs attached') - # Check if task contains a URL and add it as an initial action (only if directly_open_url is enabled) - if self.directly_open_url and not self.state.follow_up_task: - initial_url = self._extract_url_from_task(self.task) - if initial_url: - self.logger.info(f'๐Ÿ”— Found URL in task: {initial_url}, adding as initial action...') + # Ensure browser focus is properly established before executing initial actions + if self.browser_session and self.browser_session.agent_focus: + self.logger.debug(f'๐ŸŽฏ Browser focus established on target: {self.browser_session.agent_focus.target_id[-4:]}') + else: + self.logger.warning('โš ๏ธ No browser focus established, may cause navigation issues') - # Create a go_to_url action for the initial URL - go_to_url_action = { - 'go_to_url': { - 'url': initial_url, - 'new_tab': False, # Navigate in current tab - } - } - - # Add to initial_actions or create new list if none exist - if self.initial_actions: - # Convert back to dict format, prepend URL navigation, then convert back - initial_actions_dicts = [] - for action in self.initial_actions: - action_data = action.model_dump(exclude_unset=True) - initial_actions_dicts.append(action_data) - - # Prepend the go_to_url action - initial_actions_dicts = [go_to_url_action] + initial_actions_dicts - - # Convert back to ActionModel instances - self.initial_actions = self._convert_initial_actions(initial_actions_dicts) - else: - # Create new initial_actions with just the go_to_url - self.initial_actions = self._convert_initial_actions([go_to_url_action]) - - self.logger.debug(f'โœ… Added navigation to {initial_url} as initial action') - - # Execute initial actions if provided - if self.initial_actions and not self.state.follow_up_task: - self.logger.debug(f'โšก Executing {len(self.initial_actions)} initial actions...') - result = await self.multi_act(self.initial_actions, check_for_new_elements=False) - self.state.last_result = result - self.logger.debug('โœ… Initial actions completed') + await self._execute_initial_actions() self.logger.debug(f'๐Ÿ”„ Starting main execution loop with max {max_steps} steps...') for step in range(max_steps): @@ -1519,6 +1493,8 @@ class Agent(Generic[Context, AgentStructuredOutput]): new_element_hashes = {e.parent_branch_hash() for e in new_selector_map.values()} if check_for_new_elements and not new_element_hashes.issubset(cached_element_hashes): # next action requires index but there are new elements on the page + # log difference in len debug + self.logger.debug(f'New elements: {abs(len(new_element_hashes) - len(cached_element_hashes))}') remaining_actions_str = get_remaining_actions_str(actions, i) msg = f'Something new appeared after action {i} / {total_actions}: actions {remaining_actions_str} were not executed' logger.info(msg) @@ -1653,6 +1629,17 @@ class Agent(Generic[Context, AgentStructuredOutput]): return results + async def _execute_initial_actions(self) -> None: + # Execute initial actions if provided + if self.initial_actions and not self.state.follow_up_task: + self.logger.debug(f'โšก Executing {len(self.initial_actions)} initial actions...') + result = await self.multi_act(self.initial_actions, check_for_new_elements=False) + # update result 1 to mention that its was automatically loaded + if result and self.initial_url and result[0].long_term_memory: + result[0].long_term_memory = f'Found initial url and automatically loaded it. {result[0].long_term_memory}' + self.state.last_result = result + self.logger.debug('Initial actions completed') + async def _execute_history_step(self, history_item: AgentHistory, delay: float) -> list[ActionResult]: """Execute a single step from history with element validation""" assert self.browser_session is not None, 'BrowserSession is not set up' diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py index 72a53b33a..86e4eada5 100644 --- a/browser_use/browser/profile.py +++ b/browser_use/browser/profile.py @@ -583,9 +583,14 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro # --- UI/viewport/DOM --- highlight_elements: bool = Field(default=True, description='Highlight interactive elements on the page.') + filter_highlight_ids: bool = Field( + default=True, description='Only show element IDs in highlights if llm_representation is less than 10 characters.' + ) # --- Downloads --- - auto_download_pdfs: bool = Field(default=True, description='Automatically download PDFs when navigating to PDF viewer pages.') + auto_download_pdfs: bool = Field( + default=False, description='Automatically download PDFs when navigating to PDF viewer pages.' + ) profile_directory: str = 'Default' # e.g. 'Profile 1', 'Profile 2', 'Custom Profile', etc. diff --git a/browser_use/browser/python_highlights.py b/browser_use/browser/python_highlights.py new file mode 100644 index 000000000..541313e0e --- /dev/null +++ b/browser_use/browser/python_highlights.py @@ -0,0 +1,476 @@ +"""Python-based highlighting system for drawing bounding boxes on screenshots. + +This module replaces JavaScript-based highlighting with fast Python image processing +to draw bounding boxes around interactive elements directly on screenshots. +""" + +import base64 +import io +import logging + +from PIL import Image, ImageDraw, ImageFont + +from browser_use.dom.views import DOMSelectorMap +from browser_use.observability import observe_debug +from browser_use.utils import time_execution_async + +logger = logging.getLogger(__name__) + +# Color scheme for different element types +ELEMENT_COLORS = { + 'button': '#FF6B6B', # Red for buttons + 'input': '#4ECDC4', # Teal for inputs + 'select': '#45B7D1', # Blue for dropdowns + 'a': '#96CEB4', # Green for links + 'textarea': '#FF8C42', # Orange for text areas (was yellow, now more visible) + 'default': '#DDA0DD', # Light purple for other interactive elements +} + +# Element type mappings +ELEMENT_TYPE_MAP = { + 'button': 'button', + 'input': 'input', + 'select': 'select', + 'a': 'a', + 'textarea': 'textarea', +} + + +def get_element_color(tag_name: str, element_type: str | None = None) -> str: + """Get color for element based on tag name and type.""" + # Check input type first + if tag_name == 'input' and element_type: + if element_type in ['button', 'submit']: + return ELEMENT_COLORS['button'] + + # Use tag-based color + return ELEMENT_COLORS.get(tag_name.lower(), ELEMENT_COLORS['default']) + + +def should_show_index_overlay(element_index: int | None) -> bool: + """Determine if index overlay should be shown.""" + return element_index is not None + + +def draw_enhanced_bounding_box_with_text( + draw, # ImageDraw.Draw - avoiding type annotation due to PIL typing issues + bbox: tuple[int, int, int, int], + color: str, + text: str | None = None, + font: ImageFont.FreeTypeFont | None = None, + element_type: str = 'div', + image_size: tuple[int, int] = (2000, 1500), +) -> None: + """Draw an enhanced bounding box with much bigger index containers and dashed borders.""" + x1, y1, x2, y2 = bbox + + # Draw dashed bounding box with pattern: 1 line, 2 spaces, 1 line, 2 spaces... + dash_length = 4 + gap_length = 8 + line_width = 2 + + # Helper function to draw dashed line + def draw_dashed_line(start_x, start_y, end_x, end_y): + if start_x == end_x: # Vertical line + y = start_y + while y < end_y: + dash_end = min(y + dash_length, end_y) + draw.line([(start_x, y), (start_x, dash_end)], fill=color, width=line_width) + y += dash_length + gap_length + else: # Horizontal line + x = start_x + while x < end_x: + dash_end = min(x + dash_length, end_x) + draw.line([(x, start_y), (dash_end, start_y)], fill=color, width=line_width) + x += dash_length + gap_length + + # Draw dashed rectangle + draw_dashed_line(x1, y1, x2, y1) # Top + draw_dashed_line(x2, y1, x2, y2) # Right + draw_dashed_line(x2, y2, x1, y2) # Bottom + draw_dashed_line(x1, y2, x1, y1) # Left + + # Draw much bigger index overlay if we have index text + if text: + try: + # Scale font size based on image dimensions for consistent appearance across viewports + img_width, img_height = image_size + # Base font size scales with viewport width (36px for 1200px viewport) + base_font_size = max(16, min(48, int(img_width * 0.03))) # 3% of viewport width + big_font = None + try: + big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', base_font_size) + except OSError: + try: + big_font = ImageFont.truetype('arial.ttf', base_font_size) + except OSError: + # Try system fonts on different platforms + try: + big_font = ImageFont.truetype('Arial Bold.ttf', base_font_size) + except OSError: + big_font = font # Fallback to original font + + # Get text size with bigger font + if big_font: + bbox_text = draw.textbbox((0, 0), text, font=big_font) + text_width = bbox_text[2] - bbox_text[0] + text_height = bbox_text[3] - bbox_text[1] + else: + # Fallback for default font + bbox_text = draw.textbbox((0, 0), text) + text_width = bbox_text[2] - bbox_text[0] + text_height = bbox_text[3] - bbox_text[1] + + # Scale padding based on viewport size for consistent appearance + padding = max(4, int(img_width * 0.005)) # 0.5% of viewport width + element_width = x2 - x1 + element_height = y2 - y1 + + # Container dimensions + container_width = text_width + padding * 2 + container_height = text_height + padding * 2 + + # Position in top-left corner (inside if fits, outside if too small) + if element_width >= container_width and element_height >= container_height: + # Place inside top-left corner + bg_x1 = x1 + 2 # Small offset from edge + bg_y1 = y1 + 2 + else: + # Place outside top-left corner + bg_x1 = x1 + bg_y1 = max(0, y1 - container_height) + + bg_x2 = bg_x1 + container_width + bg_y2 = bg_y1 + container_height + + # Center the number within the index box with proper baseline handling + text_x = bg_x1 + (container_width - text_width) // 2 + # Add extra vertical space to prevent clipping + text_y = bg_y1 + (container_height - text_height) // 2 - bbox_text[1] # Subtract top offset + + # Ensure container stays within image bounds + img_width, img_height = image_size + if bg_x1 < 0: + offset = -bg_x1 + bg_x1 += offset + bg_x2 += offset + text_x += offset + if bg_y1 < 0: + offset = -bg_y1 + bg_y1 += offset + bg_y2 += offset + text_y += offset + if bg_x2 > img_width: + offset = bg_x2 - img_width + bg_x1 -= offset + bg_x2 -= offset + text_x -= offset + if bg_y2 > img_height: + offset = bg_y2 - img_height + bg_y1 -= offset + bg_y2 -= offset + text_y -= offset + + # Draw bigger background rectangle with thicker border + draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill=color, outline='white', width=2) + + # Draw white text centered in the index box + draw.text((text_x, text_y), text, fill='white', font=big_font or font) + + except Exception as e: + logger.debug(f'Failed to draw enhanced text overlay: {e}') + + +def draw_bounding_box_with_text( + draw, # ImageDraw.Draw - avoiding type annotation due to PIL typing issues + bbox: tuple[int, int, int, int], + color: str, + text: str | None = None, + font: ImageFont.FreeTypeFont | None = None, +) -> None: + """Draw a bounding box with optional text overlay.""" + x1, y1, x2, y2 = bbox + + # Draw dashed bounding box + dash_length = 2 + gap_length = 6 + + # Top edge + x = x1 + while x < x2: + end_x = min(x + dash_length, x2) + draw.line([(x, y1), (end_x, y1)], fill=color, width=2) + draw.line([(x, y1 + 1), (end_x, y1 + 1)], fill=color, width=2) + x += dash_length + gap_length + + # Bottom edge + x = x1 + while x < x2: + end_x = min(x + dash_length, x2) + draw.line([(x, y2), (end_x, y2)], fill=color, width=2) + draw.line([(x, y2 - 1), (end_x, y2 - 1)], fill=color, width=2) + x += dash_length + gap_length + + # Left edge + y = y1 + while y < y2: + end_y = min(y + dash_length, y2) + draw.line([(x1, y), (x1, end_y)], fill=color, width=2) + draw.line([(x1 + 1, y), (x1 + 1, end_y)], fill=color, width=2) + y += dash_length + gap_length + + # Right edge + y = y1 + while y < y2: + end_y = min(y + dash_length, y2) + draw.line([(x2, y), (x2, end_y)], fill=color, width=2) + draw.line([(x2 - 1, y), (x2 - 1, end_y)], fill=color, width=2) + y += dash_length + gap_length + + # Draw index overlay if we have index text + if text: + try: + # Get text size + if font: + bbox_text = draw.textbbox((0, 0), text, font=font) + text_width = bbox_text[2] - bbox_text[0] + text_height = bbox_text[3] - bbox_text[1] + else: + # Fallback for default font + bbox_text = draw.textbbox((0, 0), text) + text_width = bbox_text[2] - bbox_text[0] + text_height = bbox_text[3] - bbox_text[1] + + # Smart positioning based on element size + padding = 5 + element_width = x2 - x1 + element_height = y2 - y1 + element_area = element_width * element_height + index_box_area = (text_width + padding * 2) * (text_height + padding * 2) + + # Calculate size ratio to determine positioning strategy + size_ratio = element_area / max(index_box_area, 1) + + if size_ratio < 4: + # Very small elements: place outside in bottom-right corner + text_x = x2 + padding + text_y = y2 - text_height + # Ensure it doesn't go off screen + text_x = min(text_x, 1200 - text_width - padding) + text_y = max(text_y, 0) + elif size_ratio < 16: + # Medium elements: place in bottom-right corner inside + text_x = x2 - text_width - padding + text_y = y2 - text_height - padding + else: + # Large elements: place in center + text_x = x1 + (element_width - text_width) // 2 + text_y = y1 + (element_height - text_height) // 2 + + # Ensure text stays within bounds + text_x = max(0, min(text_x, 1200 - text_width)) + text_y = max(0, min(text_y, 800 - text_height)) + + # Draw background rectangle for maximum contrast + bg_x1 = text_x - padding + bg_y1 = text_y - padding + bg_x2 = text_x + text_width + padding + bg_y2 = text_y + text_height + padding + + # Use white background with thick black border for maximum visibility + draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill='white', outline='black', width=2) + + # Draw bold dark text on light background for best contrast + draw.text((text_x, text_y), text, fill='black', font=font) + + except Exception as e: + logger.debug(f'Failed to draw text overlay: {e}') + + +def process_element_highlight( + element_id: int, + element, + draw, + device_pixel_ratio: float, + font, + filter_highlight_ids: bool, + image_size: tuple[int, int], +) -> None: + """Process a single element for highlighting.""" + try: + # Use absolute_position coordinates directly + if not element.absolute_position: + return + + bounds = element.absolute_position + + # Scale coordinates from CSS pixels to device pixels for screenshot + # The screenshot is captured at device pixel resolution, but coordinates are in CSS pixels + x1 = int(bounds.x * device_pixel_ratio) + y1 = int(bounds.y * device_pixel_ratio) + x2 = int((bounds.x + bounds.width) * device_pixel_ratio) + y2 = int((bounds.y + bounds.height) * device_pixel_ratio) + + # Ensure coordinates are within image bounds + img_width, img_height = image_size + x1 = max(0, min(x1, img_width)) + y1 = max(0, min(y1, img_height)) + x2 = max(x1, min(x2, img_width)) + y2 = max(y1, min(y2, img_height)) + + # Skip if bounding box is too small or invalid + if x2 - x1 < 2 or y2 - y1 < 2: + return + + # Get element color based on type + tag_name = element.tag_name if hasattr(element, 'tag_name') else 'div' + element_type = None + if hasattr(element, 'attributes') and element.attributes: + element_type = element.attributes.get('type') + + color = get_element_color(tag_name, element_type) + + # Get element index for overlay and apply filtering + element_index = getattr(element, 'element_index', None) + index_text = None + + if element_index is not None: + if filter_highlight_ids: + # Use the meaningful text that matches what the LLM sees + meaningful_text = element.get_meaningful_text_for_llm() + # Show ID only if meaningful text is less than 5 characters + if len(meaningful_text) < 5: + index_text = str(element_index) + else: + # Always show ID when filter is disabled + index_text = str(element_index) + + # Draw enhanced bounding box with bigger index + draw_enhanced_bounding_box_with_text(draw, (x1, y1, x2, y2), color, index_text, font, tag_name, image_size) + + except Exception as e: + logger.debug(f'Failed to draw highlight for element {element_id}: {e}') + + +@observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot') +@time_execution_async('create_highlighted_screenshot') +async def create_highlighted_screenshot( + screenshot_b64: str, + selector_map: DOMSelectorMap, + device_pixel_ratio: float = 1.0, + viewport_offset_x: int = 0, + viewport_offset_y: int = 0, + filter_highlight_ids: bool = True, +) -> str: + """Create a highlighted screenshot with bounding boxes around interactive elements. + + Args: + screenshot_b64: Base64 encoded screenshot + selector_map: Map of interactive elements with their positions + device_pixel_ratio: Device pixel ratio for scaling coordinates + viewport_offset_x: X offset for viewport positioning + viewport_offset_y: Y offset for viewport positioning + + Returns: + Base64 encoded highlighted screenshot + """ + try: + # Decode screenshot + screenshot_data = base64.b64decode(screenshot_b64) + image = Image.open(io.BytesIO(screenshot_data)).convert('RGBA') + + # Create drawing context + draw = ImageDraw.Draw(image) + + # Try to load a font, fall back to default if not available + font = None + try: + font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 12) + except OSError: + try: + font = ImageFont.truetype('arial.ttf', 12) + except OSError: + font = None # Use default font + + # Process elements sequentially to avoid ImageDraw thread safety issues + # PIL ImageDraw is not thread-safe, so we process elements one by one + for element_id, element in selector_map.items(): + process_element_highlight(element_id, element, draw, device_pixel_ratio, font, filter_highlight_ids, image.size) + + # Convert back to base64 + output_buffer = io.BytesIO() + image.save(output_buffer, format='PNG') + output_buffer.seek(0) + + highlighted_b64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8') + + logger.debug(f'Successfully created highlighted screenshot with {len(selector_map)} elements') + return highlighted_b64 + + except Exception as e: + logger.error(f'Failed to create highlighted screenshot: {e}') + # Return original screenshot on error + return screenshot_b64 + + +async def get_viewport_info_from_cdp(cdp_session) -> tuple[float, int, int]: + """Get viewport information from CDP session. + + Returns: + Tuple of (device_pixel_ratio, scroll_x, scroll_y) + """ + try: + # Get layout metrics which includes viewport info and device pixel ratio + metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id) + + # Extract viewport information + visual_viewport = metrics.get('visualViewport', {}) + css_visual_viewport = metrics.get('cssVisualViewport', {}) + css_layout_viewport = metrics.get('cssLayoutViewport', {}) + + # Calculate device pixel ratio + css_width = css_visual_viewport.get('clientWidth', css_layout_viewport.get('clientWidth', 1280.0)) + device_width = visual_viewport.get('clientWidth', css_width) + device_pixel_ratio = device_width / css_width if css_width > 0 else 1.0 + + # Get scroll position in CSS pixels + scroll_x = int(css_visual_viewport.get('pageX', 0)) + scroll_y = int(css_visual_viewport.get('pageY', 0)) + + return float(device_pixel_ratio), scroll_x, scroll_y + + except Exception as e: + logger.debug(f'Failed to get viewport info from CDP: {e}') + return 1.0, 0, 0 + + +@observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot_async') +@time_execution_async('create_highlighted_screenshot_async') +async def create_highlighted_screenshot_async( + screenshot_b64: str, selector_map: DOMSelectorMap, cdp_session=None, filter_highlight_ids: bool = True +) -> str: + """Async wrapper for creating highlighted screenshots. + + Args: + screenshot_b64: Base64 encoded screenshot + selector_map: Map of interactive elements + cdp_session: CDP session for getting viewport info + + Returns: + Base64 encoded highlighted screenshot + """ + # Get viewport information if CDP session is available + device_pixel_ratio = 1.0 + viewport_offset_x = 0 + viewport_offset_y = 0 + + if cdp_session: + try: + device_pixel_ratio, viewport_offset_x, viewport_offset_y = await get_viewport_info_from_cdp(cdp_session) + except Exception as e: + logger.debug(f'Failed to get viewport info from CDP: {e}') + + # Create highlighted screenshot with async processing + return await create_highlighted_screenshot( + screenshot_b64, selector_map, device_pixel_ratio, viewport_offset_x, viewport_offset_y, filter_highlight_ids + ) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 9c0b180f7..5e6e9d578 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -39,7 +39,8 @@ from browser_use.browser.events import ( from browser_use.browser.profile import BrowserProfile, ProxySettings from browser_use.browser.views import BrowserStateSummary, TabInfo from browser_use.dom.views import EnhancedDOMTreeNode, TargetInfo -from browser_use.utils import _log_pretty_url, is_new_tab_page +from browser_use.observability import observe_debug +from browser_use.utils import _log_pretty_url, is_new_tab_page, time_execution_async DEFAULT_BROWSER_PROFILE = BrowserProfile() @@ -264,6 +265,7 @@ class BrowserSession(BaseModel): wait_for_network_idle_page_load_time: float | None = None, wait_between_actions: float | None = None, highlight_elements: bool | None = None, + filter_highlight_ids: bool | None = None, auto_download_pdfs: bool | None = None, profile_directory: str | None = None, ): @@ -536,6 +538,18 @@ class BrowserSession(BaseModel): target_id = None + # If new_tab=True but we're already in a new tab, set new_tab=False + if event.new_tab: + try: + current_url = await self.get_current_page_url() + from browser_use.utils import is_new_tab_page + + if is_new_tab_page(current_url): + self.logger.debug(f'[on_NavigateToUrlEvent] Already in new tab ({current_url}), setting new_tab=False') + event.new_tab = False + except Exception as e: + self.logger.debug(f'[on_NavigateToUrlEvent] Could not check current URL: {e}') + # check if the url is already open in a tab somewhere that we're not currently on, if so, short-circuit and just switch to it targets = await self._cdp_get_all_pages() for target in targets: @@ -584,10 +598,18 @@ class BrowserSession(BaseModel): # Use current tab target_id = target_id or self.agent_focus.target_id - # Activate target (bring to foreground) - await self.event_bus.dispatch(SwitchTabEvent(target_id=target_id)) - # which does this for us: - # self.agent_focus = await self.get_or_create_cdp_session(target_id) + # Only switch tab if we're not already on the target tab + if self.agent_focus is None or self.agent_focus.target_id != target_id: + self.logger.debug( + f'[on_NavigateToUrlEvent] Switching to target tab {target_id[-4:]} (current: {self.agent_focus.target_id[-4:] if self.agent_focus else "none"})' + ) + # Activate target (bring to foreground) + await self.event_bus.dispatch(SwitchTabEvent(target_id=target_id)) + # which does this for us: + # self.agent_focus = await self.get_or_create_cdp_session(target_id) + else: + self.logger.debug(f'[on_NavigateToUrlEvent] Already on target tab {target_id[-4:]}, skipping SwitchTabEvent') + assert self.agent_focus is not None and self.agent_focus.target_id == target_id, ( 'Agent focus not updated to new target_id after SwitchTabEvent should have switched to it' ) @@ -605,8 +627,8 @@ class BrowserSession(BaseModel): session_id=self.agent_focus.session_id, ) - # Wait a bit to ensure page starts loading - await asyncio.sleep(0.5) + # # Wait a bit to ensure page starts loading + # await asyncio.sleep(0.5) # Dispatch navigation complete self.logger.debug(f'Dispatching NavigationCompleteEvent for {event.url} (tab #{target_id[-4:]})') @@ -678,8 +700,8 @@ class BrowserSession(BaseModel): """Handle tab closure - update focus if needed.""" cdp_session = await self.get_or_create_cdp_session(target_id=None, focus=False) - await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': event.target_id}) await self.event_bus.dispatch(TabClosedEvent(target_id=event.target_id)) + await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': event.target_id}) async def on_TabClosedEvent(self, event: TabClosedEvent) -> None: """Handle tab closure - update focus if needed.""" @@ -791,6 +813,8 @@ class BrowserSession(BaseModel): assert self._cdp_client_root is not None, 'CDP client not initialized - browser may not be connected yet' return self._cdp_client_root + @time_execution_async('get_or_create_cdp_session') + @observe_debug(ignore_input=True, ignore_output=True, name='get_or_create_cdp_session') async def get_or_create_cdp_session( self, target_id: TargetID | None = None, focus: bool = True, new_socket: bool | None = None ) -> CDPSession: @@ -845,6 +869,8 @@ class BrowserSession(BaseModel): cdp_url=self.cdp_url if should_use_new_socket else None, ) self._cdp_session_pool[target_id] = session + # log length of _cdp_session_pool + self.logger.debug(f'[get_or_create_cdp_session] new _cdp_session_pool length: {len(self._cdp_session_pool)}') # Only change agent focus if requested if focus: @@ -870,7 +896,7 @@ class BrowserSession(BaseModel): return self.agent_focus.session_id if self.agent_focus else None # ========== Helper Methods ========== - + @observe_debug(ignore_input=True, ignore_output=True, name='get_browser_state_summary') async def get_browser_state_summary( self, cache_clickable_elements_hashes: bool = True, @@ -1321,6 +1347,7 @@ class BrowserSession(BaseModel): except Exception as e: self.logger.debug(f'Skipping proxy auth setup: {type(e).__name__}: {e}') + @observe_debug(ignore_input=True, ignore_output=True, name='get_tabs') async def get_tabs(self) -> list[TabInfo]: """Get information about all open tabs using CDP Target.getTargetInfo for speed.""" tabs = [] @@ -1399,6 +1426,7 @@ class BrowserSession(BaseModel): return target return None + @observe_debug(ignore_input=True, ignore_output=True, name='get_current_page_url') async def get_current_page_url(self) -> str: """Get the URL of the current page using CDP.""" target = await self.get_current_target_info() @@ -1519,6 +1547,9 @@ class BrowserSession(BaseModel): async def remove_highlights(self) -> None: """Remove highlights from the page using CDP.""" + if not self.browser_profile.highlight_elements: + return + try: # Get cached session cdp_session = await self.get_or_create_cdp_session() diff --git a/browser_use/browser/views.py b/browser_use/browser/views.py index 355c542ff..f9d5b4529 100644 --- a/browser_use/browser/views.py +++ b/browser_use/browser/views.py @@ -126,17 +126,42 @@ class BrowserStateHistory: class BrowserError(Exception): - """Base class for all browser errors""" + """Browser error with structured memory for LLM context management. + + This exception class provides separate memory contexts for browser actions: + - short_term_memory: Immediate context shown once to the LLM for the next action + - long_term_memory: Persistent error information stored across steps + """ message: str + short_term_memory: str | None = None + long_term_memory: str | None = None details: dict[str, Any] | None = None while_handling_event: BaseEvent[Any] | None = None - def __init__(self, message: str, details: dict[str, Any] | None = None, event: BaseEvent[Any] | None = None): + def __init__( + self, + message: str, + short_term_memory: str | None = None, + long_term_memory: str | None = None, + details: dict[str, Any] | None = None, + event: BaseEvent[Any] | None = None, + ): + """Initialize a BrowserError with structured memory contexts. + + Args: + message: Technical error message for logging and debugging + short_term_memory: Context shown once to LLM (e.g., available actions, options) + long_term_memory: Persistent error info stored in agent memory + details: Additional metadata for debugging + event: The browser event that triggered this error + """ self.message = message - super().__init__(message) + self.short_term_memory = short_term_memory + self.long_term_memory = long_term_memory self.details = details self.while_handling_event = event + super().__init__(message) def __str__(self) -> str: if self.details: diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py index 476b629a4..332857e1e 100644 --- a/browser_use/browser/watchdogs/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -3,7 +3,6 @@ import asyncio import json import platform -from typing import Any from browser_use.browser.events import ( ClickElementEvent, @@ -58,7 +57,8 @@ class DefaultActionWatchdog(BaseWatchdog): msg = f'Index {index_for_logging} - has an element which opens file upload dialog. To upload files please use a specific function to upload files' self.logger.info(msg) raise BrowserError( - 'Click triggered a file input element which could not be handled, use the dedicated file upload function instead' + message=msg, + long_term_memory=msg, ) # Perform the actual click using internal implementation @@ -227,16 +227,18 @@ class DefaultActionWatchdog(BaseWatchdog): element_type = element_node.attributes.get('type', '').lower() if element_node.attributes else '' if tag_name == 'select': - self.logger.warning( - f'Cannot click on elements. Use get_dropdown_options(index={element_node.element_index}) action instead.' + msg = f'Cannot click on elements.' in str(e): try: return await get_dropdown_options( params=GetDropdownOptionsAction(index=params.index), browser_session=browser_session @@ -311,6 +296,9 @@ class Tools(Generic[Context]): f'Failed to get dropdown options as shortcut during click_element_by_index on dropdown: {type(dropdown_error).__name__}: {dropdown_error}' ) + return handle_browser_error(e) + except Exception as e: + error_msg = f'Failed to click element {params.index}: {str(e)}' return ActionResult(error=error_msg) @self.registry.action( @@ -336,10 +324,11 @@ class Tools(Generic[Context]): # Include input coordinates in metadata if available return ActionResult( extracted_content=msg, - include_in_memory=True, long_term_memory=f"Input '{params.text}' into element {params.index}.", metadata=input_metadata if isinstance(input_metadata, dict) else None, ) + except BrowserError as e: + return handle_browser_error(e) except Exception as e: # Log the full error for debugging logger.error(f'Failed to dispatch TypeTextEvent: {type(e).__name__}: {e}') @@ -370,27 +359,28 @@ class Tools(Generic[Context]): if not browser_session.is_local: pass else: - raise BrowserError( - f'File path {params.path} is not available. Must be in available_file_paths, downloaded_files, or a file managed by file_system.' - ) + msg = f'File path {params.path} is not available. Upload files must be in available_file_paths, downloaded_files, or a file managed by file_system.' + logger.error(f'โŒ {msg}') + return ActionResult(error=msg) else: # If browser is remote, allow passing a remote-accessible absolute path if not browser_session.is_local: pass else: - raise BrowserError( - f'File path {params.path} is not available. Must be in available_file_paths or downloaded_files.' - ) + msg = f'File path {params.path} is not available. Upload files must be in available_file_paths, downloaded_files, or a file managed by file_system.' + raise BrowserError(message=msg, long_term_memory=msg) # For local browsers, ensure the file exists on the local filesystem if browser_session.is_local: if not os.path.exists(params.path): - raise BrowserError(f'File {params.path} does not exist') + msg = f'File {params.path} does not exist' + return ActionResult(error=msg) # Get the selector map to find the node selector_map = await browser_session.get_selector_map() if params.index not in selector_map: - raise BrowserError(f'Element with index {params.index} not found in selector map') + msg = f'Element with index {params.index} does not exist.' + return ActionResult(error=msg) node = selector_map[params.index] @@ -486,7 +476,6 @@ class Tools(Generic[Context]): logger.info(f'๐Ÿ“ {msg}') return ActionResult( extracted_content=msg, - include_in_memory=True, long_term_memory=f'Uploaded file {params.path} to element {params.index}', ) except Exception as e: @@ -499,12 +488,7 @@ class Tools(Generic[Context]): async def switch_tab(params: SwitchTabAction, browser_session: BrowserSession): # Dispatch switch tab event try: - if params.tab_id: - target_id = await browser_session.get_target_id_from_tab_id(params.tab_id) - elif params.url: - target_id = await browser_session.get_target_id_from_url(params.url) - else: - target_id = await browser_session.get_most_recently_opened_target_id() + target_id = await browser_session.get_target_id_from_tab_id(params.tab_id) event = browser_session.event_bus.dispatch(SwitchTabEvent(target_id=target_id)) await event @@ -512,11 +496,10 @@ class Tools(Generic[Context]): assert new_target_id, 'SwitchTabEvent did not return a TargetID for the new tab that was switched to' memory = f'Switched to Tab with ID {new_target_id[-4:]}' logger.info(f'๐Ÿ”„ {memory}') - return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory) + return ActionResult(extracted_content=memory, long_term_memory=memory) except Exception as e: logger.error(f'Failed to switch tab: {type(e).__name__}: {e}') - clean_msg = extract_llm_error_message(e) - return ActionResult(error=f'Failed to switch to tab {params.tab_id or params.url}: {clean_msg}') + return ActionResult(error=f'Failed to switch to tab {params.tab_id}.') @self.registry.action('Close an existing tab', param_model=CloseTabAction) async def close_tab(params: CloseTabAction, browser_session: BrowserSession): @@ -535,13 +518,11 @@ class Tools(Generic[Context]): logger.info(f'๐Ÿ—‘๏ธ {memory}') return ActionResult( extracted_content=memory, - include_in_memory=True, long_term_memory=memory, ) except Exception as e: logger.error(f'Failed to close tab: {e}') - clean_msg = extract_llm_error_message(e) - return ActionResult(error=f'Failed to close tab {params.tab_id}: {clean_msg}') + return ActionResult(error=f'Failed to close tab {params.tab_id}.') # Content Actions @@ -697,11 +678,10 @@ Provide the extracted information in a clear, structured format.""" msg = f'๐Ÿ” {long_term_memory}' logger.info(msg) - return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=long_term_memory) + return ActionResult(extracted_content=msg, long_term_memory=long_term_memory) except Exception as e: logger.error(f'Failed to dispatch ScrollEvent: {type(e).__name__}: {e}') - clean_msg = extract_llm_error_message(e) - error_msg = f'Failed to scroll: {clean_msg}' + error_msg = 'Failed to execute scroll action.' return ActionResult(error=error_msg) @self.registry.action( @@ -717,11 +697,10 @@ Provide the extracted information in a clear, structured format.""" memory = f'Sent keys: {params.keys}' msg = f'โŒจ๏ธ {memory}' logger.info(msg) - return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory) + return ActionResult(extracted_content=memory, long_term_memory=memory) except Exception as e: logger.error(f'Failed to dispatch SendKeysEvent: {type(e).__name__}: {e}') - clean_msg = extract_llm_error_message(e) - error_msg = f'Failed to send keys: {clean_msg}' + error_msg = f'Failed to send keys: {str(e)}' return ActionResult(error=error_msg) @self.registry.action( @@ -737,14 +716,13 @@ Provide the extracted information in a clear, structured format.""" memory = f'Scrolled to text: {text}' msg = f'๐Ÿ” {memory}' logger.info(msg) - return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory) + return ActionResult(extracted_content=memory, long_term_memory=memory) except Exception as e: # Text not found msg = f"Text '{text}' not found or not visible on page" logger.info(msg) return ActionResult( extracted_content=msg, - include_in_memory=True, long_term_memory=f"Tried scrolling to text '{text}' but it was not found", ) @@ -762,7 +740,6 @@ Provide the extracted information in a clear, structured format.""" raise ValueError(f'Element index {params.index} not found in DOM') # Dispatch GetDropdownOptionsEvent to the event handler - import json event = browser_session.event_bus.dispatch(GetDropdownOptionsEvent(node=node)) dropdown_data = await event.event_result(timeout=3.0, raise_if_none=True, raise_if_any=True) @@ -770,14 +747,10 @@ Provide the extracted information in a clear, structured format.""" if not dropdown_data: raise ValueError('Failed to get dropdown options - no data returned') - # Extract the message from the returned data - msg = dropdown_data.get('message', '') - options_count = len(json.loads(dropdown_data.get('options', '[]'))) # Parse the string back to list to get count - + # Use structured memory from the handler return ActionResult( - extracted_content=msg, - include_in_memory=True, - long_term_memory=f'Found {options_count} dropdown options for index {params.index}', + extracted_content=dropdown_data['short_term_memory'], + long_term_memory=dropdown_data['long_term_memory'], include_extracted_content_only_once=True, ) @@ -801,14 +774,28 @@ Provide the extracted information in a clear, structured format.""" if not selection_data: raise ValueError('Failed to select dropdown option - no data returned') - # Extract the message from the returned data - msg = selection_data.get('message', f'Selected option: {params.text}') - - return ActionResult( - extracted_content=msg, - include_in_memory=True, - long_term_memory=f"Selected dropdown option '{params.text}' at index {params.index}", - ) + # Check if the selection was successful + if selection_data.get('success') == 'true': + # Extract the message from the returned data + msg = selection_data.get('message', f'Selected option: {params.text}') + return ActionResult( + extracted_content=msg, + include_in_memory=True, + long_term_memory=f"Selected dropdown option '{params.text}' at index {params.index}", + ) + else: + # Handle structured error response + # TODO: raise BrowserError instead of returning ActionResult + if 'short_term_memory' in selection_data and 'long_term_memory' in selection_data: + return ActionResult( + extracted_content=selection_data['short_term_memory'], + long_term_memory=selection_data['long_term_memory'], + include_extracted_content_only_once=True, + ) + else: + # Fallback to regular error + error_msg = selection_data.get('error', f'Failed to select option: {params.text}') + return ActionResult(error=error_msg) # File System Actions @self.registry.action( @@ -831,7 +818,7 @@ Provide the extracted information in a clear, structured format.""" else: result = await file_system.write_file(file_name, content) logger.info(f'๐Ÿ’พ {result}') - return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result) + return ActionResult(extracted_content=result, long_term_memory=result) @self.registry.action( 'Replace old_str with new_str in file_name. old_str must exactly match the string to replace in original text. Recommended tool to mark completed items in todo.md or change specific contents in a file.' @@ -839,7 +826,7 @@ Provide the extracted information in a clear, structured format.""" async def replace_file_str(file_name: str, old_str: str, new_str: str, file_system: FileSystem): result = await file_system.replace_file_str(file_name, old_str, new_str) logger.info(f'๐Ÿ’พ {result}') - return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result) + return ActionResult(extracted_content=result, long_term_memory=result) @self.registry.action('Read file_name from file system') async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem): @@ -866,7 +853,6 @@ Provide the extracted information in a clear, structured format.""" logger.info(f'๐Ÿ’พ {memory}') return ActionResult( extracted_content=result, - include_in_memory=True, long_term_memory=memory, include_extracted_content_only_once=True, ) @@ -1001,12 +987,16 @@ Provide the extracted information in a clear, structured format.""" sensitive_data=sensitive_data, available_file_paths=available_file_paths, ) + except BrowserError as e: + logger.error(f'โŒ Action {action_name} failed with BrowserError: {str(e)}') + result = handle_browser_error(e) + except TimeoutError as e: + logger.error(f'โŒ Action {action_name} failed with TimeoutError: {str(e)}') + result = ActionResult(error=f'{action_name} was not executed due to timeout.') except Exception as e: # Log the original exception with traceback for observability - logger.error(f"Action '{action_name}' failed") - # Extract clean error message from llm_error_msg tags if present - clean_msg = extract_llm_error_message(e) - result = ActionResult(error=clean_msg) + logger.error(f"Action '{action_name}' failed with error: {str(e)}") + result = ActionResult(error=str(e)) if Laminar is not None: Laminar.set_span_output(result) diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py index a4d7a678b..c25c47c2a 100644 --- a/browser_use/tools/views.py +++ b/browser_use/tools/views.py @@ -43,15 +43,10 @@ class StructuredOutputAction(BaseModel, Generic[T]): class SwitchTabAction(BaseModel): - url: str | None = Field( - default=None, - description='URL or URL substring of the tab to switch to, if not provided, the tab_id or most recently opened tab will be used', - ) - tab_id: str | None = Field( - default=None, + tab_id: str = Field( min_length=4, max_length=4, - description='exact 4 character Tab ID to match instead of URL, prefer using this if known', + description='Last 4 chars of TargetID', ) # last 4 chars of TargetID diff --git a/docs/docs.json b/docs/docs.json index f6fb21e77..e7a146b4e 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -129,31 +129,6 @@ { "tab": "Cloud", "versions": [ - { - "version": "v2", - "groups": [ - { - "group": "Get Started", - "pages": [ - "cloud/v2/quickstart", - "cloud/v2/python-quickstart", - "cloud/v2/node-quickstart" - ] - }, - { - "group": "Platform", - "pages": [ - "cloud/v1/pricing", - "cloud/v1/n8n-browser-use-integration", - "cloud/v1/search" - ] - }, - { - "group": "REST API reference", - "openapi": "https://app.stainless.com/api/spec/documented/browser-use/openapi.documented.yml" - } - ] - }, { "version": "v1", "groups": [ @@ -180,6 +155,27 @@ "openapi": "https://api.browser-use.com/api/v1/openapi.json" } ] + }, + { + "version": "v2", + "groups": [ + { + "group": "Get Started", + "pages": [ + "cloud/v2/quickstart", + "cloud/v2/python-quickstart", + "cloud/v2/node-quickstart" + ] + }, + { + "group": "Platform", + "pages": [ + "cloud/v1/pricing", + "cloud/v1/n8n-browser-use-integration", + "cloud/v1/search" + ] + } + ] } ] } diff --git a/pyproject.toml b/pyproject.toml index 9e33f959c..22a067623 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "aiofiles>=24.1.0", "aiohttp==3.12.15", "anyio>=4.9.0", - "bubus>=1.5.4", + "bubus>=1.5.6", "google-api-core>=2.25.0", "httpx>=0.28.1", "markdownify==1.1.0", diff --git a/tests/ci/evaluate_tasks.py b/tests/ci/evaluate_tasks.py index c02e15506..435ba38dd 100644 --- a/tests/ci/evaluate_tasks.py +++ b/tests/ci/evaluate_tasks.py @@ -17,11 +17,7 @@ import aiofiles import yaml from pydantic import BaseModel -from browser_use.agent.service import Agent -from browser_use.agent.views import AgentHistoryList -from browser_use.browser.profile import BrowserProfile -from browser_use.browser.session import BrowserSession -from browser_use.llm import ChatOpenAI +from browser_use import Agent, AgentHistoryList, BrowserProfile, BrowserSession, ChatOpenAI from browser_use.llm.messages import UserMessage # --- CONFIG --- diff --git a/tests/ci/test_browser_event_ClickElementEvent.py b/tests/ci/test_browser_event_ClickElementEvent.py index 97553e580..08265ac98 100644 --- a/tests/ci/test_browser_event_ClickElementEvent.py +++ b/tests/ci/test_browser_event_ClickElementEvent.py @@ -185,11 +185,11 @@ class TestClickElementEvent: # Verify the result structure assert isinstance(result, ActionResult), 'Result should be an ActionResult instance' assert result.error is None, f'Expected no error but got: {result.error}' - + result_text = result.extracted_content or result.long_term_memory # Core logic validation: Verify click was successful - assert result.extracted_content is not None - assert f'Clicked element with index {button_index}' in result.extracted_content, ( - f'Expected click confirmation in result content, got: {result.extracted_content}' + assert result_text is not None + assert f'Clicked element with index {button_index}' in result_text, ( + f'Expected click confirmation in result content, got: {result_text}' ) # Note: The click action doesn't include button text in the result, only the index @@ -260,7 +260,11 @@ class TestClickElementEvent: # Verify the result assert isinstance(result, ActionResult) - assert result.extracted_content is not None + result_text = result.extracted_content or result.long_term_memory + assert result_text is not None + assert f'Clicked element with index {link_index}' in result_text, ( + f'Expected click confirmation in result content, got: {result_text}' + ) # Verify that a new tab was opened tabs = await browser_session.get_tabs() diff --git a/tests/ci/test_browser_event_ScrollEvent.py b/tests/ci/test_browser_event_ScrollEvent.py index b716969ca..e8767384f 100644 --- a/tests/ci/test_browser_event_ScrollEvent.py +++ b/tests/ci/test_browser_event_ScrollEvent.py @@ -104,7 +104,6 @@ class TestScrollActions: assert result.extracted_content is not None assert 'Scrolled down' in result.extracted_content assert 'the page' in result.extracted_content - assert result.include_in_memory is True # Test 2: Basic page scroll up scroll_up_action = {'scroll': ScrollAction(down=False, num_pages=0.5)} @@ -123,7 +122,7 @@ class TestScrollActions: # This should fail with error about element not found assert isinstance(result, ActionResult) assert result.error is not None, 'Expected error for invalid element index' - assert 'Element index 999 not found' in result.error or 'Failed to scroll' in result.error + assert 'Element index 999 not found' in result.error or 'Failed to execute scroll' in result.error # Test 4: Model parameter validation scroll_with_index = ScrollAction(down=True, num_pages=1.0, frame_element_index=5) diff --git a/tests/ci/test_browser_watchdog_screenshots.py b/tests/ci/test_browser_watchdog_screenshots.py index e73cdf09f..001493bf1 100644 --- a/tests/ci/test_browser_watchdog_screenshots.py +++ b/tests/ci/test_browser_watchdog_screenshots.py @@ -394,10 +394,10 @@ class TestScreenshotEventSystem: # Test the NEW event-driven path: direct event dispatching event = browser_session.event_bus.dispatch(ScreenshotEvent(full_page=False)) - screenshot_result = (await event.event_result()) or {} - assert screenshot_result.get('screenshot') - assert isinstance(screenshot_result['screenshot'], str) - assert len(base64.b64decode(screenshot_result['screenshot'])) > 5000 + screenshot_b64 = await event.event_result() + assert screenshot_b64 is not None + assert isinstance(screenshot_b64, str) + assert len(base64.b64decode(screenshot_b64)) > 5000 finally: await browser_session.kill()