mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
Merge branch 'main' into os-font-fix
This commit is contained in:
5
.github/workflows/test.yaml
vendored
5
.github/workflows/test.yaml
vendored
@@ -172,6 +172,11 @@ jobs:
|
||||
const score = `${passed}/${total}`;
|
||||
const percentage = Math.round((passed / total) * 100);
|
||||
|
||||
// Fail the workflow if 0% pass rate
|
||||
if (percentage === 0) {
|
||||
core.setFailed(`Evaluation failed: 0% pass rate (${passed}/${total})`);
|
||||
}
|
||||
|
||||
// Create detailed table
|
||||
let tableRows = '';
|
||||
detailedResults.forEach(result => {
|
||||
|
||||
@@ -212,10 +212,16 @@ class MessageManager:
|
||||
|
||||
# Build the history item
|
||||
if model_output is None:
|
||||
# Only add error history item if we have a valid step number
|
||||
if step_number is not None and step_number > 0:
|
||||
history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.')
|
||||
self.state.agent_history_items.append(history_item)
|
||||
# Add history item for initial actions (step 0) or errors (step > 0)
|
||||
if step_number is not None:
|
||||
if step_number == 0 and action_results:
|
||||
# Step 0 with initial action results
|
||||
history_item = HistoryItem(step_number=step_number, action_results=action_results)
|
||||
self.state.agent_history_items.append(history_item)
|
||||
elif step_number > 0:
|
||||
# Error case for steps > 0
|
||||
history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.')
|
||||
self.state.agent_history_items.append(history_item)
|
||||
else:
|
||||
history_item = HistoryItem(
|
||||
step_number=step_number,
|
||||
|
||||
@@ -270,8 +270,19 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
# Action setup
|
||||
self._setup_action_models()
|
||||
self._set_browser_use_version_and_source(source)
|
||||
self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None
|
||||
|
||||
initial_url = None
|
||||
|
||||
# only load url if no initial actions are provided
|
||||
if self.directly_open_url and not self.state.follow_up_task and not initial_actions:
|
||||
initial_url = self._extract_url_from_task(self.task)
|
||||
if initial_url:
|
||||
self.logger.info(f'🔗 Found URL in task: {initial_url}, adding as initial action...')
|
||||
initial_actions = [{'go_to_url': {'url': initial_url, 'new_tab': False}}]
|
||||
|
||||
self.initial_url = initial_url
|
||||
|
||||
self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None
|
||||
# Verify we can connect to the model
|
||||
self._verify_and_setup_llm()
|
||||
|
||||
@@ -588,7 +599,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
if hasattr(self, 'cloud_sync') and self.cloud_sync and self.enable_cloud_sync:
|
||||
self.eventbus.on('*', self.cloud_sync.handle_event)
|
||||
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='_raise_if_stopped_or_paused')
|
||||
async def _raise_if_stopped_or_paused(self) -> None:
|
||||
"""Utility function that raises an InterruptedError if the agent is stopped or paused."""
|
||||
|
||||
@@ -635,14 +645,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
|
||||
self.logger.debug(f'🌐 Step {self.state.n_steps}: Getting browser state...')
|
||||
# Always take screenshots for all steps
|
||||
# Use caching based on directly_open_url setting - if directly_open_url is False, don't use cached state
|
||||
is_first_step = self.state.n_steps in (0, 1)
|
||||
use_cache = is_first_step and self.directly_open_url
|
||||
self.logger.debug(f'📸 Requesting browser state with include_screenshot=True, cached={use_cache}')
|
||||
self.logger.debug('📸 Requesting browser state with include_screenshot=True')
|
||||
browser_state_summary = await self.browser_session.get_browser_state_summary(
|
||||
cache_clickable_elements_hashes=True,
|
||||
include_screenshot=True, # always capture even if use_vision=False so that cloud sync is useful (it's fast now anyway)
|
||||
cached=use_cache,
|
||||
include_recent_events=self.include_recent_events,
|
||||
)
|
||||
if browser_state_summary.screenshot:
|
||||
@@ -1160,7 +1166,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
unique_urls = list(set(found_urls))
|
||||
# If multiple URLs found, skip directly_open_urling
|
||||
if len(unique_urls) > 1:
|
||||
self.logger.debug(f'📍 Multiple URLs found ({len(found_urls)}), skipping directly_open_url to avoid ambiguity')
|
||||
self.logger.debug(f'Multiple URLs found ({len(found_urls)}), skipping directly_open_url to avoid ambiguity')
|
||||
return None
|
||||
|
||||
# If exactly one URL found, return it
|
||||
@@ -1239,45 +1245,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
|
||||
self.logger.debug('🔧 Browser session started with watchdogs attached')
|
||||
|
||||
# Check if task contains a URL and add it as an initial action (only if directly_open_url is enabled)
|
||||
if self.directly_open_url and not self.state.follow_up_task:
|
||||
initial_url = self._extract_url_from_task(self.task)
|
||||
if initial_url:
|
||||
self.logger.info(f'🔗 Found URL in task: {initial_url}, adding as initial action...')
|
||||
# Ensure browser focus is properly established before executing initial actions
|
||||
if self.browser_session and self.browser_session.agent_focus:
|
||||
self.logger.debug(f'🎯 Browser focus established on target: {self.browser_session.agent_focus.target_id[-4:]}')
|
||||
else:
|
||||
self.logger.warning('⚠️ No browser focus established, may cause navigation issues')
|
||||
|
||||
# Create a go_to_url action for the initial URL
|
||||
go_to_url_action = {
|
||||
'go_to_url': {
|
||||
'url': initial_url,
|
||||
'new_tab': False, # Navigate in current tab
|
||||
}
|
||||
}
|
||||
|
||||
# Add to initial_actions or create new list if none exist
|
||||
if self.initial_actions:
|
||||
# Convert back to dict format, prepend URL navigation, then convert back
|
||||
initial_actions_dicts = []
|
||||
for action in self.initial_actions:
|
||||
action_data = action.model_dump(exclude_unset=True)
|
||||
initial_actions_dicts.append(action_data)
|
||||
|
||||
# Prepend the go_to_url action
|
||||
initial_actions_dicts = [go_to_url_action] + initial_actions_dicts
|
||||
|
||||
# Convert back to ActionModel instances
|
||||
self.initial_actions = self._convert_initial_actions(initial_actions_dicts)
|
||||
else:
|
||||
# Create new initial_actions with just the go_to_url
|
||||
self.initial_actions = self._convert_initial_actions([go_to_url_action])
|
||||
|
||||
self.logger.debug(f'✅ Added navigation to {initial_url} as initial action')
|
||||
|
||||
# Execute initial actions if provided
|
||||
if self.initial_actions and not self.state.follow_up_task:
|
||||
self.logger.debug(f'⚡ Executing {len(self.initial_actions)} initial actions...')
|
||||
result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
|
||||
self.state.last_result = result
|
||||
self.logger.debug('✅ Initial actions completed')
|
||||
await self._execute_initial_actions()
|
||||
|
||||
self.logger.debug(f'🔄 Starting main execution loop with max {max_steps} steps...')
|
||||
for step in range(max_steps):
|
||||
@@ -1519,6 +1493,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
new_element_hashes = {e.parent_branch_hash() for e in new_selector_map.values()}
|
||||
if check_for_new_elements and not new_element_hashes.issubset(cached_element_hashes):
|
||||
# next action requires index but there are new elements on the page
|
||||
# log difference in len debug
|
||||
self.logger.debug(f'New elements: {abs(len(new_element_hashes) - len(cached_element_hashes))}')
|
||||
remaining_actions_str = get_remaining_actions_str(actions, i)
|
||||
msg = f'Something new appeared after action {i} / {total_actions}: actions {remaining_actions_str} were not executed'
|
||||
logger.info(msg)
|
||||
@@ -1653,6 +1629,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
|
||||
return results
|
||||
|
||||
async def _execute_initial_actions(self) -> None:
    """Run the configured initial actions, if any, and store their results.

    Nothing happens when ``self.initial_actions`` is empty or when
    ``self.state.follow_up_task`` is set. Otherwise the actions are passed to
    ``self.multi_act`` with ``check_for_new_elements=False`` and the returned
    list is saved on ``self.state.last_result``.
    """
    if not self.initial_actions or self.state.follow_up_task:
        return

    self.logger.debug(f'⚡ Executing {len(self.initial_actions)} initial actions...')
    outcome = await self.multi_act(self.initial_actions, check_for_new_elements=False)

    # When an initial URL is recorded, prefix the first result's memory so it
    # states that the page was loaded automatically.
    if outcome and self.initial_url:
        first = outcome[0]
        if first.long_term_memory:
            first.long_term_memory = f'Found initial url and automatically loaded it. {first.long_term_memory}'

    self.state.last_result = outcome
    self.logger.debug('Initial actions completed')
|
||||
|
||||
async def _execute_history_step(self, history_item: AgentHistory, delay: float) -> list[ActionResult]:
|
||||
"""Execute a single step from history with element validation"""
|
||||
assert self.browser_session is not None, 'BrowserSession is not set up'
|
||||
|
||||
@@ -583,9 +583,14 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
|
||||
# --- UI/viewport/DOM ---
|
||||
|
||||
highlight_elements: bool = Field(default=True, description='Highlight interactive elements on the page.')
|
||||
filter_highlight_ids: bool = Field(
|
||||
default=True, description='Only show element IDs in highlights if llm_representation is less than 10 characters.'
|
||||
)
|
||||
|
||||
# --- Downloads ---
|
||||
auto_download_pdfs: bool = Field(default=True, description='Automatically download PDFs when navigating to PDF viewer pages.')
|
||||
auto_download_pdfs: bool = Field(
|
||||
default=False, description='Automatically download PDFs when navigating to PDF viewer pages.'
|
||||
)
|
||||
|
||||
profile_directory: str = 'Default' # e.g. 'Profile 1', 'Profile 2', 'Custom Profile', etc.
|
||||
|
||||
|
||||
476
browser_use/browser/python_highlights.py
Normal file
476
browser_use/browser/python_highlights.py
Normal file
@@ -0,0 +1,476 @@
|
||||
"""Python-based highlighting system for drawing bounding boxes on screenshots.
|
||||
|
||||
This module replaces JavaScript-based highlighting with fast Python image processing
|
||||
to draw bounding boxes around interactive elements directly on screenshots.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import io
|
||||
import logging
|
||||
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
from browser_use.dom.views import DOMSelectorMap
|
||||
from browser_use.observability import observe_debug
|
||||
from browser_use.utils import time_execution_async
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Color scheme for different element types
|
||||
# Outline color (hex) per interactive element tag; 'default' covers every
# tag without a dedicated entry.
ELEMENT_COLORS = {
    'button': '#FF6B6B',  # red
    'input': '#4ECDC4',  # teal
    'select': '#45B7D1',  # blue (dropdowns)
    'a': '#96CEB4',  # green (links)
    'textarea': '#FF8C42',  # orange (was yellow, now more visible)
    'default': '#DDA0DD',  # light purple
}

# Identity mapping over the tags that have a dedicated color entry.
ELEMENT_TYPE_MAP = {tag: tag for tag in ('button', 'input', 'select', 'a', 'textarea')}
|
||||
|
||||
|
||||
def get_element_color(tag_name: str, element_type: str | None = None) -> str:
    """Return the highlight color for an element.

    Args:
        tag_name: Tag name of the element; matched case-insensitively.
        element_type: Value of the element's ``type`` attribute, if any;
            matched case-insensitively.

    Returns:
        The button color for ``input`` elements whose type is ``button`` or
        ``submit``; otherwise the color registered for the tag in
        ``ELEMENT_COLORS``, falling back to its ``'default'`` entry.
    """
    # Normalize once so the input check agrees with the lowercase lookup below.
    tag = tag_name.lower()

    # Button-like inputs are colored as buttons rather than as text inputs.
    if tag == 'input' and element_type and element_type.lower() in ('button', 'submit'):
        return ELEMENT_COLORS['button']

    return ELEMENT_COLORS.get(tag, ELEMENT_COLORS['default'])
|
||||
|
||||
|
||||
def should_show_index_overlay(element_index: int | None) -> bool:
|
||||
"""Determine if index overlay should be shown."""
|
||||
return element_index is not None
|
||||
|
||||
|
||||
def draw_enhanced_bounding_box_with_text(
    draw,  # ImageDraw.Draw - avoiding type annotation due to PIL typing issues
    bbox: tuple[int, int, int, int],
    color: str,
    text: str | None = None,
    font: ImageFont.FreeTypeFont | None = None,
    element_type: str = 'div',
    image_size: tuple[int, int] = (2000, 1500),
) -> None:
    """Draw a dashed bounding box and, optionally, a filled index label.

    Args:
        draw: Drawing context exposing ``line``, ``rectangle``, ``textbbox`` and ``text``.
        bbox: ``(x1, y1, x2, y2)`` corners of the box in image pixels.
        color: Color of the dashes and of the label background.
        text: Label to render; no label is drawn when empty or ``None``.
        font: Fallback font used when none of the bold fonts can be loaded.
        element_type: Accepted for interface compatibility; not used for drawing.
        image_size: ``(width, height)`` of the image, used to scale the label
            and keep it inside the image.

    Failures while drawing the label are logged at debug level and swallowed,
    so the dashed box is kept even if the label cannot be rendered.
    """
    x1, y1, x2, y2 = bbox

    # Dash pattern: short dash followed by a gap twice as long.
    dash_length = 4
    gap_length = 8
    line_width = 2

    def draw_dashed_line(start_x, start_y, end_x, end_y):
        """Draw one axis-aligned dashed segment; endpoint order does not matter."""
        if start_x == end_x:  # Vertical line
            # Walk from the smaller to the larger coordinate so that a segment
            # given bottom-to-top is still drawn.
            pos, stop = min(start_y, end_y), max(start_y, end_y)
            while pos < stop:
                dash_end = min(pos + dash_length, stop)
                draw.line([(start_x, pos), (start_x, dash_end)], fill=color, width=line_width)
                pos += dash_length + gap_length
        else:  # Horizontal line
            pos, stop = min(start_x, end_x), max(start_x, end_x)
            while pos < stop:
                dash_end = min(pos + dash_length, stop)
                draw.line([(pos, start_y), (dash_end, start_y)], fill=color, width=line_width)
                pos += dash_length + gap_length

    # Draw dashed rectangle
    draw_dashed_line(x1, y1, x2, y1)  # Top
    draw_dashed_line(x2, y1, x2, y2)  # Right
    draw_dashed_line(x1, y2, x2, y2)  # Bottom
    draw_dashed_line(x1, y1, x1, y2)  # Left

    if not text:
        return

    try:
        img_width, img_height = image_size

        # Font size is 3% of the image width (36px at 1200px), kept within 16..48.
        base_font_size = max(16, min(48, int(img_width * 0.03)))

        # Try the bold fonts in order; keep the caller's font if none loads.
        big_font = font
        for font_path in (
            '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
            'arial.ttf',
            'Arial Bold.ttf',
        ):
            try:
                big_font = ImageFont.truetype(font_path, base_font_size)
                break
            except OSError:
                continue

        # Measure the label; font=None makes the drawing context use its default font.
        bbox_text = draw.textbbox((0, 0), text, font=big_font)
        text_width = bbox_text[2] - bbox_text[0]
        text_height = bbox_text[3] - bbox_text[1]

        # Padding is 0.5% of the image width, at least 4px.
        padding = max(4, int(img_width * 0.005))
        element_width = x2 - x1
        element_height = y2 - y1

        container_width = text_width + padding * 2
        container_height = text_height + padding * 2

        # Top-left corner: inside the element if the label fits, above it otherwise.
        if element_width >= container_width and element_height >= container_height:
            bg_x1 = x1 + 2  # Small offset from edge
            bg_y1 = y1 + 2
        else:
            bg_x1 = x1
            bg_y1 = max(0, y1 - container_height)

        bg_x2 = bg_x1 + container_width
        bg_y2 = bg_y1 + container_height

        # Center the text in the container; subtracting the top offset of the
        # measured box compensates for the font's internal top bearing.
        text_x = bg_x1 + (container_width - text_width) // 2
        text_y = bg_y1 + (container_height - text_height) // 2 - bbox_text[1]

        # Shift the container (and its text) back inside the image bounds.
        if bg_x1 < 0:
            offset = -bg_x1
            bg_x1 += offset
            bg_x2 += offset
            text_x += offset
        if bg_y1 < 0:
            offset = -bg_y1
            bg_y1 += offset
            bg_y2 += offset
            text_y += offset
        if bg_x2 > img_width:
            offset = bg_x2 - img_width
            bg_x1 -= offset
            bg_x2 -= offset
            text_x -= offset
        if bg_y2 > img_height:
            offset = bg_y2 - img_height
            bg_y1 -= offset
            bg_y2 -= offset
            text_y -= offset

        # Filled background in the element color with a white border
        draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill=color, outline='white', width=2)

        # White label text on top of the background
        draw.text((text_x, text_y), text, fill='white', font=big_font)

    except Exception as e:
        logger.debug(f'Failed to draw enhanced text overlay: {e}')
|
||||
|
||||
|
||||
def draw_bounding_box_with_text(
    draw,  # ImageDraw.Draw - avoiding type annotation due to PIL typing issues
    bbox: tuple[int, int, int, int],
    color: str,
    text: str | None = None,
    font: ImageFont.FreeTypeFont | None = None,
    image_size: tuple[int, int] = (1200, 800),
) -> None:
    """Draw a dashed bounding box with an optional text label.

    Args:
        draw: Drawing context exposing ``line``, ``rectangle``, ``textbbox`` and ``text``.
        bbox: ``(x1, y1, x2, y2)`` corners of the box in image pixels.
        color: Color of the dashes.
        text: Label to render; no label is drawn when empty or ``None``.
        font: Font for the label; ``None`` lets the drawing context pick its default.
        image_size: ``(width, height)`` used to keep the label on screen.
            Defaults to the 1200x800 area that was previously hard-coded.

    Failures while drawing the label are logged at debug level and swallowed.
    """
    x1, y1, x2, y2 = bbox
    img_width, img_height = image_size

    dash_length = 2
    gap_length = 6

    def dashes(start, stop):
        """Yield ``(dash_start, dash_end)`` pairs covering ``start`` up to ``stop``."""
        pos = start
        while pos < stop:
            yield pos, min(pos + dash_length, stop)
            pos += dash_length + gap_length

    # Horizontal edges: each dash is drawn on two adjacent pixel rows.
    for x, end_x in dashes(x1, x2):
        draw.line([(x, y1), (end_x, y1)], fill=color, width=2)
        draw.line([(x, y1 + 1), (end_x, y1 + 1)], fill=color, width=2)
        draw.line([(x, y2), (end_x, y2)], fill=color, width=2)
        draw.line([(x, y2 - 1), (end_x, y2 - 1)], fill=color, width=2)

    # Vertical edges: each dash is drawn on two adjacent pixel columns.
    for y, end_y in dashes(y1, y2):
        draw.line([(x1, y), (x1, end_y)], fill=color, width=2)
        draw.line([(x1 + 1, y), (x1 + 1, end_y)], fill=color, width=2)
        draw.line([(x2, y), (x2, end_y)], fill=color, width=2)
        draw.line([(x2 - 1, y), (x2 - 1, end_y)], fill=color, width=2)

    if not text:
        return

    try:
        # Measure the label; font=None makes the drawing context use its default font.
        bbox_text = draw.textbbox((0, 0), text, font=font)
        text_width = bbox_text[2] - bbox_text[0]
        text_height = bbox_text[3] - bbox_text[1]

        padding = 5
        element_width = x2 - x1
        element_height = y2 - y1
        element_area = element_width * element_height
        index_box_area = (text_width + padding * 2) * (text_height + padding * 2)

        # How many label boxes would fit into the element decides the placement.
        size_ratio = element_area / max(index_box_area, 1)

        if size_ratio < 4:
            # Very small elements: outside, next to the bottom-right corner
            text_x = min(x2 + padding, img_width - text_width - padding)
            text_y = max(y2 - text_height, 0)
        elif size_ratio < 16:
            # Medium elements: inside the bottom-right corner
            text_x = x2 - text_width - padding
            text_y = y2 - text_height - padding
        else:
            # Large elements: centered
            text_x = x1 + (element_width - text_width) // 2
            text_y = y1 + (element_height - text_height) // 2

        # Keep the text inside the image
        text_x = max(0, min(text_x, img_width - text_width))
        text_y = max(0, min(text_y, img_height - text_height))

        bg_x1 = text_x - padding
        bg_y1 = text_y - padding
        bg_x2 = text_x + text_width + padding
        bg_y2 = text_y + text_height + padding

        # White background with a black border, then black text, for contrast
        draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill='white', outline='black', width=2)
        draw.text((text_x, text_y), text, fill='black', font=font)

    except Exception as e:
        logger.debug(f'Failed to draw text overlay: {e}')
|
||||
|
||||
|
||||
def process_element_highlight(
    element_id: int,
    element,
    draw,
    device_pixel_ratio: float,
    font,
    filter_highlight_ids: bool,
    image_size: tuple[int, int],
) -> None:
    """Draw the highlight for one element onto the shared drawing context.

    Args:
        element_id: Key of the element in the selector map; used only in the log message.
        element: Node providing ``absolute_position`` and, optionally,
            ``tag_name``, ``attributes`` and ``element_index``.
        draw: Drawing context handed on to the box-drawing helper.
        device_pixel_ratio: Factor applied to the element's position and size.
        font: Fallback font handed on to the box-drawing helper.
        filter_highlight_ids: When true, the index label is drawn only if the
            element's meaningful text is shorter than 5 characters.
        image_size: ``(width, height)`` of the screenshot in pixels.

    Any exception is logged at debug level and swallowed.
    """
    try:
        rect = element.absolute_position
        if not rect:
            return

        width_px, height_px = image_size

        # Scale position and size by the pixel ratio, then clamp into the image.
        left = max(0, min(int(rect.x * device_pixel_ratio), width_px))
        top = max(0, min(int(rect.y * device_pixel_ratio), height_px))
        right = max(left, min(int((rect.x + rect.width) * device_pixel_ratio), width_px))
        bottom = max(top, min(int((rect.y + rect.height) * device_pixel_ratio), height_px))

        # Boxes thinner than 2px in either direction are not drawn.
        if right - left < 2 or bottom - top < 2:
            return

        tag_name = getattr(element, 'tag_name', 'div')
        attrs = getattr(element, 'attributes', None)
        element_type = attrs.get('type') if attrs else None
        color = get_element_color(tag_name, element_type)

        # Decide whether the index label is shown.
        element_index = getattr(element, 'element_index', None)
        index_text = None
        if element_index is not None:
            show_index = True
            if filter_highlight_ids:
                # Use the meaningful text that matches what the LLM sees
                show_index = len(element.get_meaningful_text_for_llm()) < 5
            if show_index:
                index_text = str(element_index)

        draw_enhanced_bounding_box_with_text(
            draw, (left, top, right, bottom), color, index_text, font, tag_name, image_size
        )

    except Exception as e:
        logger.debug(f'Failed to draw highlight for element {element_id}: {e}')
|
||||
|
||||
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot')
@time_execution_async('create_highlighted_screenshot')
async def create_highlighted_screenshot(
    screenshot_b64: str,
    selector_map: DOMSelectorMap,
    device_pixel_ratio: float = 1.0,
    viewport_offset_x: int = 0,
    viewport_offset_y: int = 0,
    filter_highlight_ids: bool = True,
) -> str:
    """Return the screenshot with a highlight drawn for every element in the map.

    Args:
        screenshot_b64: Base64 encoded screenshot
        selector_map: Map of interactive elements with their positions
        device_pixel_ratio: Factor applied to element coordinates before drawing
        viewport_offset_x: Accepted but not used by this function
        viewport_offset_y: Accepted but not used by this function
        filter_highlight_ids: Passed through to the per-element highlighter

    Returns:
        Base64 encoded PNG of the highlighted screenshot, or the unchanged
        input string if anything fails (the failure is logged as an error).
    """
    try:
        raw_image = base64.b64decode(screenshot_b64)
        canvas = Image.open(io.BytesIO(raw_image)).convert('RGBA')
        painter = ImageDraw.Draw(canvas)

        # First loadable font wins; None makes the drawing code use the default font.
        label_font = None
        for font_path in ('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 'arial.ttf'):
            try:
                label_font = ImageFont.truetype(font_path, 12)
            except OSError:
                continue
            break

        # Elements are drawn one after another on the same drawing context.
        for element_id, element in selector_map.items():
            process_element_highlight(
                element_id, element, painter, device_pixel_ratio, label_font, filter_highlight_ids, canvas.size
            )

        # Re-encode the annotated image as base64 PNG.
        png_buffer = io.BytesIO()
        canvas.save(png_buffer, format='PNG')
        highlighted_b64 = base64.b64encode(png_buffer.getvalue()).decode('utf-8')

        logger.debug(f'Successfully created highlighted screenshot with {len(selector_map)} elements')
        return highlighted_b64

    except Exception as e:
        logger.error(f'Failed to create highlighted screenshot: {e}')
        # Return original screenshot on error
        return screenshot_b64
|
||||
|
||||
|
||||
async def get_viewport_info_from_cdp(cdp_session) -> tuple[float, int, int]:
    """Query ``Page.getLayoutMetrics`` and derive pixel ratio and scroll offset.

    The ratio is ``visualViewport.clientWidth`` divided by the CSS viewport
    width (visual viewport first, layout viewport second, 1280.0 last).
    The scroll offset is read from ``cssVisualViewport.pageX`` / ``pageY``.

    Returns:
        Tuple of (device_pixel_ratio, scroll_x, scroll_y); ``(1.0, 0, 0)`` if
        the query or the computation fails (logged at debug level).
    """
    try:
        layout = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)

        device_viewport = layout.get('visualViewport', {})
        css_viewport = layout.get('cssVisualViewport', {})
        css_layout = layout.get('cssLayoutViewport', {})

        layout_width = css_layout.get('clientWidth', 1280.0)
        css_width = css_viewport.get('clientWidth', layout_width)
        device_width = device_viewport.get('clientWidth', css_width)

        # Guard against a zero or negative CSS width.
        if css_width > 0:
            ratio = device_width / css_width
        else:
            ratio = 1.0

        scroll_x = int(css_viewport.get('pageX', 0))
        scroll_y = int(css_viewport.get('pageY', 0))
        return float(ratio), scroll_x, scroll_y

    except Exception as e:
        logger.debug(f'Failed to get viewport info from CDP: {e}')
        return 1.0, 0, 0
|
||||
|
||||
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot_async')
@time_execution_async('create_highlighted_screenshot_async')
async def create_highlighted_screenshot_async(
    screenshot_b64: str, selector_map: DOMSelectorMap, cdp_session=None, filter_highlight_ids: bool = True
) -> str:
    """Highlight a screenshot, reading viewport metrics from CDP when possible.

    Args:
        screenshot_b64: Base64 encoded screenshot
        selector_map: Map of interactive elements
        cdp_session: Optional CDP session used to look up viewport info;
            without it a pixel ratio of 1.0 and zero offsets are used
        filter_highlight_ids: Passed through to ``create_highlighted_screenshot``

    Returns:
        Base64 encoded highlighted screenshot
    """
    # Defaults used when there is no session or the lookup fails.
    ratio, offset_x, offset_y = 1.0, 0, 0

    if cdp_session:
        try:
            ratio, offset_x, offset_y = await get_viewport_info_from_cdp(cdp_session)
        except Exception as e:
            logger.debug(f'Failed to get viewport info from CDP: {e}')

    return await create_highlighted_screenshot(
        screenshot_b64, selector_map, ratio, offset_x, offset_y, filter_highlight_ids
    )
|
||||
@@ -39,7 +39,8 @@ from browser_use.browser.events import (
|
||||
from browser_use.browser.profile import BrowserProfile, ProxySettings
|
||||
from browser_use.browser.views import BrowserStateSummary, TabInfo
|
||||
from browser_use.dom.views import EnhancedDOMTreeNode, TargetInfo
|
||||
from browser_use.utils import _log_pretty_url, is_new_tab_page
|
||||
from browser_use.observability import observe_debug
|
||||
from browser_use.utils import _log_pretty_url, is_new_tab_page, time_execution_async
|
||||
|
||||
DEFAULT_BROWSER_PROFILE = BrowserProfile()
|
||||
|
||||
@@ -264,6 +265,7 @@ class BrowserSession(BaseModel):
|
||||
wait_for_network_idle_page_load_time: float | None = None,
|
||||
wait_between_actions: float | None = None,
|
||||
highlight_elements: bool | None = None,
|
||||
filter_highlight_ids: bool | None = None,
|
||||
auto_download_pdfs: bool | None = None,
|
||||
profile_directory: str | None = None,
|
||||
):
|
||||
@@ -536,6 +538,18 @@ class BrowserSession(BaseModel):
|
||||
|
||||
target_id = None
|
||||
|
||||
# If new_tab=True but we're already in a new tab, set new_tab=False
|
||||
if event.new_tab:
|
||||
try:
|
||||
current_url = await self.get_current_page_url()
|
||||
from browser_use.utils import is_new_tab_page
|
||||
|
||||
if is_new_tab_page(current_url):
|
||||
self.logger.debug(f'[on_NavigateToUrlEvent] Already in new tab ({current_url}), setting new_tab=False')
|
||||
event.new_tab = False
|
||||
except Exception as e:
|
||||
self.logger.debug(f'[on_NavigateToUrlEvent] Could not check current URL: {e}')
|
||||
|
||||
# check if the url is already open in a tab somewhere that we're not currently on, if so, short-circuit and just switch to it
|
||||
targets = await self._cdp_get_all_pages()
|
||||
for target in targets:
|
||||
@@ -584,10 +598,18 @@ class BrowserSession(BaseModel):
|
||||
# Use current tab
|
||||
target_id = target_id or self.agent_focus.target_id
|
||||
|
||||
# Activate target (bring to foreground)
|
||||
await self.event_bus.dispatch(SwitchTabEvent(target_id=target_id))
|
||||
# which does this for us:
|
||||
# self.agent_focus = await self.get_or_create_cdp_session(target_id)
|
||||
# Only switch tab if we're not already on the target tab
|
||||
if self.agent_focus is None or self.agent_focus.target_id != target_id:
|
||||
self.logger.debug(
|
||||
f'[on_NavigateToUrlEvent] Switching to target tab {target_id[-4:]} (current: {self.agent_focus.target_id[-4:] if self.agent_focus else "none"})'
|
||||
)
|
||||
# Activate target (bring to foreground)
|
||||
await self.event_bus.dispatch(SwitchTabEvent(target_id=target_id))
|
||||
# which does this for us:
|
||||
# self.agent_focus = await self.get_or_create_cdp_session(target_id)
|
||||
else:
|
||||
self.logger.debug(f'[on_NavigateToUrlEvent] Already on target tab {target_id[-4:]}, skipping SwitchTabEvent')
|
||||
|
||||
assert self.agent_focus is not None and self.agent_focus.target_id == target_id, (
|
||||
'Agent focus not updated to new target_id after SwitchTabEvent should have switched to it'
|
||||
)
|
||||
@@ -605,8 +627,8 @@ class BrowserSession(BaseModel):
|
||||
session_id=self.agent_focus.session_id,
|
||||
)
|
||||
|
||||
# Wait a bit to ensure page starts loading
|
||||
await asyncio.sleep(0.5)
|
||||
# # Wait a bit to ensure page starts loading
|
||||
# await asyncio.sleep(0.5)
|
||||
|
||||
# Dispatch navigation complete
|
||||
self.logger.debug(f'Dispatching NavigationCompleteEvent for {event.url} (tab #{target_id[-4:]})')
|
||||
@@ -678,8 +700,8 @@ class BrowserSession(BaseModel):
|
||||
"""Handle tab closure - update focus if needed."""
|
||||
|
||||
cdp_session = await self.get_or_create_cdp_session(target_id=None, focus=False)
|
||||
await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': event.target_id})
|
||||
await self.event_bus.dispatch(TabClosedEvent(target_id=event.target_id))
|
||||
await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': event.target_id})
|
||||
|
||||
async def on_TabClosedEvent(self, event: TabClosedEvent) -> None:
|
||||
"""Handle tab closure - update focus if needed."""
|
||||
@@ -791,6 +813,8 @@ class BrowserSession(BaseModel):
|
||||
assert self._cdp_client_root is not None, 'CDP client not initialized - browser may not be connected yet'
|
||||
return self._cdp_client_root
|
||||
|
||||
@time_execution_async('get_or_create_cdp_session')
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='get_or_create_cdp_session')
|
||||
async def get_or_create_cdp_session(
|
||||
self, target_id: TargetID | None = None, focus: bool = True, new_socket: bool | None = None
|
||||
) -> CDPSession:
|
||||
@@ -845,6 +869,8 @@ class BrowserSession(BaseModel):
|
||||
cdp_url=self.cdp_url if should_use_new_socket else None,
|
||||
)
|
||||
self._cdp_session_pool[target_id] = session
|
||||
# log length of _cdp_session_pool
|
||||
self.logger.debug(f'[get_or_create_cdp_session] new _cdp_session_pool length: {len(self._cdp_session_pool)}')
|
||||
|
||||
# Only change agent focus if requested
|
||||
if focus:
|
||||
@@ -870,7 +896,7 @@ class BrowserSession(BaseModel):
|
||||
return self.agent_focus.session_id if self.agent_focus else None
|
||||
|
||||
# ========== Helper Methods ==========
|
||||
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='get_browser_state_summary')
|
||||
async def get_browser_state_summary(
|
||||
self,
|
||||
cache_clickable_elements_hashes: bool = True,
|
||||
@@ -1321,6 +1347,7 @@ class BrowserSession(BaseModel):
|
||||
except Exception as e:
|
||||
self.logger.debug(f'Skipping proxy auth setup: {type(e).__name__}: {e}')
|
||||
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='get_tabs')
|
||||
async def get_tabs(self) -> list[TabInfo]:
|
||||
"""Get information about all open tabs using CDP Target.getTargetInfo for speed."""
|
||||
tabs = []
|
||||
@@ -1399,6 +1426,7 @@ class BrowserSession(BaseModel):
|
||||
return target
|
||||
return None
|
||||
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='get_current_page_url')
|
||||
async def get_current_page_url(self) -> str:
|
||||
"""Get the URL of the current page using CDP."""
|
||||
target = await self.get_current_target_info()
|
||||
@@ -1519,6 +1547,9 @@ class BrowserSession(BaseModel):
|
||||
|
||||
async def remove_highlights(self) -> None:
|
||||
"""Remove highlights from the page using CDP."""
|
||||
if not self.browser_profile.highlight_elements:
|
||||
return
|
||||
|
||||
try:
|
||||
# Get cached session
|
||||
cdp_session = await self.get_or_create_cdp_session()
|
||||
|
||||
@@ -126,17 +126,42 @@ class BrowserStateHistory:
|
||||
|
||||
|
||||
class BrowserError(Exception):
|
||||
"""Base class for all browser errors"""
|
||||
"""Browser error with structured memory for LLM context management.
|
||||
|
||||
This exception class provides separate memory contexts for browser actions:
|
||||
- short_term_memory: Immediate context shown once to the LLM for the next action
|
||||
- long_term_memory: Persistent error information stored across steps
|
||||
"""
|
||||
|
||||
message: str
|
||||
short_term_memory: str | None = None
|
||||
long_term_memory: str | None = None
|
||||
details: dict[str, Any] | None = None
|
||||
while_handling_event: BaseEvent[Any] | None = None
|
||||
|
||||
def __init__(self, message: str, details: dict[str, Any] | None = None, event: BaseEvent[Any] | None = None):
|
||||
def __init__(
|
||||
self,
|
||||
message: str,
|
||||
short_term_memory: str | None = None,
|
||||
long_term_memory: str | None = None,
|
||||
details: dict[str, Any] | None = None,
|
||||
event: BaseEvent[Any] | None = None,
|
||||
):
|
||||
"""Initialize a BrowserError with structured memory contexts.
|
||||
|
||||
Args:
|
||||
message: Technical error message for logging and debugging
|
||||
short_term_memory: Context shown once to LLM (e.g., available actions, options)
|
||||
long_term_memory: Persistent error info stored in agent memory
|
||||
details: Additional metadata for debugging
|
||||
event: The browser event that triggered this error
|
||||
"""
|
||||
self.message = message
|
||||
super().__init__(message)
|
||||
self.short_term_memory = short_term_memory
|
||||
self.long_term_memory = long_term_memory
|
||||
self.details = details
|
||||
self.while_handling_event = event
|
||||
super().__init__(message)
|
||||
|
||||
def __str__(self) -> str:
|
||||
if self.details:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -16,6 +16,8 @@ from browser_use.dom.views import (
|
||||
EnhancedDOMTreeNode,
|
||||
SerializedDOMState,
|
||||
)
|
||||
from browser_use.observability import observe_debug
|
||||
from browser_use.utils import time_execution_async
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from browser_use.browser.views import BrowserStateSummary, PageInfo
|
||||
@@ -42,70 +44,7 @@ class DOMWatchdog(BaseWatchdog):
|
||||
|
||||
async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:
|
||||
# self.logger.debug('Setting up init scripts in browser')
|
||||
|
||||
self.logger.debug('💉 Injecting DOM Service init script to track event listeners added to DOM elements by JS...')
|
||||
|
||||
init_script = """
|
||||
// check to make sure we're not inside the PDF viewer
|
||||
window.isPdfViewer = !!document?.body?.querySelector('body > embed[type="application/pdf"][width="100%"]')
|
||||
if (!window.isPdfViewer) {
|
||||
|
||||
// Permissions
|
||||
const originalQuery = window.navigator.permissions.query;
|
||||
window.navigator.permissions.query = (parameters) => (
|
||||
parameters.name === 'notifications' ?
|
||||
Promise.resolve({ state: Notification.permission }) :
|
||||
originalQuery(parameters)
|
||||
);
|
||||
(() => {
|
||||
if (window._eventListenerTrackerInitialized) return;
|
||||
window._eventListenerTrackerInitialized = true;
|
||||
|
||||
const originalAddEventListener = EventTarget.prototype.addEventListener;
|
||||
const eventListenersMap = new WeakMap();
|
||||
|
||||
EventTarget.prototype.addEventListener = function(type, listener, options) {
|
||||
if (typeof listener === "function") {
|
||||
let listeners = eventListenersMap.get(this);
|
||||
if (!listeners) {
|
||||
listeners = [];
|
||||
eventListenersMap.set(this, listeners);
|
||||
}
|
||||
|
||||
listeners.push({
|
||||
type,
|
||||
listener,
|
||||
listenerPreview: listener.toString().slice(0, 100),
|
||||
options
|
||||
});
|
||||
}
|
||||
|
||||
return originalAddEventListener.call(this, type, listener, options);
|
||||
};
|
||||
|
||||
window.getEventListenersForNode = (node) => {
|
||||
const listeners = eventListenersMap.get(node) || [];
|
||||
return listeners.map(({ type, listenerPreview, options }) => ({
|
||||
type,
|
||||
listenerPreview,
|
||||
options
|
||||
}));
|
||||
};
|
||||
})();
|
||||
}
|
||||
"""
|
||||
|
||||
# Try to inject the script, but don't fail if the Page domain isn't ready yet
|
||||
# This can happen when a new tab is created and the CDP session isn't fully attached
|
||||
try:
|
||||
await self.browser_session._cdp_add_init_script(init_script)
|
||||
except Exception as e:
|
||||
if "'Page.addScriptToEvaluateOnNewDocument' wasn't found" in str(e):
|
||||
self.logger.debug(f'Page domain not ready for new tab, skipping init script injection: {e}')
|
||||
# The script will be injected when the page actually navigates
|
||||
else:
|
||||
# Re-raise other errors
|
||||
raise
|
||||
return None
|
||||
|
||||
def _get_recent_events_str(self, limit: int = 10) -> str | None:
|
||||
"""Get the most recent events from the event bus as JSON.
|
||||
@@ -164,10 +103,10 @@ class DOMWatchdog(BaseWatchdog):
|
||||
self.logger.debug(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Got page URL: {page_url}')
|
||||
if self.browser_session.agent_focus:
|
||||
self.logger.debug(
|
||||
f'📍 Current page URL: {page_url}, target_id: {self.browser_session.agent_focus.target_id}, session_id: {self.browser_session.agent_focus.session_id}'
|
||||
f'Current page URL: {page_url}, target_id: {self.browser_session.agent_focus.target_id}, session_id: {self.browser_session.agent_focus.session_id}'
|
||||
)
|
||||
else:
|
||||
self.logger.debug(f'📍 Current page URL: {page_url}, no cdp_session attached')
|
||||
self.logger.debug(f'Current page URL: {page_url}, no cdp_session attached')
|
||||
|
||||
# check if we should skip DOM tree build for pointless pages
|
||||
not_a_meaningful_website = page_url.lower().split(':', 1)[0] not in ('http', 'https')
|
||||
@@ -243,65 +182,73 @@ class DOMWatchdog(BaseWatchdog):
|
||||
recent_events=self._get_recent_events_str() if event.include_recent_events else None,
|
||||
)
|
||||
|
||||
# Normal path: Build DOM tree if requested
|
||||
if event.include_dom:
|
||||
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🌳 Building DOM tree...')
|
||||
# Execute DOM building and screenshot capture in parallel
|
||||
dom_task = None
|
||||
screenshot_task = None
|
||||
|
||||
# Start DOM building task if requested
|
||||
if event.include_dom:
|
||||
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🌳 Starting DOM tree build task...')
|
||||
|
||||
# Build the DOM directly using the internal method
|
||||
previous_state = (
|
||||
self.browser_session._cached_browser_state_summary.dom_state
|
||||
if self.browser_session._cached_browser_state_summary
|
||||
else None
|
||||
)
|
||||
|
||||
dom_task = asyncio.create_task(self._build_dom_tree_without_highlights(previous_state))
|
||||
|
||||
# Start clean screenshot task if requested (without JS highlights)
|
||||
if event.include_screenshot:
|
||||
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 📸 Starting clean screenshot task...')
|
||||
screenshot_task = asyncio.create_task(self._capture_clean_screenshot())
|
||||
|
||||
# Wait for both tasks to complete
|
||||
content = None
|
||||
screenshot_b64 = None
|
||||
|
||||
if dom_task:
|
||||
try:
|
||||
# Call the DOM building method directly
|
||||
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: Starting _build_dom_tree...')
|
||||
content = await self._build_dom_tree(previous_state)
|
||||
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ _build_dom_tree completed')
|
||||
content = await dom_task
|
||||
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ DOM tree build completed')
|
||||
except Exception as e:
|
||||
self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: DOM build failed: {e}, using minimal state')
|
||||
content = SerializedDOMState(_root=None, selector_map={})
|
||||
|
||||
if not content:
|
||||
# Fallback to minimal DOM state
|
||||
self.logger.warning('DOM build returned no content, using minimal state')
|
||||
content = SerializedDOMState(_root=None, selector_map={})
|
||||
else:
|
||||
# Skip DOM building if not requested
|
||||
content = SerializedDOMState(_root=None, selector_map={})
|
||||
|
||||
# re-focus top-level page session context
|
||||
assert self.browser_session.agent_focus is not None, 'No current target ID'
|
||||
await self.browser_session.get_or_create_cdp_session(target_id=self.browser_session.agent_focus.target_id, focus=True)
|
||||
|
||||
# Get screenshot if requested
|
||||
screenshot_b64 = None
|
||||
if event.include_screenshot:
|
||||
self.logger.debug(
|
||||
f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: 📸 DOM watchdog requesting screenshot, include_screenshot={event.include_screenshot}'
|
||||
)
|
||||
if screenshot_task:
|
||||
try:
|
||||
# Check if handler is registered
|
||||
handlers = self.event_bus.handlers.get('ScreenshotEvent', [])
|
||||
handler_names = [getattr(h, '__name__', str(h)) for h in handlers]
|
||||
self.logger.debug(f'📸 ScreenshotEvent handlers registered: {len(handlers)} - {handler_names}')
|
||||
|
||||
screenshot_event = self.event_bus.dispatch(ScreenshotEvent(full_page=False))
|
||||
self.logger.debug('📸 Dispatched ScreenshotEvent, waiting for event to complete...')
|
||||
|
||||
# Wait for the event itself to complete (this waits for all handlers)
|
||||
await screenshot_event
|
||||
|
||||
# Get the single handler result
|
||||
screenshot_b64 = await screenshot_event.event_result(raise_if_any=True, raise_if_none=True)
|
||||
except TimeoutError:
|
||||
self.logger.warning('📸 Screenshot timed out after 6 seconds - no handler registered or slow page?')
|
||||
|
||||
screenshot_b64 = await screenshot_task
|
||||
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Clean screenshot captured')
|
||||
except Exception as e:
|
||||
self.logger.warning(f'📸 Screenshot failed: {type(e).__name__}: {e}')
|
||||
else:
|
||||
self.logger.debug(f'📸 Skipping screenshot, include_screenshot={event.include_screenshot}')
|
||||
self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Clean screenshot failed: {e}')
|
||||
screenshot_b64 = None
|
||||
|
||||
# Apply Python-based highlighting if both DOM and screenshot are available
|
||||
if screenshot_b64 and content and content.selector_map and self.browser_session.browser_profile.highlight_elements:
|
||||
try:
|
||||
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🎨 Applying Python-based highlighting...')
|
||||
from browser_use.browser.python_highlights import create_highlighted_screenshot_async
|
||||
|
||||
# Get CDP session for viewport info
|
||||
cdp_session = await self.browser_session.get_or_create_cdp_session()
|
||||
start = time.time()
|
||||
screenshot_b64 = await create_highlighted_screenshot_async(
|
||||
screenshot_b64,
|
||||
content.selector_map,
|
||||
cdp_session,
|
||||
self.browser_session.browser_profile.filter_highlight_ids,
|
||||
)
|
||||
self.logger.debug(
|
||||
f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Applied highlights to {len(content.selector_map)} elements in {time.time() - start:.2f}s'
|
||||
)
|
||||
except Exception as e:
|
||||
self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Python highlighting failed: {e}')
|
||||
|
||||
# Ensure we have valid content
|
||||
if not content:
|
||||
content = SerializedDOMState(_root=None, selector_map={})
|
||||
|
||||
# Tabs info already fetched at the beginning
|
||||
|
||||
@@ -452,7 +399,7 @@ class DOMWatchdog(BaseWatchdog):
|
||||
self.logger.debug(f'🔍 DOMWatchdog._build_dom_tree: ✅ Selector maps updated, {len(self.selector_map)} elements')
|
||||
|
||||
# Inject highlighting for visual feedback if we have elements
|
||||
if self.selector_map and self._dom_service:
|
||||
if self.selector_map and self._dom_service and self.browser_session.browser_profile.highlight_elements:
|
||||
try:
|
||||
self.logger.debug('🔍 DOMWatchdog._build_dom_tree: Injecting highlighting script...')
|
||||
from browser_use.dom.debug.highlights import inject_highlighting_script
|
||||
@@ -463,6 +410,8 @@ class DOMWatchdog(BaseWatchdog):
|
||||
)
|
||||
except Exception as e:
|
||||
self.logger.debug(f'🔍 DOMWatchdog._build_dom_tree: Failed to inject highlighting: {e}')
|
||||
elif self.selector_map and self._dom_service and not self.browser_session.browser_profile.highlight_elements:
|
||||
self.logger.debug('🔍 DOMWatchdog._build_dom_tree: Skipping highlighting injection - highlight_elements=False')
|
||||
|
||||
self.logger.debug('🔍 DOMWatchdog._build_dom_tree: ✅ COMPLETED DOM tree build')
|
||||
return self.current_dom_state
|
||||
@@ -477,6 +426,95 @@ class DOMWatchdog(BaseWatchdog):
|
||||
)
|
||||
raise
|
||||
|
||||
@time_execution_async('build_dom_tree_without_highlights')
@observe_debug(ignore_input=True, ignore_output=True, name='build_dom_tree_without_highlights')
async def _build_dom_tree_without_highlights(self, previous_state: SerializedDOMState | None = None) -> SerializedDOMState:
    """Serialize the DOM without injecting any JavaScript highlighting.

    Args:
        previous_state: Forwarded to ``DomService.get_serialized_dom_tree``
            as ``previous_cached_state``.

    Returns:
        The freshly built state, which is also stored on
        ``self.current_dom_state``.

    Raises:
        Exception: Any failure is logged, reported through a
            ``BrowserErrorEvent`` with ``error_type='DOMBuildFailed'`` and
            then re-raised unchanged.
    """
    try:
        self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: STARTING DOM tree build')

        # Lazily create the DOM service on first use; later calls reuse it.
        dom_service = self._dom_service
        if dom_service is None:
            dom_service = DomService(
                browser_session=self.browser_session,
                logger=self.logger,
                cross_origin_iframes=self.browser_session.browser_profile.cross_origin_iframes,
            )
            self._dom_service = dom_service

        self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: Calling DomService.get_serialized_dom_tree...')
        start = time.time()
        (
            self.current_dom_state,
            self.enhanced_dom_tree,
            timing_info,
        ) = await dom_service.get_serialized_dom_tree(previous_cached_state=previous_state)
        end = time.time()
        self.logger.debug(
            '🔍 DOMWatchdog._build_dom_tree_without_highlights: ✅ DomService.get_serialized_dom_tree completed'
        )

        self.logger.debug(f'Time taken to get DOM tree: {end - start} seconds')
        self.logger.debug(f'Timing breakdown: {timing_info}')

        # Publish the new selector map on this watchdog and on the session cache.
        self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: Updating selector maps...')
        self.selector_map = self.current_dom_state.selector_map
        if self.browser_session:
            self.browser_session.update_cached_selector_map(self.selector_map)
        self.logger.debug(
            f'🔍 DOMWatchdog._build_dom_tree_without_highlights: ✅ Selector maps updated, {len(self.selector_map)} elements'
        )

        # No highlighting script is injected here, unlike _build_dom_tree.
        self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: ✅ COMPLETED DOM tree build (no JS highlights)')
        return self.current_dom_state

    except Exception as e:
        self.logger.error(f'Failed to build DOM tree without highlights: {e}')
        failure_event = BrowserErrorEvent(
            error_type='DOMBuildFailed',
            message=str(e),
        )
        self.event_bus.dispatch(failure_event)
        raise
|
||||
|
||||
@time_execution_async('capture_clean_screenshot')
@observe_debug(ignore_input=True, ignore_output=True, name='capture_clean_screenshot')
async def _capture_clean_screenshot(self) -> str:
    """Dispatch a ``ScreenshotEvent`` and return its result as a string.

    The agent's current target is re-focused first, then a non-full-page
    ``ScreenshotEvent`` is dispatched and awaited.

    Returns:
        ``str()`` of the handler result.

    Raises:
        AssertionError: If the browser session has no ``agent_focus``.
        RuntimeError: If the handler result is ``None``.
        TimeoutError: Logged with a dedicated warning, then re-raised.
        Exception: Any other failure is logged as a warning and re-raised.
    """
    try:
        self.logger.debug('🔍 DOMWatchdog._capture_clean_screenshot: Capturing clean screenshot...')

        # Re-focus the CDP session on the agent's current target.
        assert self.browser_session.agent_focus is not None, 'No current target ID'
        await self.browser_session.get_or_create_cdp_session(
            target_id=self.browser_session.agent_focus.target_id,
            focus=True,
        )

        # Purely diagnostic: log which handlers are registered for the event.
        handlers = self.event_bus.handlers.get('ScreenshotEvent', [])
        handler_names = [getattr(h, '__name__', str(h)) for h in handlers]
        self.logger.debug(f'📸 ScreenshotEvent handlers registered: {len(handlers)} - {handler_names}')

        pending = self.event_bus.dispatch(ScreenshotEvent(full_page=False))
        self.logger.debug('📸 Dispatched ScreenshotEvent, waiting for event to complete...')

        # Await the event itself first, then pull the handler result out of it.
        await pending
        screenshot_b64 = await pending.event_result(raise_if_any=True, raise_if_none=True)
        if screenshot_b64 is None:
            raise RuntimeError('Screenshot handler returned None')

        self.logger.debug('🔍 DOMWatchdog._capture_clean_screenshot: ✅ Clean screenshot captured successfully')
        return str(screenshot_b64)

    except TimeoutError:
        self.logger.warning('📸 Clean screenshot timed out after 6 seconds - no handler registered or slow page?')
        raise
    except Exception as e:
        self.logger.warning(f'📸 Clean screenshot failed: {type(e).__name__}: {e}')
        raise
|
||||
|
||||
async def _wait_for_stable_network(self):
|
||||
"""Wait for page stability - simplified for CDP-only branch."""
|
||||
start_time = time.time()
|
||||
@@ -496,6 +534,7 @@ class DOMWatchdog(BaseWatchdog):
|
||||
elapsed = time.time() - start_time
|
||||
self.logger.debug(f'✅ Page stability wait completed in {elapsed:.2f}s')
|
||||
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='get_page_info')
|
||||
async def _get_page_info(self) -> 'PageInfo':
|
||||
"""Get comprehensive page information using a single CDP call.
|
||||
|
||||
|
||||
@@ -111,9 +111,10 @@ class DownloadsWatchdog(BaseWatchdog):
|
||||
# Check if auto-download is enabled
|
||||
auto_download_enabled = self._is_auto_download_enabled()
|
||||
if not auto_download_enabled:
|
||||
self.logger.debug('[DownloadsWatchdog] Skipping PDF check - auto-download disabled')
|
||||
return
|
||||
|
||||
# Note: Using network-based PDF detection that doesn't require JavaScript
|
||||
|
||||
target_id = event.target_id
|
||||
self.logger.debug(f'[DownloadsWatchdog] Got target_id={target_id} for tab #{event.target_id[-4:]}')
|
||||
|
||||
@@ -552,8 +553,9 @@ class DownloadsWatchdog(BaseWatchdog):
|
||||
del self._active_downloads[download_id]
|
||||
|
||||
async def check_for_pdf_viewer(self, target_id: TargetID) -> bool:
|
||||
"""Check if the current target is Chrome's built-in PDF viewer.
|
||||
"""Check if the current target is a PDF using network-based detection.
|
||||
|
||||
This method avoids JavaScript execution that can crash WebSocket connections.
|
||||
Returns True if a PDF is detected and should be downloaded.
|
||||
"""
|
||||
self.logger.debug(f'[DownloadsWatchdog] Checking if target {target_id} is PDF viewer...')
|
||||
@@ -575,98 +577,115 @@ class DownloadsWatchdog(BaseWatchdog):
|
||||
return cached_result
|
||||
|
||||
try:
|
||||
# Create a temporary CDP session for this target without switching focus
|
||||
import asyncio
|
||||
|
||||
temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
|
||||
|
||||
result = await asyncio.wait_for(
|
||||
temp_session.cdp_client.send.Runtime.evaluate(
|
||||
params={
|
||||
'expression': """
|
||||
(() => {
|
||||
// Check for Chrome's built-in PDF viewer (both old and new selectors)
|
||||
const pdfEmbed = document.querySelector('embed[type="application/x-google-chrome-pdf"]') ||
|
||||
document.querySelector('embed[type="application/pdf"]');
|
||||
if (pdfEmbed) {
|
||||
// For Chrome PDF viewer, use window.location.href not embed.src (which is often about:blank)
|
||||
return {
|
||||
isPdf: true,
|
||||
url: window.location.href,
|
||||
isChromePdfViewer: true
|
||||
};
|
||||
}
|
||||
|
||||
// Check for direct PDF navigation
|
||||
if (document.contentType === 'application/pdf') {
|
||||
return {
|
||||
isPdf: true,
|
||||
url: window.location.href,
|
||||
isDirectPdf: true
|
||||
};
|
||||
}
|
||||
|
||||
// Also check if the URL ends with .pdf or has PDF in it
|
||||
const url = window.location.href;
|
||||
const isPdfUrl = url.toLowerCase().includes('.pdf');
|
||||
if (isPdfUrl) {
|
||||
return {
|
||||
isPdf: true,
|
||||
url: url,
|
||||
isPdfUrl: true
|
||||
};
|
||||
}
|
||||
|
||||
// Check for PDF in iframe
|
||||
const iframes = document.querySelectorAll('iframe');
|
||||
for (const iframe of iframes) {
|
||||
try {
|
||||
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
|
||||
if (iframeDoc.contentType === 'application/pdf') {
|
||||
return {
|
||||
isPdf: true,
|
||||
url: iframe.src,
|
||||
isIframePdf: true
|
||||
};
|
||||
}
|
||||
} catch (e) {
|
||||
// Cross-origin iframe, skip
|
||||
}
|
||||
}
|
||||
|
||||
return { isPdf: false };
|
||||
})()
|
||||
""",
|
||||
'returnByValue': True,
|
||||
},
|
||||
session_id=temp_session.session_id,
|
||||
),
|
||||
timeout=5.0, # 5 second timeout to prevent hanging
|
||||
)
|
||||
|
||||
# No need to detach - session is cached
|
||||
is_pdf_viewer = result.get('result', {}).get('value', {})
|
||||
|
||||
if is_pdf_viewer.get('isPdf', False):
|
||||
self.logger.debug(
|
||||
f'[DownloadsWatchdog] PDF detected: {is_pdf_viewer.get("url", "unknown")} '
|
||||
f'(type: {"Chrome viewer" if is_pdf_viewer.get("isChromePdfViewer") else "direct PDF" if is_pdf_viewer.get("isDirectPdf") else "PDF URL" if is_pdf_viewer.get("isPdfUrl") else "iframe PDF"})'
|
||||
)
|
||||
# Method 1: Check URL patterns (fastest, most reliable)
|
||||
url_is_pdf = self._check_url_for_pdf(page_url)
|
||||
if url_is_pdf:
|
||||
self.logger.debug(f'[DownloadsWatchdog] PDF detected via URL pattern: {page_url}')
|
||||
self._pdf_viewer_cache[page_url] = True
|
||||
return True
|
||||
|
||||
# Method 2: Check network response headers via CDP (safer than JavaScript)
|
||||
header_is_pdf = await self._check_network_headers_for_pdf(target_id)
|
||||
if header_is_pdf:
|
||||
self.logger.debug(f'[DownloadsWatchdog] PDF detected via network headers: {page_url}')
|
||||
self._pdf_viewer_cache[page_url] = True
|
||||
return True
|
||||
|
||||
# Method 3: Check Chrome's PDF viewer specific URLs
|
||||
chrome_pdf_viewer = self._is_chrome_pdf_viewer_url(page_url)
|
||||
if chrome_pdf_viewer:
|
||||
self.logger.debug(f'[DownloadsWatchdog] Chrome PDF viewer detected: {page_url}')
|
||||
self._pdf_viewer_cache[page_url] = True
|
||||
return True
|
||||
|
||||
# Not a PDF
|
||||
self._pdf_viewer_cache[page_url] = False
|
||||
return False
|
||||
|
||||
except TimeoutError:
|
||||
self.logger.warning(f'[DownloadsWatchdog] ❌ PDF check timed out for target: {page_url}')
|
||||
self._pdf_viewer_cache[page_url] = False
|
||||
return False
|
||||
except Exception as e:
|
||||
self.logger.warning(f'[DownloadsWatchdog] ❌ Error checking for PDF viewer: {e}')
|
||||
self._pdf_viewer_cache[page_url] = False
|
||||
return False
|
||||
|
||||
def _check_url_for_pdf(self, url: str) -> bool:
|
||||
"""Check if URL indicates a PDF file."""
|
||||
if not url:
|
||||
return False
|
||||
|
||||
url_lower = url.lower()
|
||||
|
||||
# Direct PDF file extensions
|
||||
if url_lower.endswith('.pdf'):
|
||||
return True
|
||||
|
||||
# PDF in path
|
||||
if '.pdf' in url_lower:
|
||||
return True
|
||||
|
||||
# PDF MIME type in URL parameters
|
||||
if any(
|
||||
param in url_lower
|
||||
for param in [
|
||||
'content-type=application/pdf',
|
||||
'content-type=application%2fpdf',
|
||||
'mimetype=application/pdf',
|
||||
'type=application/pdf',
|
||||
]
|
||||
):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _is_chrome_pdf_viewer_url(self, url: str) -> bool:
|
||||
"""Check if this is Chrome's internal PDF viewer URL."""
|
||||
if not url:
|
||||
return False
|
||||
|
||||
url_lower = url.lower()
|
||||
|
||||
# Chrome PDF viewer uses chrome-extension:// URLs
|
||||
if 'chrome-extension://' in url_lower and 'pdf' in url_lower:
|
||||
return True
|
||||
|
||||
# Chrome PDF viewer internal URLs
|
||||
if url_lower.startswith('chrome://') and 'pdf' in url_lower:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
async def _check_network_headers_for_pdf(self, target_id: TargetID) -> bool:
|
||||
"""Infer PDF via navigation history/URL; headers are not available post-navigation in this context."""
|
||||
try:
|
||||
import asyncio
|
||||
|
||||
# Get CDP session
|
||||
temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
|
||||
|
||||
# Get navigation history to find the main resource
|
||||
history = await asyncio.wait_for(
|
||||
temp_session.cdp_client.send.Page.getNavigationHistory(session_id=temp_session.session_id), timeout=3.0
|
||||
)
|
||||
|
||||
current_entry = history.get('entries', [])
|
||||
if current_entry:
|
||||
current_index = history.get('currentIndex', 0)
|
||||
if 0 <= current_index < len(current_entry):
|
||||
current_url = current_entry[current_index].get('url', '')
|
||||
|
||||
# Check if the URL itself suggests PDF
|
||||
if self._check_url_for_pdf(current_url):
|
||||
return True
|
||||
|
||||
# Note: CDP doesn't easily expose response headers for completed navigations
|
||||
# For more complex cases, we'd need to set up Network.responseReceived listeners
|
||||
# before navigation, but that's overkill for most PDF detection cases
|
||||
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
self.logger.debug(f'[DownloadsWatchdog] Network headers check failed (non-critical): {e}')
|
||||
return False
|
||||
|
||||
async def trigger_pdf_download(self, target_id: TargetID) -> str | None:
|
||||
"""Trigger download of a PDF from Chrome's PDF viewer.
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ async def inject_highlighting_script(dom_service: DomService, interactive_elemen
|
||||
# Convert DOMSelectorMap to the format expected by the JavaScript
|
||||
converted_elements = convert_dom_selector_map_to_highlight_format(interactive_elements)
|
||||
|
||||
logger.debug(f'📍 Creating CSP-safe highlighting for {len(converted_elements)} elements')
|
||||
logger.debug(f'Creating CSP-safe highlighting for {len(converted_elements)} elements')
|
||||
|
||||
# ALWAYS remove any existing highlights first to prevent double-highlighting
|
||||
await remove_highlighting_script(dom_service)
|
||||
|
||||
@@ -20,7 +20,7 @@ class ClickableElementDetector:
|
||||
|
||||
# IFRAME elements should be interactive if they're large enough to potentially need scrolling
|
||||
# Small iframes (< 100px width or height) are unlikely to have scrollable content
|
||||
if node.tag_name and node.tag_name.upper() == 'IFRAME':
|
||||
if node.tag_name and node.tag_name.upper() == 'IFRAME' or node.tag_name.upper() == 'FRAME':
|
||||
if node.snapshot_node and node.snapshot_node.bounds:
|
||||
width = node.snapshot_node.bounds.width
|
||||
height = node.snapshot_node.bounds.height
|
||||
@@ -94,14 +94,14 @@ class ClickableElementDetector:
|
||||
# Skip properties we can't process
|
||||
continue
|
||||
|
||||
# ENHANCED TAG CHECK: Include truly interactive elements
|
||||
# ENHANCED TAG CHECK: Include truly interactive elements
|
||||
# Note: 'label' removed - labels are handled by other attribute checks below; otherwise labels with a "for" attribute can destroy the real clickable element on apartments.com
|
||||
interactive_tags = {
|
||||
'button',
|
||||
'input',
|
||||
'select',
|
||||
'textarea',
|
||||
'a',
|
||||
'label',
|
||||
'details',
|
||||
'summary',
|
||||
'option',
|
||||
|
||||
@@ -143,10 +143,10 @@ class DOMTreeSerializer:
|
||||
if node.node_name.lower() in DISABLED_ELEMENTS:
|
||||
return None
|
||||
|
||||
if node.node_name == 'IFRAME':
|
||||
if node.node_name == 'IFRAME' or node.node_name == 'FRAME':
|
||||
if node.content_document:
|
||||
simplified = SimplifiedNode(original_node=node, children=[])
|
||||
for child in node.content_document.children:
|
||||
for child in node.content_document.children_nodes or []:
|
||||
simplified_child = self._create_simplified_tree(child)
|
||||
if simplified_child:
|
||||
simplified.children.append(simplified_child)
|
||||
@@ -159,7 +159,7 @@ class DOMTreeSerializer:
|
||||
is_scrollable = node.is_actually_scrollable
|
||||
|
||||
# Include if interactive (regardless of visibility), or scrollable, or has children to process
|
||||
should_include = (is_interactive and is_visible) or is_scrollable or node.children_and_shadow_roots
|
||||
should_include = (is_interactive and is_visible) or is_scrollable or bool(node.children_and_shadow_roots)
|
||||
|
||||
if should_include:
|
||||
simplified = SimplifiedNode(original_node=node, children=[])
|
||||
@@ -435,7 +435,12 @@ class DOMTreeSerializer:
|
||||
# Add element with interactive_index if clickable, scrollable, or iframe/frame
|
||||
is_any_scrollable = node.original_node.is_actually_scrollable or node.original_node.is_scrollable
|
||||
should_show_scroll = node.original_node.should_show_scroll_info
|
||||
if node.interactive_index is not None or is_any_scrollable or node.original_node.tag_name.upper() == 'IFRAME':
|
||||
if (
|
||||
node.interactive_index is not None
|
||||
or is_any_scrollable
|
||||
or node.original_node.tag_name.upper() == 'IFRAME'
|
||||
or node.original_node.tag_name.upper() == 'FRAME'
|
||||
):
|
||||
next_depth += 1
|
||||
|
||||
# Build attributes string
|
||||
@@ -453,6 +458,9 @@ class DOMTreeSerializer:
|
||||
elif node.original_node.tag_name.upper() == 'IFRAME':
|
||||
# Iframe element (not interactive)
|
||||
line = f'{depth_str}|IFRAME|<{node.original_node.tag_name}'
|
||||
elif node.original_node.tag_name.upper() == 'FRAME':
|
||||
# Frame element (not interactive)
|
||||
line = f'{depth_str}|FRAME|<{node.original_node.tag_name}'
|
||||
else:
|
||||
line = f'{depth_str}<{node.original_node.tag_name}'
|
||||
|
||||
|
||||
@@ -187,7 +187,7 @@ class DomService:
|
||||
for frame in reversed(html_frames):
|
||||
if (
|
||||
frame.node_type == NodeType.ELEMENT_NODE
|
||||
and frame.node_name.upper() == 'IFRAME'
|
||||
and (frame.node_name.upper() == 'IFRAME' or frame.node_name.upper() == 'FRAME')
|
||||
and frame.snapshot_node
|
||||
and frame.snapshot_node.bounds
|
||||
):
|
||||
@@ -561,7 +561,11 @@ class DomService:
|
||||
)
|
||||
|
||||
# Calculate new iframe offset for content documents, accounting for iframe scroll
|
||||
if node['nodeName'].upper() == 'IFRAME' and snapshot_data and snapshot_data.bounds:
|
||||
if (
|
||||
(node['nodeName'].upper() == 'IFRAME' or node['nodeName'].upper() == 'FRAME')
|
||||
and snapshot_data
|
||||
and snapshot_data.bounds
|
||||
):
|
||||
if snapshot_data.bounds:
|
||||
updated_html_frames.append(dom_tree_node)
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ from cdp_use.cdp.target.types import SessionID, TargetID, TargetInfo
|
||||
from uuid_extensions import uuid7str
|
||||
|
||||
from browser_use.dom.utils import cap_text_length
|
||||
from browser_use.observability import observe_debug
|
||||
|
||||
# Serializer types
|
||||
DEFAULT_INCLUDE_ATTRIBUTES = [
|
||||
@@ -91,14 +92,28 @@ class SimplifiedNode:
|
||||
is_new: bool = False
|
||||
excluded_by_parent: bool = False # New field for bbox filtering
|
||||
|
||||
def _clean_original_node_json(self, node_json: dict) -> dict:
|
||||
"""Recursively remove children_nodes and shadow_roots from original_node JSON."""
|
||||
# Remove the fields we don't want in SimplifiedNode serialization
|
||||
if 'children_nodes' in node_json:
|
||||
del node_json['children_nodes']
|
||||
if 'shadow_roots' in node_json:
|
||||
del node_json['shadow_roots']
|
||||
|
||||
# Clean nested content_document if it exists
|
||||
if node_json.get('content_document'):
|
||||
node_json['content_document'] = self._clean_original_node_json(node_json['content_document'])
|
||||
|
||||
return node_json
|
||||
|
||||
def __json__(self) -> dict:
|
||||
original_node_json = self.original_node.__json__()
|
||||
del original_node_json['children_nodes']
|
||||
del original_node_json['shadow_roots']
|
||||
# Remove children_nodes and shadow_roots to avoid duplication with SimplifiedNode.children
|
||||
cleaned_original_node_json = self._clean_original_node_json(original_node_json)
|
||||
return {
|
||||
'should_display': self.should_display,
|
||||
'interactive_index': self.interactive_index,
|
||||
'original_node': original_node_json,
|
||||
'original_node': cleaned_original_node_json,
|
||||
'children': [c.__json__() for c in self.children],
|
||||
}
|
||||
|
||||
@@ -412,6 +427,25 @@ class EnhancedDOMTreeNode:
|
||||
|
||||
return f'<{self.tag_name}>{cap_text_length(self.get_all_children_text(), max_text_length) or ""}'
|
||||
|
||||
def get_meaningful_text_for_llm(self) -> str:
    """
    Get the meaningful text content that the LLM actually sees for this element.

    The first non-empty attribute among value, aria-label, title, placeholder
    and alt (in that priority order) is used. If none of them is set, the text
    of all children is used instead. The chosen text is returned stripped of
    leading and trailing whitespace.

    NOTE(review): the original docstring states this matches exactly what goes
    into the DOMTreeSerializer output - that cannot be confirmed from this
    method alone.
    """
    # A missing or empty attributes mapping simply skips the attribute lookup.
    attributes = getattr(self, 'attributes', None) or {}

    # Priority order: value, aria-label, title, placeholder, alt, text content
    for attr in ('value', 'aria-label', 'title', 'placeholder', 'alt'):
        candidate = attributes.get(attr)
        if candidate:
            return candidate.strip()

    # Fallback to text content if no meaningful attributes
    return self.get_all_children_text().strip()
|
||||
|
||||
@property
|
||||
def is_actually_scrollable(self) -> bool:
|
||||
"""
|
||||
@@ -677,6 +711,7 @@ class SerializedDOMState:
|
||||
|
||||
selector_map: DOMSelectorMap
|
||||
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='llm_representation')
|
||||
def llm_representation(
|
||||
self,
|
||||
include_attributes: list[str] | None = None,
|
||||
|
||||
@@ -379,6 +379,8 @@ class Registry(Generic[Context]):
|
||||
raise RuntimeError(str(e)) from e
|
||||
else:
|
||||
raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e
|
||||
except TimeoutError as e:
|
||||
raise RuntimeError(f'Error executing action {action_name} due to timeout.') from e
|
||||
except Exception as e:
|
||||
raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e
|
||||
|
||||
|
||||
@@ -65,26 +65,19 @@ Context = TypeVar('Context')
|
||||
T = TypeVar('T', bound=BaseModel)
|
||||
|
||||
|
||||
def extract_llm_error_message(error: Exception) -> str:
    """
    Extract the clean error message from an exception that may contain <llm_error_msg> tags.

    If the tags are found, returns the content between the first pair,
    stripped of surrounding whitespace. Otherwise, returns the original
    error string unchanged.

    Args:
        error: The exception whose string form is searched.

    Returns:
        The tagged message if present, else ``str(error)``.
    """
    import re

    error_str = str(error)

    # Non-greedy match so only the first tagged section is captured;
    # DOTALL lets the message span multiple lines.
    match = re.search(r'<llm_error_msg>(.*?)</llm_error_msg>', error_str, re.DOTALL)

    # Fallback: return the original error string
    return match.group(1).strip() if match else error_str
|
||||
def handle_browser_error(e: BrowserError) -> ActionResult:
    """Convert a BrowserError into an ActionResult, or re-raise it.

    Args:
        e: The BrowserError to convert. Its ``long_term_memory`` and
            ``short_term_memory`` attributes decide the outcome.

    Returns:
        An ActionResult whose ``error`` is ``e.long_term_memory``. When
        ``e.short_term_memory`` is also set, it is passed as
        ``extracted_content`` with ``include_extracted_content_only_once=True``.

    Raises:
        BrowserError: The same ``e``, after logging a warning, when
            ``e.long_term_memory`` is None.
    """
    # Fallback to original error handling if long_term_memory is None
    if e.long_term_memory is None:
        logger.warning(
            '⚠️ A BrowserError was raised without long_term_memory - always set long_term_memory when raising BrowserError to propagate right messages to LLM.'
        )
        raise e

    if e.short_term_memory is None:
        return ActionResult(error=e.long_term_memory)

    return ActionResult(
        extracted_content=e.short_term_memory, error=e.long_term_memory, include_extracted_content_only_once=True
    )
|
||||
|
||||
|
||||
class Tools(Generic[Context]):
|
||||
@@ -177,11 +170,10 @@ class Tools(Generic[Context]):
|
||||
memory = f"Searched Google for '{params.query}'"
|
||||
msg = f'🔍 {memory}'
|
||||
logger.info(msg)
|
||||
return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory)
|
||||
return ActionResult(extracted_content=memory, long_term_memory=memory)
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to search Google: {e}')
|
||||
clean_msg = extract_llm_error_message(e)
|
||||
return ActionResult(error=f'Failed to search Google for "{params.query}": {clean_msg}')
|
||||
return ActionResult(error=f'Failed to search Google for "{params.query}": {str(e)}')
|
||||
|
||||
@self.registry.action(
|
||||
'Navigate to URL, set new_tab=True to open in new tab, False to navigate in current tab', param_model=GoToUrlAction
|
||||
@@ -201,12 +193,11 @@ class Tools(Generic[Context]):
|
||||
msg = f'🔗 {memory}'
|
||||
|
||||
logger.info(msg)
|
||||
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=memory)
|
||||
return ActionResult(extracted_content=msg, long_term_memory=memory)
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
# Always log the actual error first for debugging
|
||||
browser_session.logger.error(f'❌ Navigation failed: {error_msg}')
|
||||
clean_msg = extract_llm_error_message(e)
|
||||
|
||||
# Check if it's specifically a RuntimeError about CDP client
|
||||
if isinstance(e, RuntimeError) and 'CDP client not initialized' in error_msg:
|
||||
@@ -223,12 +214,12 @@ class Tools(Generic[Context]):
|
||||
'net::',
|
||||
]
|
||||
):
|
||||
site_unavailable_msg = f'Site unavailable: {params.url} - {error_msg}'
|
||||
browser_session.logger.warning(f'⚠️ {site_unavailable_msg}')
|
||||
site_unavailable_msg = f'Navigation failed - site unavailable: {params.url}'
|
||||
browser_session.logger.warning(f'⚠️ {site_unavailable_msg} - {error_msg}')
|
||||
return ActionResult(error=site_unavailable_msg)
|
||||
else:
|
||||
# Return error in ActionResult instead of re-raising
|
||||
return ActionResult(error=f'Navigation failed: {clean_msg}')
|
||||
return ActionResult(error=f'Navigation failed: {str(e)}')
|
||||
|
||||
@self.registry.action('Go back', param_model=NoParamsAction)
|
||||
async def go_back(_: NoParamsAction, browser_session: BrowserSession):
|
||||
@@ -241,8 +232,7 @@ class Tools(Generic[Context]):
|
||||
return ActionResult(extracted_content=memory)
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to dispatch GoBackEvent: {type(e).__name__}: {e}')
|
||||
clean_msg = extract_llm_error_message(e)
|
||||
error_msg = f'Failed to go back: {clean_msg}'
|
||||
error_msg = f'Failed to go back: {str(e)}'
|
||||
return ActionResult(error=error_msg)
|
||||
|
||||
@self.registry.action(
|
||||
@@ -285,23 +275,18 @@ class Tools(Generic[Context]):
|
||||
# Wait for handler to complete and get any exception or metadata
|
||||
click_metadata = await event.event_result(raise_if_any=True, raise_if_none=False)
|
||||
memory = f'Clicked element with index {params.index}'
|
||||
if params.while_holding_ctrl:
|
||||
memory += ' and opened in new tab'
|
||||
msg = f'🖱️ {memory}'
|
||||
logger.info(msg)
|
||||
|
||||
# Include click coordinates in metadata if available
|
||||
return ActionResult(
|
||||
extracted_content=memory,
|
||||
include_in_memory=True,
|
||||
long_term_memory=memory,
|
||||
metadata=click_metadata if isinstance(click_metadata, dict) else None,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to execute ClickElementEvent: {type(e).__name__}: {e}')
|
||||
clean_msg = extract_llm_error_message(e)
|
||||
error_msg = f'Failed to click element {params.index}: {clean_msg}'
|
||||
|
||||
# If it's a select dropdown error, automatically get the dropdown options
|
||||
if 'dropdown' in str(e) and node:
|
||||
except BrowserError as e:
|
||||
if 'Cannot click on <select> elements.' in str(e):
|
||||
try:
|
||||
return await get_dropdown_options(
|
||||
params=GetDropdownOptionsAction(index=params.index), browser_session=browser_session
|
||||
@@ -311,6 +296,9 @@ class Tools(Generic[Context]):
|
||||
f'Failed to get dropdown options as shortcut during click_element_by_index on dropdown: {type(dropdown_error).__name__}: {dropdown_error}'
|
||||
)
|
||||
|
||||
return handle_browser_error(e)
|
||||
except Exception as e:
|
||||
error_msg = f'Failed to click element {params.index}: {str(e)}'
|
||||
return ActionResult(error=error_msg)
|
||||
|
||||
@self.registry.action(
|
||||
@@ -336,10 +324,11 @@ class Tools(Generic[Context]):
|
||||
# Include input coordinates in metadata if available
|
||||
return ActionResult(
|
||||
extracted_content=msg,
|
||||
include_in_memory=True,
|
||||
long_term_memory=f"Input '{params.text}' into element {params.index}.",
|
||||
metadata=input_metadata if isinstance(input_metadata, dict) else None,
|
||||
)
|
||||
except BrowserError as e:
|
||||
return handle_browser_error(e)
|
||||
except Exception as e:
|
||||
# Log the full error for debugging
|
||||
logger.error(f'Failed to dispatch TypeTextEvent: {type(e).__name__}: {e}')
|
||||
@@ -370,27 +359,28 @@ class Tools(Generic[Context]):
|
||||
if not browser_session.is_local:
|
||||
pass
|
||||
else:
|
||||
raise BrowserError(
|
||||
f'File path {params.path} is not available. Must be in available_file_paths, downloaded_files, or a file managed by file_system.'
|
||||
)
|
||||
msg = f'File path {params.path} is not available. Upload files must be in available_file_paths, downloaded_files, or a file managed by file_system.'
|
||||
logger.error(f'❌ {msg}')
|
||||
return ActionResult(error=msg)
|
||||
else:
|
||||
# If browser is remote, allow passing a remote-accessible absolute path
|
||||
if not browser_session.is_local:
|
||||
pass
|
||||
else:
|
||||
raise BrowserError(
|
||||
f'File path {params.path} is not available. Must be in available_file_paths or downloaded_files.'
|
||||
)
|
||||
msg = f'File path {params.path} is not available. Upload files must be in available_file_paths, downloaded_files, or a file managed by file_system.'
|
||||
raise BrowserError(message=msg, long_term_memory=msg)
|
||||
|
||||
# For local browsers, ensure the file exists on the local filesystem
|
||||
if browser_session.is_local:
|
||||
if not os.path.exists(params.path):
|
||||
raise BrowserError(f'File {params.path} does not exist')
|
||||
msg = f'File {params.path} does not exist'
|
||||
return ActionResult(error=msg)
|
||||
|
||||
# Get the selector map to find the node
|
||||
selector_map = await browser_session.get_selector_map()
|
||||
if params.index not in selector_map:
|
||||
raise BrowserError(f'Element with index {params.index} not found in selector map')
|
||||
msg = f'Element with index {params.index} does not exist.'
|
||||
return ActionResult(error=msg)
|
||||
|
||||
node = selector_map[params.index]
|
||||
|
||||
@@ -486,7 +476,6 @@ class Tools(Generic[Context]):
|
||||
logger.info(f'📁 {msg}')
|
||||
return ActionResult(
|
||||
extracted_content=msg,
|
||||
include_in_memory=True,
|
||||
long_term_memory=f'Uploaded file {params.path} to element {params.index}',
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -499,12 +488,7 @@ class Tools(Generic[Context]):
|
||||
async def switch_tab(params: SwitchTabAction, browser_session: BrowserSession):
|
||||
# Dispatch switch tab event
|
||||
try:
|
||||
if params.tab_id:
|
||||
target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
|
||||
elif params.url:
|
||||
target_id = await browser_session.get_target_id_from_url(params.url)
|
||||
else:
|
||||
target_id = await browser_session.get_most_recently_opened_target_id()
|
||||
target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
|
||||
|
||||
event = browser_session.event_bus.dispatch(SwitchTabEvent(target_id=target_id))
|
||||
await event
|
||||
@@ -512,11 +496,10 @@ class Tools(Generic[Context]):
|
||||
assert new_target_id, 'SwitchTabEvent did not return a TargetID for the new tab that was switched to'
|
||||
memory = f'Switched to Tab with ID {new_target_id[-4:]}'
|
||||
logger.info(f'🔄 {memory}')
|
||||
return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory)
|
||||
return ActionResult(extracted_content=memory, long_term_memory=memory)
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to switch tab: {type(e).__name__}: {e}')
|
||||
clean_msg = extract_llm_error_message(e)
|
||||
return ActionResult(error=f'Failed to switch to tab {params.tab_id or params.url}: {clean_msg}')
|
||||
return ActionResult(error=f'Failed to switch to tab {params.tab_id}.')
|
||||
|
||||
@self.registry.action('Close an existing tab', param_model=CloseTabAction)
|
||||
async def close_tab(params: CloseTabAction, browser_session: BrowserSession):
|
||||
@@ -535,13 +518,11 @@ class Tools(Generic[Context]):
|
||||
logger.info(f'🗑️ {memory}')
|
||||
return ActionResult(
|
||||
extracted_content=memory,
|
||||
include_in_memory=True,
|
||||
long_term_memory=memory,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to close tab: {e}')
|
||||
clean_msg = extract_llm_error_message(e)
|
||||
return ActionResult(error=f'Failed to close tab {params.tab_id}: {clean_msg}')
|
||||
return ActionResult(error=f'Failed to close tab {params.tab_id}.')
|
||||
|
||||
# Content Actions
|
||||
|
||||
@@ -697,11 +678,10 @@ Provide the extracted information in a clear, structured format."""
|
||||
|
||||
msg = f'🔍 {long_term_memory}'
|
||||
logger.info(msg)
|
||||
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=long_term_memory)
|
||||
return ActionResult(extracted_content=msg, long_term_memory=long_term_memory)
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to dispatch ScrollEvent: {type(e).__name__}: {e}')
|
||||
clean_msg = extract_llm_error_message(e)
|
||||
error_msg = f'Failed to scroll: {clean_msg}'
|
||||
error_msg = 'Failed to execute scroll action.'
|
||||
return ActionResult(error=error_msg)
|
||||
|
||||
@self.registry.action(
|
||||
@@ -717,11 +697,10 @@ Provide the extracted information in a clear, structured format."""
|
||||
memory = f'Sent keys: {params.keys}'
|
||||
msg = f'⌨️ {memory}'
|
||||
logger.info(msg)
|
||||
return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory)
|
||||
return ActionResult(extracted_content=memory, long_term_memory=memory)
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to dispatch SendKeysEvent: {type(e).__name__}: {e}')
|
||||
clean_msg = extract_llm_error_message(e)
|
||||
error_msg = f'Failed to send keys: {clean_msg}'
|
||||
error_msg = f'Failed to send keys: {str(e)}'
|
||||
return ActionResult(error=error_msg)
|
||||
|
||||
@self.registry.action(
|
||||
@@ -737,14 +716,13 @@ Provide the extracted information in a clear, structured format."""
|
||||
memory = f'Scrolled to text: {text}'
|
||||
msg = f'🔍 {memory}'
|
||||
logger.info(msg)
|
||||
return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory)
|
||||
return ActionResult(extracted_content=memory, long_term_memory=memory)
|
||||
except Exception as e:
|
||||
# Text not found
|
||||
msg = f"Text '{text}' not found or not visible on page"
|
||||
logger.info(msg)
|
||||
return ActionResult(
|
||||
extracted_content=msg,
|
||||
include_in_memory=True,
|
||||
long_term_memory=f"Tried scrolling to text '{text}' but it was not found",
|
||||
)
|
||||
|
||||
@@ -762,7 +740,6 @@ Provide the extracted information in a clear, structured format."""
|
||||
raise ValueError(f'Element index {params.index} not found in DOM')
|
||||
|
||||
# Dispatch GetDropdownOptionsEvent to the event handler
|
||||
import json
|
||||
|
||||
event = browser_session.event_bus.dispatch(GetDropdownOptionsEvent(node=node))
|
||||
dropdown_data = await event.event_result(timeout=3.0, raise_if_none=True, raise_if_any=True)
|
||||
@@ -770,14 +747,10 @@ Provide the extracted information in a clear, structured format."""
|
||||
if not dropdown_data:
|
||||
raise ValueError('Failed to get dropdown options - no data returned')
|
||||
|
||||
# Extract the message from the returned data
|
||||
msg = dropdown_data.get('message', '')
|
||||
options_count = len(json.loads(dropdown_data.get('options', '[]'))) # Parse the string back to list to get count
|
||||
|
||||
# Use structured memory from the handler
|
||||
return ActionResult(
|
||||
extracted_content=msg,
|
||||
include_in_memory=True,
|
||||
long_term_memory=f'Found {options_count} dropdown options for index {params.index}',
|
||||
extracted_content=dropdown_data['short_term_memory'],
|
||||
long_term_memory=dropdown_data['long_term_memory'],
|
||||
include_extracted_content_only_once=True,
|
||||
)
|
||||
|
||||
@@ -801,14 +774,28 @@ Provide the extracted information in a clear, structured format."""
|
||||
if not selection_data:
|
||||
raise ValueError('Failed to select dropdown option - no data returned')
|
||||
|
||||
# Extract the message from the returned data
|
||||
msg = selection_data.get('message', f'Selected option: {params.text}')
|
||||
|
||||
return ActionResult(
|
||||
extracted_content=msg,
|
||||
include_in_memory=True,
|
||||
long_term_memory=f"Selected dropdown option '{params.text}' at index {params.index}",
|
||||
)
|
||||
# Check if the selection was successful
|
||||
if selection_data.get('success') == 'true':
|
||||
# Extract the message from the returned data
|
||||
msg = selection_data.get('message', f'Selected option: {params.text}')
|
||||
return ActionResult(
|
||||
extracted_content=msg,
|
||||
include_in_memory=True,
|
||||
long_term_memory=f"Selected dropdown option '{params.text}' at index {params.index}",
|
||||
)
|
||||
else:
|
||||
# Handle structured error response
|
||||
# TODO: raise BrowserError instead of returning ActionResult
|
||||
if 'short_term_memory' in selection_data and 'long_term_memory' in selection_data:
|
||||
return ActionResult(
|
||||
extracted_content=selection_data['short_term_memory'],
|
||||
long_term_memory=selection_data['long_term_memory'],
|
||||
include_extracted_content_only_once=True,
|
||||
)
|
||||
else:
|
||||
# Fallback to regular error
|
||||
error_msg = selection_data.get('error', f'Failed to select option: {params.text}')
|
||||
return ActionResult(error=error_msg)
|
||||
|
||||
# File System Actions
|
||||
@self.registry.action(
|
||||
@@ -831,7 +818,7 @@ Provide the extracted information in a clear, structured format."""
|
||||
else:
|
||||
result = await file_system.write_file(file_name, content)
|
||||
logger.info(f'💾 {result}')
|
||||
return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result)
|
||||
return ActionResult(extracted_content=result, long_term_memory=result)
|
||||
|
||||
@self.registry.action(
|
||||
'Replace old_str with new_str in file_name. old_str must exactly match the string to replace in original text. Recommended tool to mark completed items in todo.md or change specific contents in a file.'
|
||||
@@ -839,7 +826,7 @@ Provide the extracted information in a clear, structured format."""
|
||||
async def replace_file_str(file_name: str, old_str: str, new_str: str, file_system: FileSystem):
|
||||
result = await file_system.replace_file_str(file_name, old_str, new_str)
|
||||
logger.info(f'💾 {result}')
|
||||
return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result)
|
||||
return ActionResult(extracted_content=result, long_term_memory=result)
|
||||
|
||||
@self.registry.action('Read file_name from file system')
|
||||
async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem):
|
||||
@@ -866,7 +853,6 @@ Provide the extracted information in a clear, structured format."""
|
||||
logger.info(f'💾 {memory}')
|
||||
return ActionResult(
|
||||
extracted_content=result,
|
||||
include_in_memory=True,
|
||||
long_term_memory=memory,
|
||||
include_extracted_content_only_once=True,
|
||||
)
|
||||
@@ -1001,12 +987,16 @@ Provide the extracted information in a clear, structured format."""
|
||||
sensitive_data=sensitive_data,
|
||||
available_file_paths=available_file_paths,
|
||||
)
|
||||
except BrowserError as e:
|
||||
logger.error(f'❌ Action {action_name} failed with BrowserError: {str(e)}')
|
||||
result = handle_browser_error(e)
|
||||
except TimeoutError as e:
|
||||
logger.error(f'❌ Action {action_name} failed with TimeoutError: {str(e)}')
|
||||
result = ActionResult(error=f'{action_name} was not executed due to timeout.')
|
||||
except Exception as e:
|
||||
# Log the original exception with traceback for observability
|
||||
logger.error(f"Action '{action_name}' failed")
|
||||
# Extract clean error message from llm_error_msg tags if present
|
||||
clean_msg = extract_llm_error_message(e)
|
||||
result = ActionResult(error=clean_msg)
|
||||
logger.error(f"Action '{action_name}' failed with error: {str(e)}")
|
||||
result = ActionResult(error=str(e))
|
||||
|
||||
if Laminar is not None:
|
||||
Laminar.set_span_output(result)
|
||||
|
||||
@@ -43,15 +43,10 @@ class StructuredOutputAction(BaseModel, Generic[T]):
|
||||
|
||||
|
||||
class SwitchTabAction(BaseModel):
|
||||
url: str | None = Field(
|
||||
default=None,
|
||||
description='URL or URL substring of the tab to switch to, if not provided, the tab_id or most recently opened tab will be used',
|
||||
)
|
||||
tab_id: str | None = Field(
|
||||
default=None,
|
||||
tab_id: str = Field(
|
||||
min_length=4,
|
||||
max_length=4,
|
||||
description='exact 4 character Tab ID to match instead of URL, prefer using this if known',
|
||||
description='Last 4 chars of TargetID',
|
||||
) # last 4 chars of TargetID
|
||||
|
||||
|
||||
|
||||
@@ -129,31 +129,6 @@
|
||||
{
|
||||
"tab": "Cloud",
|
||||
"versions": [
|
||||
{
|
||||
"version": "v2",
|
||||
"groups": [
|
||||
{
|
||||
"group": "Get Started",
|
||||
"pages": [
|
||||
"cloud/v2/quickstart",
|
||||
"cloud/v2/python-quickstart",
|
||||
"cloud/v2/node-quickstart"
|
||||
]
|
||||
},
|
||||
{
|
||||
"group": "Platform",
|
||||
"pages": [
|
||||
"cloud/v1/pricing",
|
||||
"cloud/v1/n8n-browser-use-integration",
|
||||
"cloud/v1/search"
|
||||
]
|
||||
},
|
||||
{
|
||||
"group": "REST API reference",
|
||||
"openapi": "https://app.stainless.com/api/spec/documented/browser-use/openapi.documented.yml"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"version": "v1",
|
||||
"groups": [
|
||||
@@ -180,6 +155,27 @@
|
||||
"openapi": "https://api.browser-use.com/api/v1/openapi.json"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"version": "v2",
|
||||
"groups": [
|
||||
{
|
||||
"group": "Get Started",
|
||||
"pages": [
|
||||
"cloud/v2/quickstart",
|
||||
"cloud/v2/python-quickstart",
|
||||
"cloud/v2/node-quickstart"
|
||||
]
|
||||
},
|
||||
{
|
||||
"group": "Platform",
|
||||
"pages": [
|
||||
"cloud/v1/pricing",
|
||||
"cloud/v1/n8n-browser-use-integration",
|
||||
"cloud/v1/search"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ dependencies = [
|
||||
"aiofiles>=24.1.0",
|
||||
"aiohttp==3.12.15",
|
||||
"anyio>=4.9.0",
|
||||
"bubus>=1.5.4",
|
||||
"bubus>=1.5.6",
|
||||
"google-api-core>=2.25.0",
|
||||
"httpx>=0.28.1",
|
||||
"markdownify==1.1.0",
|
||||
|
||||
@@ -17,11 +17,7 @@ import aiofiles
|
||||
import yaml
|
||||
from pydantic import BaseModel
|
||||
|
||||
from browser_use.agent.service import Agent
|
||||
from browser_use.agent.views import AgentHistoryList
|
||||
from browser_use.browser.profile import BrowserProfile
|
||||
from browser_use.browser.session import BrowserSession
|
||||
from browser_use.llm import ChatOpenAI
|
||||
from browser_use import Agent, AgentHistoryList, BrowserProfile, BrowserSession, ChatOpenAI
|
||||
from browser_use.llm.messages import UserMessage
|
||||
|
||||
# --- CONFIG ---
|
||||
|
||||
@@ -185,11 +185,11 @@ class TestClickElementEvent:
|
||||
# Verify the result structure
|
||||
assert isinstance(result, ActionResult), 'Result should be an ActionResult instance'
|
||||
assert result.error is None, f'Expected no error but got: {result.error}'
|
||||
|
||||
result_text = result.extracted_content or result.long_term_memory
|
||||
# Core logic validation: Verify click was successful
|
||||
assert result.extracted_content is not None
|
||||
assert f'Clicked element with index {button_index}' in result.extracted_content, (
|
||||
f'Expected click confirmation in result content, got: {result.extracted_content}'
|
||||
assert result_text is not None
|
||||
assert f'Clicked element with index {button_index}' in result_text, (
|
||||
f'Expected click confirmation in result content, got: {result_text}'
|
||||
)
|
||||
# Note: The click action doesn't include button text in the result, only the index
|
||||
|
||||
@@ -260,7 +260,11 @@ class TestClickElementEvent:
|
||||
|
||||
# Verify the result
|
||||
assert isinstance(result, ActionResult)
|
||||
assert result.extracted_content is not None
|
||||
result_text = result.extracted_content or result.long_term_memory
|
||||
assert result_text is not None
|
||||
assert f'Clicked element with index {link_index}' in result_text, (
|
||||
f'Expected click confirmation in result content, got: {result_text}'
|
||||
)
|
||||
|
||||
# Verify that a new tab was opened
|
||||
tabs = await browser_session.get_tabs()
|
||||
|
||||
@@ -104,7 +104,6 @@ class TestScrollActions:
|
||||
assert result.extracted_content is not None
|
||||
assert 'Scrolled down' in result.extracted_content
|
||||
assert 'the page' in result.extracted_content
|
||||
assert result.include_in_memory is True
|
||||
|
||||
# Test 2: Basic page scroll up
|
||||
scroll_up_action = {'scroll': ScrollAction(down=False, num_pages=0.5)}
|
||||
@@ -123,7 +122,7 @@ class TestScrollActions:
|
||||
# This should fail with error about element not found
|
||||
assert isinstance(result, ActionResult)
|
||||
assert result.error is not None, 'Expected error for invalid element index'
|
||||
assert 'Element index 999 not found' in result.error or 'Failed to scroll' in result.error
|
||||
assert 'Element index 999 not found' in result.error or 'Failed to execute scroll' in result.error
|
||||
|
||||
# Test 4: Model parameter validation
|
||||
scroll_with_index = ScrollAction(down=True, num_pages=1.0, frame_element_index=5)
|
||||
|
||||
@@ -394,10 +394,10 @@ class TestScreenshotEventSystem:
|
||||
|
||||
# Test the NEW event-driven path: direct event dispatching
|
||||
event = browser_session.event_bus.dispatch(ScreenshotEvent(full_page=False))
|
||||
screenshot_result = (await event.event_result()) or {}
|
||||
assert screenshot_result.get('screenshot')
|
||||
assert isinstance(screenshot_result['screenshot'], str)
|
||||
assert len(base64.b64decode(screenshot_result['screenshot'])) > 5000
|
||||
screenshot_b64 = await event.event_result()
|
||||
assert screenshot_b64 is not None
|
||||
assert isinstance(screenshot_b64, str)
|
||||
assert len(base64.b64decode(screenshot_b64)) > 5000
|
||||
|
||||
finally:
|
||||
await browser_session.kill()
|
||||
|
||||
Reference in New Issue
Block a user