Merge branch 'main' into os-font-fix

This commit is contained in:
Mert Unsal
2025-08-31 00:48:49 +02:00
committed by GitHub
24 changed files with 1653 additions and 769 deletions

View File

@@ -172,6 +172,11 @@ jobs:
const score = `${passed}/${total}`;
const percentage = Math.round((passed / total) * 100);
// Fail the workflow if 0% pass rate
if (percentage === 0) {
core.setFailed(`Evaluation failed: 0% pass rate (${passed}/${total})`);
}
// Create detailed table
let tableRows = '';
detailedResults.forEach(result => {

View File

@@ -212,10 +212,16 @@ class MessageManager:
# Build the history item
if model_output is None:
# Only add error history item if we have a valid step number
if step_number is not None and step_number > 0:
history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.')
self.state.agent_history_items.append(history_item)
# Add history item for initial actions (step 0) or errors (step > 0)
if step_number is not None:
if step_number == 0 and action_results:
# Step 0 with initial action results
history_item = HistoryItem(step_number=step_number, action_results=action_results)
self.state.agent_history_items.append(history_item)
elif step_number > 0:
# Error case for steps > 0
history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.')
self.state.agent_history_items.append(history_item)
else:
history_item = HistoryItem(
step_number=step_number,

View File

@@ -270,8 +270,19 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Action setup
self._setup_action_models()
self._set_browser_use_version_and_source(source)
self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None
initial_url = None
# only load url if no initial actions are provided
if self.directly_open_url and not self.state.follow_up_task and not initial_actions:
initial_url = self._extract_url_from_task(self.task)
if initial_url:
self.logger.info(f'🔗 Found URL in task: {initial_url}, adding as initial action...')
initial_actions = [{'go_to_url': {'url': initial_url, 'new_tab': False}}]
self.initial_url = initial_url
self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None
# Verify we can connect to the model
self._verify_and_setup_llm()
@@ -588,7 +599,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if hasattr(self, 'cloud_sync') and self.cloud_sync and self.enable_cloud_sync:
self.eventbus.on('*', self.cloud_sync.handle_event)
@observe_debug(ignore_input=True, ignore_output=True, name='_raise_if_stopped_or_paused')
async def _raise_if_stopped_or_paused(self) -> None:
"""Utility function that raises an InterruptedError if the agent is stopped or paused."""
@@ -635,14 +645,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.debug(f'🌐 Step {self.state.n_steps}: Getting browser state...')
# Always take screenshots for all steps
# Use caching based on directly_open_url setting - if directly_open_url is False, don't use cached state
is_first_step = self.state.n_steps in (0, 1)
use_cache = is_first_step and self.directly_open_url
self.logger.debug(f'📸 Requesting browser state with include_screenshot=True, cached={use_cache}')
self.logger.debug('📸 Requesting browser state with include_screenshot=True')
browser_state_summary = await self.browser_session.get_browser_state_summary(
cache_clickable_elements_hashes=True,
include_screenshot=True, # always capture even if use_vision=False so that cloud sync is useful (it's fast now anyway)
cached=use_cache,
include_recent_events=self.include_recent_events,
)
if browser_state_summary.screenshot:
@@ -1160,7 +1166,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
unique_urls = list(set(found_urls))
# If multiple URLs found, skip directly_open_urling
if len(unique_urls) > 1:
self.logger.debug(f'📍 Multiple URLs found ({len(found_urls)}), skipping directly_open_url to avoid ambiguity')
self.logger.debug(f'Multiple URLs found ({len(found_urls)}), skipping directly_open_url to avoid ambiguity')
return None
# If exactly one URL found, return it
@@ -1239,45 +1245,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.debug('🔧 Browser session started with watchdogs attached')
# Check if task contains a URL and add it as an initial action (only if directly_open_url is enabled)
if self.directly_open_url and not self.state.follow_up_task:
initial_url = self._extract_url_from_task(self.task)
if initial_url:
self.logger.info(f'🔗 Found URL in task: {initial_url}, adding as initial action...')
# Ensure browser focus is properly established before executing initial actions
if self.browser_session and self.browser_session.agent_focus:
self.logger.debug(f'🎯 Browser focus established on target: {self.browser_session.agent_focus.target_id[-4:]}')
else:
self.logger.warning('⚠️ No browser focus established, may cause navigation issues')
# Create a go_to_url action for the initial URL
go_to_url_action = {
'go_to_url': {
'url': initial_url,
'new_tab': False, # Navigate in current tab
}
}
# Add to initial_actions or create new list if none exist
if self.initial_actions:
# Convert back to dict format, prepend URL navigation, then convert back
initial_actions_dicts = []
for action in self.initial_actions:
action_data = action.model_dump(exclude_unset=True)
initial_actions_dicts.append(action_data)
# Prepend the go_to_url action
initial_actions_dicts = [go_to_url_action] + initial_actions_dicts
# Convert back to ActionModel instances
self.initial_actions = self._convert_initial_actions(initial_actions_dicts)
else:
# Create new initial_actions with just the go_to_url
self.initial_actions = self._convert_initial_actions([go_to_url_action])
self.logger.debug(f'✅ Added navigation to {initial_url} as initial action')
# Execute initial actions if provided
if self.initial_actions and not self.state.follow_up_task:
self.logger.debug(f'⚡ Executing {len(self.initial_actions)} initial actions...')
result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
self.state.last_result = result
self.logger.debug('✅ Initial actions completed')
await self._execute_initial_actions()
self.logger.debug(f'🔄 Starting main execution loop with max {max_steps} steps...')
for step in range(max_steps):
@@ -1519,6 +1493,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
new_element_hashes = {e.parent_branch_hash() for e in new_selector_map.values()}
if check_for_new_elements and not new_element_hashes.issubset(cached_element_hashes):
# next action requires index but there are new elements on the page
# log difference in len debug
self.logger.debug(f'New elements: {abs(len(new_element_hashes) - len(cached_element_hashes))}')
remaining_actions_str = get_remaining_actions_str(actions, i)
msg = f'Something new appeared after action {i} / {total_actions}: actions {remaining_actions_str} were not executed'
logger.info(msg)
@@ -1653,6 +1629,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
return results
async def _execute_initial_actions(self) -> None:
# Execute initial actions if provided
if self.initial_actions and not self.state.follow_up_task:
self.logger.debug(f'⚡ Executing {len(self.initial_actions)} initial actions...')
result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
# Update the first result to mention that it was automatically loaded
if result and self.initial_url and result[0].long_term_memory:
result[0].long_term_memory = f'Found initial url and automatically loaded it. {result[0].long_term_memory}'
self.state.last_result = result
self.logger.debug('Initial actions completed')
async def _execute_history_step(self, history_item: AgentHistory, delay: float) -> list[ActionResult]:
"""Execute a single step from history with element validation"""
assert self.browser_session is not None, 'BrowserSession is not set up'

View File

@@ -583,9 +583,14 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
# --- UI/viewport/DOM ---
highlight_elements: bool = Field(default=True, description='Highlight interactive elements on the page.')
filter_highlight_ids: bool = Field(
default=True, description='Only show element IDs in highlights if llm_representation is less than 10 characters.'
)
# --- Downloads ---
auto_download_pdfs: bool = Field(default=True, description='Automatically download PDFs when navigating to PDF viewer pages.')
auto_download_pdfs: bool = Field(
default=False, description='Automatically download PDFs when navigating to PDF viewer pages.'
)
profile_directory: str = 'Default' # e.g. 'Profile 1', 'Profile 2', 'Custom Profile', etc.

View File

@@ -0,0 +1,476 @@
"""Python-based highlighting system for drawing bounding boxes on screenshots.
This module replaces JavaScript-based highlighting with fast Python image processing
to draw bounding boxes around interactive elements directly on screenshots.
"""
import base64
import io
import logging
from PIL import Image, ImageDraw, ImageFont
from browser_use.dom.views import DOMSelectorMap
from browser_use.observability import observe_debug
from browser_use.utils import time_execution_async
logger = logging.getLogger(__name__)
# Color scheme for different element types
ELEMENT_COLORS = {
'button': '#FF6B6B', # Red for buttons
'input': '#4ECDC4', # Teal for inputs
'select': '#45B7D1', # Blue for dropdowns
'a': '#96CEB4', # Green for links
'textarea': '#FF8C42', # Orange for text areas (was yellow, now more visible)
'default': '#DDA0DD', # Light purple for other interactive elements
}
# Element type mappings
ELEMENT_TYPE_MAP = {
'button': 'button',
'input': 'input',
'select': 'select',
'a': 'a',
'textarea': 'textarea',
}
def get_element_color(tag_name: str, element_type: str | None = None) -> str:
"""Get color for element based on tag name and type."""
# Check input type first
if tag_name == 'input' and element_type:
if element_type in ['button', 'submit']:
return ELEMENT_COLORS['button']
# Use tag-based color
return ELEMENT_COLORS.get(tag_name.lower(), ELEMENT_COLORS['default'])
def should_show_index_overlay(element_index: int | None) -> bool:
"""Determine if index overlay should be shown."""
return element_index is not None
def draw_enhanced_bounding_box_with_text(
    draw,  # ImageDraw.Draw - avoiding type annotation due to PIL typing issues
    bbox: tuple[int, int, int, int],
    color: str,
    text: str | None = None,
    font: ImageFont.FreeTypeFont | None = None,
    element_type: str = 'div',
    image_size: tuple[int, int] = (2000, 1500),
) -> None:
    """Draw a dashed bounding box and, optionally, a large index label.

    Args:
        draw: PIL ``ImageDraw`` object to render onto.
        bbox: Box as ``(x1, y1, x2, y2)`` in image (device-pixel) coordinates.
        color: Hex color string for the box outline and label background.
        text: Index label to render; skipped when ``None`` or empty.
        font: Caller-provided fallback font used when no system TrueType font loads.
        element_type: Tag/type of the element (currently unused when drawing).
        image_size: ``(width, height)`` of the target image; used to scale the
            label font/padding and to keep the label inside the image.
    """
    x1, y1, x2, y2 = bbox

    # Dash pattern for the box outline: 4px dash, 8px gap, 2px stroke.
    dash_length = 4
    gap_length = 8
    line_width = 2

    def draw_dashed_line(start_x, start_y, end_x, end_y):
        """Draw a dashed axis-aligned segment between two points.

        Endpoints are normalized so the dash loop always advances: the previous
        version silently drew nothing when start > end, which lost the bottom
        and left edges of the rectangle.
        """
        if start_x == end_x:  # Vertical line
            y_lo, y_hi = sorted((start_y, end_y))
            y = y_lo
            while y < y_hi:
                dash_end = min(y + dash_length, y_hi)
                draw.line([(start_x, y), (start_x, dash_end)], fill=color, width=line_width)
                y += dash_length + gap_length
        else:  # Horizontal line
            x_lo, x_hi = sorted((start_x, end_x))
            x = x_lo
            while x < x_hi:
                dash_end = min(x + dash_length, x_hi)
                draw.line([(x, start_y), (dash_end, start_y)], fill=color, width=line_width)
                x += dash_length + gap_length

    # Draw all four edges of the dashed rectangle.
    draw_dashed_line(x1, y1, x2, y1)  # Top
    draw_dashed_line(x2, y1, x2, y2)  # Right
    draw_dashed_line(x1, y2, x2, y2)  # Bottom
    draw_dashed_line(x1, y1, x1, y2)  # Left

    # Draw the index overlay, scaled to the viewport, if we have label text.
    if text:
        try:
            img_width, img_height = image_size
            # Font size scales with viewport width (~3%), clamped to 16-48px
            # (36px for a 1200px viewport) for consistent appearance.
            base_font_size = max(16, min(48, int(img_width * 0.03)))
            big_font = None
            try:
                big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', base_font_size)
            except OSError:
                try:
                    big_font = ImageFont.truetype('arial.ttf', base_font_size)
                except OSError:
                    # Try system fonts on different platforms
                    try:
                        big_font = ImageFont.truetype('Arial Bold.ttf', base_font_size)
                    except OSError:
                        big_font = font  # Fallback to the caller-provided font

            # Measure the label with whichever font we ended up with.
            if big_font:
                bbox_text = draw.textbbox((0, 0), text, font=big_font)
            else:
                bbox_text = draw.textbbox((0, 0), text)
            text_width = bbox_text[2] - bbox_text[0]
            text_height = bbox_text[3] - bbox_text[1]

            # Padding also scales with viewport width (~0.5%).
            padding = max(4, int(img_width * 0.005))
            element_width = x2 - x1
            element_height = y2 - y1
            container_width = text_width + padding * 2
            container_height = text_height + padding * 2

            # Place the label inside the top-left corner when it fits,
            # otherwise just above the element's top-left corner.
            if element_width >= container_width and element_height >= container_height:
                bg_x1 = x1 + 2  # Small offset from edge
                bg_y1 = y1 + 2
            else:
                bg_x1 = x1
                bg_y1 = max(0, y1 - container_height)
            bg_x2 = bg_x1 + container_width
            bg_y2 = bg_y1 + container_height

            # Center the label in its container; subtracting the text bbox's
            # top offset compensates for the font baseline so digits aren't clipped.
            text_x = bg_x1 + (container_width - text_width) // 2
            text_y = bg_y1 + (container_height - text_height) // 2 - bbox_text[1]

            # Shift the container (and label) back inside the image bounds.
            if bg_x1 < 0:
                offset = -bg_x1
                bg_x1 += offset
                bg_x2 += offset
                text_x += offset
            if bg_y1 < 0:
                offset = -bg_y1
                bg_y1 += offset
                bg_y2 += offset
                text_y += offset
            if bg_x2 > img_width:
                offset = bg_x2 - img_width
                bg_x1 -= offset
                bg_x2 -= offset
                text_x -= offset
            if bg_y2 > img_height:
                offset = bg_y2 - img_height
                bg_y1 -= offset
                bg_y2 -= offset
                text_y -= offset

            # Filled background with a white border, then white label text.
            draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill=color, outline='white', width=2)
            draw.text((text_x, text_y), text, fill='white', font=big_font or font)
        except Exception as e:
            logger.debug(f'Failed to draw enhanced text overlay: {e}')
def draw_bounding_box_with_text(
    draw,  # ImageDraw.Draw - avoiding type annotation due to PIL typing issues
    bbox: tuple[int, int, int, int],
    color: str,
    text: str | None = None,
    font: ImageFont.FreeTypeFont | None = None,
    image_size: tuple[int, int] = (1200, 800),
) -> None:
    """Draw a dashed bounding box with an optional high-contrast index label.

    Args:
        draw: PIL ``ImageDraw`` object to render onto.
        bbox: Box as ``(x1, y1, x2, y2)`` in image coordinates.
        color: Color for the dashed outline.
        text: Optional index label; its position depends on the element's size.
        font: Optional font for the label (PIL default font when ``None``).
        image_size: ``(width, height)`` used to keep the label on-screen.
            Defaults to the previously hard-coded 1200x800 viewport so
            existing callers see identical behavior.
    """
    x1, y1, x2, y2 = bbox
    img_width, img_height = image_size

    # Dash pattern; each dash is drawn twice, one pixel apart, to thicken the stroke.
    dash_length = 2
    gap_length = 6

    # Top edge
    x = x1
    while x < x2:
        end_x = min(x + dash_length, x2)
        draw.line([(x, y1), (end_x, y1)], fill=color, width=2)
        draw.line([(x, y1 + 1), (end_x, y1 + 1)], fill=color, width=2)
        x += dash_length + gap_length
    # Bottom edge
    x = x1
    while x < x2:
        end_x = min(x + dash_length, x2)
        draw.line([(x, y2), (end_x, y2)], fill=color, width=2)
        draw.line([(x, y2 - 1), (end_x, y2 - 1)], fill=color, width=2)
        x += dash_length + gap_length
    # Left edge
    y = y1
    while y < y2:
        end_y = min(y + dash_length, y2)
        draw.line([(x1, y), (x1, end_y)], fill=color, width=2)
        draw.line([(x1 + 1, y), (x1 + 1, end_y)], fill=color, width=2)
        y += dash_length + gap_length
    # Right edge
    y = y1
    while y < y2:
        end_y = min(y + dash_length, y2)
        draw.line([(x2, y), (x2, end_y)], fill=color, width=2)
        draw.line([(x2 - 1, y), (x2 - 1, end_y)], fill=color, width=2)
        y += dash_length + gap_length

    # Draw index overlay if we have index text.
    if not text:
        return
    try:
        # Measure the label.
        if font:
            bbox_text = draw.textbbox((0, 0), text, font=font)
        else:
            # Fallback for default font
            bbox_text = draw.textbbox((0, 0), text)
        text_width = bbox_text[2] - bbox_text[0]
        text_height = bbox_text[3] - bbox_text[1]

        # Choose a position based on the element's size relative to the label.
        padding = 5
        element_width = x2 - x1
        element_height = y2 - y1
        element_area = element_width * element_height
        index_box_area = (text_width + padding * 2) * (text_height + padding * 2)
        size_ratio = element_area / max(index_box_area, 1)

        if size_ratio < 4:
            # Very small elements: place outside, in the bottom-right corner.
            text_x = x2 + padding
            text_y = y2 - text_height
            # Keep the label on-screen (bound was previously hard-coded to 1200px).
            text_x = min(text_x, img_width - text_width - padding)
            text_y = max(text_y, 0)
        elif size_ratio < 16:
            # Medium elements: bottom-right corner, inside the box.
            text_x = x2 - text_width - padding
            text_y = y2 - text_height - padding
        else:
            # Large elements: centered.
            text_x = x1 + (element_width - text_width) // 2
            text_y = y1 + (element_height - text_height) // 2

        # Clamp the label into the image (bounds were previously 1200x800).
        text_x = max(0, min(text_x, img_width - text_width))
        text_y = max(0, min(text_y, img_height - text_height))

        # White background with a thick black border for maximum contrast.
        bg_x1 = text_x - padding
        bg_y1 = text_y - padding
        bg_x2 = text_x + text_width + padding
        bg_y2 = text_y + text_height + padding
        draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill='white', outline='black', width=2)
        # Bold dark text on the light background for best contrast.
        draw.text((text_x, text_y), text, fill='black', font=font)
    except Exception as e:
        logger.debug(f'Failed to draw text overlay: {e}')
def process_element_highlight(
    element_id: int,
    element,
    draw,
    device_pixel_ratio: float,
    font,
    filter_highlight_ids: bool,
    image_size: tuple[int, int],
) -> None:
    """Render the highlight (dashed box plus optional index label) for one element.

    Any failure is swallowed and logged at debug level so a single bad element
    cannot break the whole screenshot.
    """
    try:
        bounds = element.absolute_position
        if not bounds:
            return

        # Coordinates arrive in CSS pixels; the screenshot is captured at
        # device-pixel resolution, so scale by the device pixel ratio.
        img_width, img_height = image_size
        left = int(bounds.x * device_pixel_ratio)
        top = int(bounds.y * device_pixel_ratio)
        right = int((bounds.x + bounds.width) * device_pixel_ratio)
        bottom = int((bounds.y + bounds.height) * device_pixel_ratio)

        # Clamp the box into the image, keeping right >= left and bottom >= top.
        left = max(0, min(left, img_width))
        top = max(0, min(top, img_height))
        right = max(left, min(right, img_width))
        bottom = max(top, min(bottom, img_height))

        # Nothing visible to draw for (near-)degenerate boxes.
        if right - left < 2 or bottom - top < 2:
            return

        # Color is chosen from the tag name and, for inputs, the type attribute.
        tag_name = element.tag_name if hasattr(element, 'tag_name') else 'div'
        attributes = getattr(element, 'attributes', None)
        element_type = attributes.get('type') if attributes else None
        color = get_element_color(tag_name, element_type)

        # Decide whether to draw the numeric index label.
        element_index = getattr(element, 'element_index', None)
        index_text = None
        if element_index is not None:
            if not filter_highlight_ids:
                # Always show the ID when filtering is disabled.
                index_text = str(element_index)
            elif len(element.get_meaningful_text_for_llm()) < 5:
                # Label only elements whose LLM-visible text is too short to
                # identify them on its own.
                # NOTE(review): threshold here is 5 chars, but the profile
                # option describes 10 — confirm which is intended.
                index_text = str(element_index)

        draw_enhanced_bounding_box_with_text(
            draw, (left, top, right, bottom), color, index_text, font, tag_name, image_size
        )
    except Exception as e:
        logger.debug(f'Failed to draw highlight for element {element_id}: {e}')
@observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot')
@time_execution_async('create_highlighted_screenshot')
async def create_highlighted_screenshot(
    screenshot_b64: str,
    selector_map: DOMSelectorMap,
    device_pixel_ratio: float = 1.0,
    viewport_offset_x: int = 0,
    viewport_offset_y: int = 0,
    filter_highlight_ids: bool = True,
) -> str:
    """Overlay bounding boxes for interactive elements onto a screenshot.

    Args:
        screenshot_b64: Base64 encoded screenshot.
        selector_map: Map of interactive elements with their positions.
        device_pixel_ratio: Device pixel ratio for scaling coordinates.
        viewport_offset_x: X offset for viewport positioning (not read by this body).
        viewport_offset_y: Y offset for viewport positioning (not read by this body).
        filter_highlight_ids: When True, only label elements with short LLM-visible text.

    Returns:
        Base64 encoded highlighted screenshot; the original screenshot is
        returned unchanged if anything goes wrong.
    """
    try:
        raw = base64.b64decode(screenshot_b64)
        image = Image.open(io.BytesIO(raw)).convert('RGBA')
        draw = ImageDraw.Draw(image)

        # Best-effort font loading: the first system font that exists wins;
        # fall back to PIL's built-in default font (None) otherwise.
        font = None
        for candidate in ('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 'arial.ttf'):
            try:
                font = ImageFont.truetype(candidate, 12)
                break
            except OSError:
                continue

        # PIL's ImageDraw is not thread-safe, so elements are drawn one at a time.
        for element_id, element in selector_map.items():
            process_element_highlight(
                element_id, element, draw, device_pixel_ratio, font, filter_highlight_ids, image.size
            )

        # Re-encode the annotated image back to base64 PNG.
        buffer = io.BytesIO()
        image.save(buffer, format='PNG')
        buffer.seek(0)
        encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
        logger.debug(f'Successfully created highlighted screenshot with {len(selector_map)} elements')
        return encoded
    except Exception as e:
        logger.error(f'Failed to create highlighted screenshot: {e}')
        # Return original screenshot on error
        return screenshot_b64
async def get_viewport_info_from_cdp(cdp_session) -> tuple[float, int, int]:
    """Read viewport metrics from a CDP session via ``Page.getLayoutMetrics``.

    The device pixel ratio is derived as visual-viewport width divided by
    CSS-viewport width; scroll offsets come from the CSS visual viewport.

    Returns:
        ``(device_pixel_ratio, scroll_x, scroll_y)``; falls back to
        ``(1.0, 0, 0)`` when the CDP call fails.
    """
    try:
        metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)

        visual = metrics.get('visualViewport', {})
        css_visual = metrics.get('cssVisualViewport', {})
        css_layout = metrics.get('cssLayoutViewport', {})

        # Ratio of device-pixel width to CSS-pixel width (guard against /0).
        css_width = css_visual.get('clientWidth', css_layout.get('clientWidth', 1280.0))
        device_width = visual.get('clientWidth', css_width)
        ratio = device_width / css_width if css_width > 0 else 1.0

        # Scroll position in CSS pixels.
        scroll_x = int(css_visual.get('pageX', 0))
        scroll_y = int(css_visual.get('pageY', 0))
        return float(ratio), scroll_x, scroll_y
    except Exception as e:
        logger.debug(f'Failed to get viewport info from CDP: {e}')
        return 1.0, 0, 0
@observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot_async')
@time_execution_async('create_highlighted_screenshot_async')
async def create_highlighted_screenshot_async(
    screenshot_b64: str, selector_map: DOMSelectorMap, cdp_session=None, filter_highlight_ids: bool = True
) -> str:
    """Convenience wrapper that pulls viewport info from CDP before highlighting.

    Args:
        screenshot_b64: Base64 encoded screenshot.
        selector_map: Map of interactive elements.
        cdp_session: Optional CDP session used to read the device pixel ratio
            and scroll offsets.
        filter_highlight_ids: Forwarded to ``create_highlighted_screenshot``.

    Returns:
        Base64 encoded highlighted screenshot.
    """
    # Defaults used when no CDP session is available or the lookup fails.
    device_pixel_ratio, offset_x, offset_y = 1.0, 0, 0
    if cdp_session:
        try:
            device_pixel_ratio, offset_x, offset_y = await get_viewport_info_from_cdp(cdp_session)
        except Exception as e:
            logger.debug(f'Failed to get viewport info from CDP: {e}')

    return await create_highlighted_screenshot(
        screenshot_b64, selector_map, device_pixel_ratio, offset_x, offset_y, filter_highlight_ids
    )

View File

@@ -39,7 +39,8 @@ from browser_use.browser.events import (
from browser_use.browser.profile import BrowserProfile, ProxySettings
from browser_use.browser.views import BrowserStateSummary, TabInfo
from browser_use.dom.views import EnhancedDOMTreeNode, TargetInfo
from browser_use.utils import _log_pretty_url, is_new_tab_page
from browser_use.observability import observe_debug
from browser_use.utils import _log_pretty_url, is_new_tab_page, time_execution_async
DEFAULT_BROWSER_PROFILE = BrowserProfile()
@@ -264,6 +265,7 @@ class BrowserSession(BaseModel):
wait_for_network_idle_page_load_time: float | None = None,
wait_between_actions: float | None = None,
highlight_elements: bool | None = None,
filter_highlight_ids: bool | None = None,
auto_download_pdfs: bool | None = None,
profile_directory: str | None = None,
):
@@ -536,6 +538,18 @@ class BrowserSession(BaseModel):
target_id = None
# If new_tab=True but we're already in a new tab, set new_tab=False
if event.new_tab:
try:
current_url = await self.get_current_page_url()
from browser_use.utils import is_new_tab_page
if is_new_tab_page(current_url):
self.logger.debug(f'[on_NavigateToUrlEvent] Already in new tab ({current_url}), setting new_tab=False')
event.new_tab = False
except Exception as e:
self.logger.debug(f'[on_NavigateToUrlEvent] Could not check current URL: {e}')
# check if the url is already open in a tab somewhere that we're not currently on, if so, short-circuit and just switch to it
targets = await self._cdp_get_all_pages()
for target in targets:
@@ -584,10 +598,18 @@ class BrowserSession(BaseModel):
# Use current tab
target_id = target_id or self.agent_focus.target_id
# Activate target (bring to foreground)
await self.event_bus.dispatch(SwitchTabEvent(target_id=target_id))
# which does this for us:
# self.agent_focus = await self.get_or_create_cdp_session(target_id)
# Only switch tab if we're not already on the target tab
if self.agent_focus is None or self.agent_focus.target_id != target_id:
self.logger.debug(
f'[on_NavigateToUrlEvent] Switching to target tab {target_id[-4:]} (current: {self.agent_focus.target_id[-4:] if self.agent_focus else "none"})'
)
# Activate target (bring to foreground)
await self.event_bus.dispatch(SwitchTabEvent(target_id=target_id))
# which does this for us:
# self.agent_focus = await self.get_or_create_cdp_session(target_id)
else:
self.logger.debug(f'[on_NavigateToUrlEvent] Already on target tab {target_id[-4:]}, skipping SwitchTabEvent')
assert self.agent_focus is not None and self.agent_focus.target_id == target_id, (
'Agent focus not updated to new target_id after SwitchTabEvent should have switched to it'
)
@@ -605,8 +627,8 @@ class BrowserSession(BaseModel):
session_id=self.agent_focus.session_id,
)
# Wait a bit to ensure page starts loading
await asyncio.sleep(0.5)
# # Wait a bit to ensure page starts loading
# await asyncio.sleep(0.5)
# Dispatch navigation complete
self.logger.debug(f'Dispatching NavigationCompleteEvent for {event.url} (tab #{target_id[-4:]})')
@@ -678,8 +700,8 @@ class BrowserSession(BaseModel):
"""Handle tab closure - update focus if needed."""
cdp_session = await self.get_or_create_cdp_session(target_id=None, focus=False)
await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': event.target_id})
await self.event_bus.dispatch(TabClosedEvent(target_id=event.target_id))
await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': event.target_id})
async def on_TabClosedEvent(self, event: TabClosedEvent) -> None:
"""Handle tab closure - update focus if needed."""
@@ -791,6 +813,8 @@ class BrowserSession(BaseModel):
assert self._cdp_client_root is not None, 'CDP client not initialized - browser may not be connected yet'
return self._cdp_client_root
@time_execution_async('get_or_create_cdp_session')
@observe_debug(ignore_input=True, ignore_output=True, name='get_or_create_cdp_session')
async def get_or_create_cdp_session(
self, target_id: TargetID | None = None, focus: bool = True, new_socket: bool | None = None
) -> CDPSession:
@@ -845,6 +869,8 @@ class BrowserSession(BaseModel):
cdp_url=self.cdp_url if should_use_new_socket else None,
)
self._cdp_session_pool[target_id] = session
# log length of _cdp_session_pool
self.logger.debug(f'[get_or_create_cdp_session] new _cdp_session_pool length: {len(self._cdp_session_pool)}')
# Only change agent focus if requested
if focus:
@@ -870,7 +896,7 @@ class BrowserSession(BaseModel):
return self.agent_focus.session_id if self.agent_focus else None
# ========== Helper Methods ==========
@observe_debug(ignore_input=True, ignore_output=True, name='get_browser_state_summary')
async def get_browser_state_summary(
self,
cache_clickable_elements_hashes: bool = True,
@@ -1321,6 +1347,7 @@ class BrowserSession(BaseModel):
except Exception as e:
self.logger.debug(f'Skipping proxy auth setup: {type(e).__name__}: {e}')
@observe_debug(ignore_input=True, ignore_output=True, name='get_tabs')
async def get_tabs(self) -> list[TabInfo]:
"""Get information about all open tabs using CDP Target.getTargetInfo for speed."""
tabs = []
@@ -1399,6 +1426,7 @@ class BrowserSession(BaseModel):
return target
return None
@observe_debug(ignore_input=True, ignore_output=True, name='get_current_page_url')
async def get_current_page_url(self) -> str:
"""Get the URL of the current page using CDP."""
target = await self.get_current_target_info()
@@ -1519,6 +1547,9 @@ class BrowserSession(BaseModel):
async def remove_highlights(self) -> None:
"""Remove highlights from the page using CDP."""
if not self.browser_profile.highlight_elements:
return
try:
# Get cached session
cdp_session = await self.get_or_create_cdp_session()

View File

@@ -126,17 +126,42 @@ class BrowserStateHistory:
class BrowserError(Exception):
"""Base class for all browser errors"""
"""Browser error with structured memory for LLM context management.
This exception class provides separate memory contexts for browser actions:
- short_term_memory: Immediate context shown once to the LLM for the next action
- long_term_memory: Persistent error information stored across steps
"""
message: str
short_term_memory: str | None = None
long_term_memory: str | None = None
details: dict[str, Any] | None = None
while_handling_event: BaseEvent[Any] | None = None
def __init__(self, message: str, details: dict[str, Any] | None = None, event: BaseEvent[Any] | None = None):
def __init__(
self,
message: str,
short_term_memory: str | None = None,
long_term_memory: str | None = None,
details: dict[str, Any] | None = None,
event: BaseEvent[Any] | None = None,
):
"""Initialize a BrowserError with structured memory contexts.
Args:
message: Technical error message for logging and debugging
short_term_memory: Context shown once to LLM (e.g., available actions, options)
long_term_memory: Persistent error info stored in agent memory
details: Additional metadata for debugging
event: The browser event that triggered this error
"""
self.message = message
super().__init__(message)
self.short_term_memory = short_term_memory
self.long_term_memory = long_term_memory
self.details = details
self.while_handling_event = event
super().__init__(message)
def __str__(self) -> str:
if self.details:

File diff suppressed because it is too large Load Diff

View File

@@ -16,6 +16,8 @@ from browser_use.dom.views import (
EnhancedDOMTreeNode,
SerializedDOMState,
)
from browser_use.observability import observe_debug
from browser_use.utils import time_execution_async
if TYPE_CHECKING:
from browser_use.browser.views import BrowserStateSummary, PageInfo
@@ -42,70 +44,7 @@ class DOMWatchdog(BaseWatchdog):
async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:
    """Inject the DOM-service init script into a newly created tab.

    The script runs before any page JS and:
      * spoofs ``navigator.permissions.query`` for the ``notifications``
        permission so it reports ``Notification.permission`` instead of prompting,
      * wraps ``EventTarget.prototype.addEventListener`` so listeners registered
        by page JS can later be enumerated via ``window.getEventListenersForNode``.

    The script is a no-op inside Chrome's PDF viewer (detected via the
    full-width ``embed[type="application/pdf"]``), and injection failures caused
    by the Page CDP domain not being attached yet are tolerated.

    Args:
        event: Tab-creation event (unused beyond triggering the injection).
    """
    # self.logger.debug('Setting up init scripts in browser')
    self.logger.debug('💉 Injecting DOM Service init script to track event listeners added to DOM elements by JS...')
    init_script = """
    // check to make sure we're not inside the PDF viewer
    window.isPdfViewer = !!document?.body?.querySelector('body > embed[type="application/pdf"][width="100%"]')
    if (!window.isPdfViewer) {

        // Permissions
        const originalQuery = window.navigator.permissions.query;
        window.navigator.permissions.query = (parameters) => (
            parameters.name === 'notifications' ?
                Promise.resolve({ state: Notification.permission }) :
                originalQuery(parameters)
        );
        (() => {
            if (window._eventListenerTrackerInitialized) return;
            window._eventListenerTrackerInitialized = true;
            const originalAddEventListener = EventTarget.prototype.addEventListener;
            const eventListenersMap = new WeakMap();
            EventTarget.prototype.addEventListener = function(type, listener, options) {
                if (typeof listener === "function") {
                    let listeners = eventListenersMap.get(this);
                    if (!listeners) {
                        listeners = [];
                        eventListenersMap.set(this, listeners);
                    }
                    listeners.push({
                        type,
                        listener,
                        listenerPreview: listener.toString().slice(0, 100),
                        options
                    });
                }
                return originalAddEventListener.call(this, type, listener, options);
            };
            window.getEventListenersForNode = (node) => {
                const listeners = eventListenersMap.get(node) || [];
                return listeners.map(({ type, listenerPreview, options }) => ({
                    type,
                    listenerPreview,
                    options
                }));
            };
        })();
    }
    """
    # Try to inject the script, but don't fail if the Page domain isn't ready yet
    # This can happen when a new tab is created and the CDP session isn't fully attached
    try:
        await self.browser_session._cdp_add_init_script(init_script)
    except Exception as e:
        if "'Page.addScriptToEvaluateOnNewDocument' wasn't found" in str(e):
            self.logger.debug(f'Page domain not ready for new tab, skipping init script injection: {e}')
            # The script will be injected when the page actually navigates
        else:
            # Re-raise other errors
            raise
    return None
def _get_recent_events_str(self, limit: int = 10) -> str | None:
"""Get the most recent events from the event bus as JSON.
@@ -164,10 +103,10 @@ class DOMWatchdog(BaseWatchdog):
self.logger.debug(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Got page URL: {page_url}')
if self.browser_session.agent_focus:
self.logger.debug(
f'📍 Current page URL: {page_url}, target_id: {self.browser_session.agent_focus.target_id}, session_id: {self.browser_session.agent_focus.session_id}'
f'Current page URL: {page_url}, target_id: {self.browser_session.agent_focus.target_id}, session_id: {self.browser_session.agent_focus.session_id}'
)
else:
self.logger.debug(f'📍 Current page URL: {page_url}, no cdp_session attached')
self.logger.debug(f'Current page URL: {page_url}, no cdp_session attached')
# check if we should skip DOM tree build for pointless pages
not_a_meaningful_website = page_url.lower().split(':', 1)[0] not in ('http', 'https')
@@ -243,65 +182,73 @@ class DOMWatchdog(BaseWatchdog):
recent_events=self._get_recent_events_str() if event.include_recent_events else None,
)
# Normal path: Build DOM tree if requested
if event.include_dom:
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🌳 Building DOM tree...')
# Execute DOM building and screenshot capture in parallel
dom_task = None
screenshot_task = None
# Start DOM building task if requested
if event.include_dom:
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🌳 Starting DOM tree build task...')
# Build the DOM directly using the internal method
previous_state = (
self.browser_session._cached_browser_state_summary.dom_state
if self.browser_session._cached_browser_state_summary
else None
)
dom_task = asyncio.create_task(self._build_dom_tree_without_highlights(previous_state))
# Start clean screenshot task if requested (without JS highlights)
if event.include_screenshot:
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 📸 Starting clean screenshot task...')
screenshot_task = asyncio.create_task(self._capture_clean_screenshot())
# Wait for both tasks to complete
content = None
screenshot_b64 = None
if dom_task:
try:
# Call the DOM building method directly
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: Starting _build_dom_tree...')
content = await self._build_dom_tree(previous_state)
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ _build_dom_tree completed')
content = await dom_task
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ DOM tree build completed')
except Exception as e:
self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: DOM build failed: {e}, using minimal state')
content = SerializedDOMState(_root=None, selector_map={})
if not content:
# Fallback to minimal DOM state
self.logger.warning('DOM build returned no content, using minimal state')
content = SerializedDOMState(_root=None, selector_map={})
else:
# Skip DOM building if not requested
content = SerializedDOMState(_root=None, selector_map={})
# re-focus top-level page session context
assert self.browser_session.agent_focus is not None, 'No current target ID'
await self.browser_session.get_or_create_cdp_session(target_id=self.browser_session.agent_focus.target_id, focus=True)
# Get screenshot if requested
screenshot_b64 = None
if event.include_screenshot:
self.logger.debug(
f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: 📸 DOM watchdog requesting screenshot, include_screenshot={event.include_screenshot}'
)
if screenshot_task:
try:
# Check if handler is registered
handlers = self.event_bus.handlers.get('ScreenshotEvent', [])
handler_names = [getattr(h, '__name__', str(h)) for h in handlers]
self.logger.debug(f'📸 ScreenshotEvent handlers registered: {len(handlers)} - {handler_names}')
screenshot_event = self.event_bus.dispatch(ScreenshotEvent(full_page=False))
self.logger.debug('📸 Dispatched ScreenshotEvent, waiting for event to complete...')
# Wait for the event itself to complete (this waits for all handlers)
await screenshot_event
# Get the single handler result
screenshot_b64 = await screenshot_event.event_result(raise_if_any=True, raise_if_none=True)
except TimeoutError:
self.logger.warning('📸 Screenshot timed out after 6 seconds - no handler registered or slow page?')
screenshot_b64 = await screenshot_task
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Clean screenshot captured')
except Exception as e:
self.logger.warning(f'📸 Screenshot failed: {type(e).__name__}: {e}')
else:
self.logger.debug(f'📸 Skipping screenshot, include_screenshot={event.include_screenshot}')
self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Clean screenshot failed: {e}')
screenshot_b64 = None
# Apply Python-based highlighting if both DOM and screenshot are available
if screenshot_b64 and content and content.selector_map and self.browser_session.browser_profile.highlight_elements:
try:
self.logger.debug('🔍 DOMWatchdog.on_BrowserStateRequestEvent: 🎨 Applying Python-based highlighting...')
from browser_use.browser.python_highlights import create_highlighted_screenshot_async
# Get CDP session for viewport info
cdp_session = await self.browser_session.get_or_create_cdp_session()
start = time.time()
screenshot_b64 = await create_highlighted_screenshot_async(
screenshot_b64,
content.selector_map,
cdp_session,
self.browser_session.browser_profile.filter_highlight_ids,
)
self.logger.debug(
f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: ✅ Applied highlights to {len(content.selector_map)} elements in {time.time() - start:.2f}s'
)
except Exception as e:
self.logger.warning(f'🔍 DOMWatchdog.on_BrowserStateRequestEvent: Python highlighting failed: {e}')
# Ensure we have valid content
if not content:
content = SerializedDOMState(_root=None, selector_map={})
# Tabs info already fetched at the beginning
@@ -452,7 +399,7 @@ class DOMWatchdog(BaseWatchdog):
self.logger.debug(f'🔍 DOMWatchdog._build_dom_tree: ✅ Selector maps updated, {len(self.selector_map)} elements')
# Inject highlighting for visual feedback if we have elements
if self.selector_map and self._dom_service:
if self.selector_map and self._dom_service and self.browser_session.browser_profile.highlight_elements:
try:
self.logger.debug('🔍 DOMWatchdog._build_dom_tree: Injecting highlighting script...')
from browser_use.dom.debug.highlights import inject_highlighting_script
@@ -463,6 +410,8 @@ class DOMWatchdog(BaseWatchdog):
)
except Exception as e:
self.logger.debug(f'🔍 DOMWatchdog._build_dom_tree: Failed to inject highlighting: {e}')
elif self.selector_map and self._dom_service and not self.browser_session.browser_profile.highlight_elements:
self.logger.debug('🔍 DOMWatchdog._build_dom_tree: Skipping highlighting injection - highlight_elements=False')
self.logger.debug('🔍 DOMWatchdog._build_dom_tree: ✅ COMPLETED DOM tree build')
return self.current_dom_state
@@ -477,6 +426,95 @@ class DOMWatchdog(BaseWatchdog):
)
raise
@time_execution_async('build_dom_tree_without_highlights')
@observe_debug(ignore_input=True, ignore_output=True, name='build_dom_tree_without_highlights')
async def _build_dom_tree_without_highlights(self, previous_state: SerializedDOMState | None = None) -> SerializedDOMState:
    """Build DOM tree without injecting JavaScript highlights (for parallel execution).

    Python-based highlighting is applied to the screenshot later, so this
    variant deliberately skips the JS highlight injection step.

    Args:
        previous_state: Previously cached serialized DOM state, passed through
            to the DOM service (presumably for diffing against the last
            snapshot — confirm against DomService.get_serialized_dom_tree).

    Returns:
        The freshly serialized DOM state; also cached on ``self.current_dom_state``.

    Raises:
        Any exception from the DOM service is re-raised after dispatching a
        ``BrowserErrorEvent`` with error_type ``'DOMBuildFailed'``.
    """
    try:
        self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: STARTING DOM tree build')
        # Create or reuse DOM service (lazily created once, reused across rebuilds)
        if self._dom_service is None:
            self._dom_service = DomService(
                browser_session=self.browser_session,
                logger=self.logger,
                cross_origin_iframes=self.browser_session.browser_profile.cross_origin_iframes,
            )
        # Get serialized DOM tree using the service
        self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: Calling DomService.get_serialized_dom_tree...')
        start = time.time()
        self.current_dom_state, self.enhanced_dom_tree, timing_info = await self._dom_service.get_serialized_dom_tree(
            previous_cached_state=previous_state,
        )
        end = time.time()
        self.logger.debug(
            '🔍 DOMWatchdog._build_dom_tree_without_highlights: ✅ DomService.get_serialized_dom_tree completed'
        )
        self.logger.debug(f'Time taken to get DOM tree: {end - start} seconds')
        self.logger.debug(f'Timing breakdown: {timing_info}')
        # Update selector map for other watchdogs
        self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: Updating selector maps...')
        self.selector_map = self.current_dom_state.selector_map
        # Update BrowserSession's cached selector map
        if self.browser_session:
            self.browser_session.update_cached_selector_map(self.selector_map)
        self.logger.debug(
            f'🔍 DOMWatchdog._build_dom_tree_without_highlights: ✅ Selector maps updated, {len(self.selector_map)} elements'
        )
        # Skip JavaScript highlighting injection - Python highlighting will be applied later
        self.logger.debug('🔍 DOMWatchdog._build_dom_tree_without_highlights: ✅ COMPLETED DOM tree build (no JS highlights)')
        return self.current_dom_state
    except Exception as e:
        self.logger.error(f'Failed to build DOM tree without highlights: {e}')
        # Surface the failure on the event bus before re-raising so other
        # watchdogs can react to the broken DOM build.
        self.event_bus.dispatch(
            BrowserErrorEvent(
                error_type='DOMBuildFailed',
                message=str(e),
            )
        )
        raise
@time_execution_async('capture_clean_screenshot')
@observe_debug(ignore_input=True, ignore_output=True, name='capture_clean_screenshot')
async def _capture_clean_screenshot(self) -> str:
    """Capture a clean screenshot without JavaScript highlights.

    Re-focuses the agent's current CDP target, then dispatches a
    ``ScreenshotEvent`` (viewport only, not full page) on the event bus and
    awaits the single handler's result.

    Returns:
        The screenshot as a base64-encoded string.

    Raises:
        AssertionError: If no agent focus / current target is set.
        RuntimeError: If the screenshot handler returned None.
        TimeoutError: Propagated when the handler does not respond in time.
        Exception: Any other handler failure is logged and re-raised.
    """
    try:
        self.logger.debug('🔍 DOMWatchdog._capture_clean_screenshot: Capturing clean screenshot...')
        # Ensure we have a focused CDP session
        assert self.browser_session.agent_focus is not None, 'No current target ID'
        await self.browser_session.get_or_create_cdp_session(target_id=self.browser_session.agent_focus.target_id, focus=True)
        # Check if handler is registered (debug aid: an empty list means the event would hang/timeout)
        handlers = self.event_bus.handlers.get('ScreenshotEvent', [])
        handler_names = [getattr(h, '__name__', str(h)) for h in handlers]
        self.logger.debug(f'📸 ScreenshotEvent handlers registered: {len(handlers)} - {handler_names}')
        screenshot_event = self.event_bus.dispatch(ScreenshotEvent(full_page=False))
        self.logger.debug('📸 Dispatched ScreenshotEvent, waiting for event to complete...')
        # Wait for the event itself to complete (this waits for all handlers)
        await screenshot_event
        # Get the single handler result
        screenshot_b64 = await screenshot_event.event_result(raise_if_any=True, raise_if_none=True)
        if screenshot_b64 is None:
            raise RuntimeError('Screenshot handler returned None')
        self.logger.debug('🔍 DOMWatchdog._capture_clean_screenshot: ✅ Clean screenshot captured successfully')
        return str(screenshot_b64)
    except TimeoutError:
        self.logger.warning('📸 Clean screenshot timed out after 6 seconds - no handler registered or slow page?')
        raise
    except Exception as e:
        self.logger.warning(f'📸 Clean screenshot failed: {type(e).__name__}: {e}')
        raise
async def _wait_for_stable_network(self):
"""Wait for page stability - simplified for CDP-only branch."""
start_time = time.time()
@@ -496,6 +534,7 @@ class DOMWatchdog(BaseWatchdog):
elapsed = time.time() - start_time
self.logger.debug(f'✅ Page stability wait completed in {elapsed:.2f}s')
@observe_debug(ignore_input=True, ignore_output=True, name='get_page_info')
async def _get_page_info(self) -> 'PageInfo':
"""Get comprehensive page information using a single CDP call.

View File

@@ -111,9 +111,10 @@ class DownloadsWatchdog(BaseWatchdog):
# Check if auto-download is enabled
auto_download_enabled = self._is_auto_download_enabled()
if not auto_download_enabled:
self.logger.debug('[DownloadsWatchdog] Skipping PDF check - auto-download disabled')
return
# Note: Using network-based PDF detection that doesn't require JavaScript
target_id = event.target_id
self.logger.debug(f'[DownloadsWatchdog] Got target_id={target_id} for tab #{event.target_id[-4:]}')
@@ -552,8 +553,9 @@ class DownloadsWatchdog(BaseWatchdog):
del self._active_downloads[download_id]
async def check_for_pdf_viewer(self, target_id: TargetID) -> bool:
"""Check if the current target is Chrome's built-in PDF viewer.
"""Check if the current target is a PDF using network-based detection.
This method avoids JavaScript execution that can crash WebSocket connections.
Returns True if a PDF is detected and should be downloaded.
"""
self.logger.debug(f'[DownloadsWatchdog] Checking if target {target_id} is PDF viewer...')
@@ -575,98 +577,115 @@ class DownloadsWatchdog(BaseWatchdog):
return cached_result
try:
# Create a temporary CDP session for this target without switching focus
import asyncio
temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
result = await asyncio.wait_for(
temp_session.cdp_client.send.Runtime.evaluate(
params={
'expression': """
(() => {
// Check for Chrome's built-in PDF viewer (both old and new selectors)
const pdfEmbed = document.querySelector('embed[type="application/x-google-chrome-pdf"]') ||
document.querySelector('embed[type="application/pdf"]');
if (pdfEmbed) {
// For Chrome PDF viewer, use window.location.href not embed.src (which is often about:blank)
return {
isPdf: true,
url: window.location.href,
isChromePdfViewer: true
};
}
// Check for direct PDF navigation
if (document.contentType === 'application/pdf') {
return {
isPdf: true,
url: window.location.href,
isDirectPdf: true
};
}
// Also check if the URL ends with .pdf or has PDF in it
const url = window.location.href;
const isPdfUrl = url.toLowerCase().includes('.pdf');
if (isPdfUrl) {
return {
isPdf: true,
url: url,
isPdfUrl: true
};
}
// Check for PDF in iframe
const iframes = document.querySelectorAll('iframe');
for (const iframe of iframes) {
try {
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
if (iframeDoc.contentType === 'application/pdf') {
return {
isPdf: true,
url: iframe.src,
isIframePdf: true
};
}
} catch (e) {
// Cross-origin iframe, skip
}
}
return { isPdf: false };
})()
""",
'returnByValue': True,
},
session_id=temp_session.session_id,
),
timeout=5.0, # 5 second timeout to prevent hanging
)
# No need to detach - session is cached
is_pdf_viewer = result.get('result', {}).get('value', {})
if is_pdf_viewer.get('isPdf', False):
self.logger.debug(
f'[DownloadsWatchdog] PDF detected: {is_pdf_viewer.get("url", "unknown")} '
f'(type: {"Chrome viewer" if is_pdf_viewer.get("isChromePdfViewer") else "direct PDF" if is_pdf_viewer.get("isDirectPdf") else "PDF URL" if is_pdf_viewer.get("isPdfUrl") else "iframe PDF"})'
)
# Method 1: Check URL patterns (fastest, most reliable)
url_is_pdf = self._check_url_for_pdf(page_url)
if url_is_pdf:
self.logger.debug(f'[DownloadsWatchdog] PDF detected via URL pattern: {page_url}')
self._pdf_viewer_cache[page_url] = True
return True
# Method 2: Check network response headers via CDP (safer than JavaScript)
header_is_pdf = await self._check_network_headers_for_pdf(target_id)
if header_is_pdf:
self.logger.debug(f'[DownloadsWatchdog] PDF detected via network headers: {page_url}')
self._pdf_viewer_cache[page_url] = True
return True
# Method 3: Check Chrome's PDF viewer specific URLs
chrome_pdf_viewer = self._is_chrome_pdf_viewer_url(page_url)
if chrome_pdf_viewer:
self.logger.debug(f'[DownloadsWatchdog] Chrome PDF viewer detected: {page_url}')
self._pdf_viewer_cache[page_url] = True
return True
# Not a PDF
self._pdf_viewer_cache[page_url] = False
return False
except TimeoutError:
self.logger.warning(f'[DownloadsWatchdog] ❌ PDF check timed out for target: {page_url}')
self._pdf_viewer_cache[page_url] = False
return False
except Exception as e:
self.logger.warning(f'[DownloadsWatchdog] ❌ Error checking for PDF viewer: {e}')
self._pdf_viewer_cache[page_url] = False
return False
def _check_url_for_pdf(self, url: str) -> bool:
"""Check if URL indicates a PDF file."""
if not url:
return False
url_lower = url.lower()
# Direct PDF file extensions
if url_lower.endswith('.pdf'):
return True
# PDF in path
if '.pdf' in url_lower:
return True
# PDF MIME type in URL parameters
if any(
param in url_lower
for param in [
'content-type=application/pdf',
'content-type=application%2fpdf',
'mimetype=application/pdf',
'type=application/pdf',
]
):
return True
return False
def _is_chrome_pdf_viewer_url(self, url: str) -> bool:
"""Check if this is Chrome's internal PDF viewer URL."""
if not url:
return False
url_lower = url.lower()
# Chrome PDF viewer uses chrome-extension:// URLs
if 'chrome-extension://' in url_lower and 'pdf' in url_lower:
return True
# Chrome PDF viewer internal URLs
if url_lower.startswith('chrome://') and 'pdf' in url_lower:
return True
return False
async def _check_network_headers_for_pdf(self, target_id: TargetID) -> bool:
    """Infer PDF via navigation history/URL; headers are not available post-navigation in this context.

    Args:
        target_id: CDP target to inspect. A non-focusing session is requested
            so the agent's currently focused tab is not disturbed.

    Returns:
        True when the current navigation-history entry's URL looks like a PDF
        (per ``self._check_url_for_pdf``); False otherwise, including on any
        CDP error or timeout — this is a best-effort check.
    """
    try:
        import asyncio

        # Get CDP session
        temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
        # Get navigation history to find the main resource (3s cap so a stuck
        # target cannot hang the PDF check)
        history = await asyncio.wait_for(
            temp_session.cdp_client.send.Page.getNavigationHistory(session_id=temp_session.session_id), timeout=3.0
        )
        current_entry = history.get('entries', [])
        if current_entry:
            current_index = history.get('currentIndex', 0)
            # Guard against an out-of-range currentIndex in the CDP response.
            if 0 <= current_index < len(current_entry):
                current_url = current_entry[current_index].get('url', '')
                # Check if the URL itself suggests PDF
                if self._check_url_for_pdf(current_url):
                    return True
        # Note: CDP doesn't easily expose response headers for completed navigations
        # For more complex cases, we'd need to set up Network.responseReceived listeners
        # before navigation, but that's overkill for most PDF detection cases
        return False
    except Exception as e:
        # Best-effort: any failure is logged at debug level and treated as "not a PDF".
        self.logger.debug(f'[DownloadsWatchdog] Network headers check failed (non-critical): {e}')
        return False
async def trigger_pdf_download(self, target_id: TargetID) -> str | None:
"""Trigger download of a PDF from Chrome's PDF viewer.

View File

@@ -100,7 +100,7 @@ async def inject_highlighting_script(dom_service: DomService, interactive_elemen
# Convert DOMSelectorMap to the format expected by the JavaScript
converted_elements = convert_dom_selector_map_to_highlight_format(interactive_elements)
logger.debug(f'📍 Creating CSP-safe highlighting for {len(converted_elements)} elements')
logger.debug(f'Creating CSP-safe highlighting for {len(converted_elements)} elements')
# ALWAYS remove any existing highlights first to prevent double-highlighting
await remove_highlighting_script(dom_service)

View File

@@ -20,7 +20,7 @@ class ClickableElementDetector:
# IFRAME elements should be interactive if they're large enough to potentially need scrolling
# Small iframes (< 100px width or height) are unlikely to have scrollable content
if node.tag_name and node.tag_name.upper() == 'IFRAME':
if node.tag_name and node.tag_name.upper() == 'IFRAME' or node.tag_name.upper() == 'FRAME':
if node.snapshot_node and node.snapshot_node.bounds:
width = node.snapshot_node.bounds.width
height = node.snapshot_node.bounds.height
@@ -94,14 +94,14 @@ class ClickableElementDetector:
# Skip properties we can't process
continue
# ENHANCED TAG CHECK: Include truly interactive elements
# ENHANCED TAG CHECK: Include truly interactive elements
# Note: 'label' removed - labels are handled by other attribute checks below - otherwise labels with a "for" attribute can destroy the real clickable element on apartments.com
interactive_tags = {
'button',
'input',
'select',
'textarea',
'a',
'label',
'details',
'summary',
'option',

View File

@@ -143,10 +143,10 @@ class DOMTreeSerializer:
if node.node_name.lower() in DISABLED_ELEMENTS:
return None
if node.node_name == 'IFRAME':
if node.node_name == 'IFRAME' or node.node_name == 'FRAME':
if node.content_document:
simplified = SimplifiedNode(original_node=node, children=[])
for child in node.content_document.children:
for child in node.content_document.children_nodes or []:
simplified_child = self._create_simplified_tree(child)
if simplified_child:
simplified.children.append(simplified_child)
@@ -159,7 +159,7 @@ class DOMTreeSerializer:
is_scrollable = node.is_actually_scrollable
# Include if interactive (regardless of visibility), or scrollable, or has children to process
should_include = (is_interactive and is_visible) or is_scrollable or node.children_and_shadow_roots
should_include = (is_interactive and is_visible) or is_scrollable or bool(node.children_and_shadow_roots)
if should_include:
simplified = SimplifiedNode(original_node=node, children=[])
@@ -435,7 +435,12 @@ class DOMTreeSerializer:
# Add element with interactive_index if clickable, scrollable, or iframe
is_any_scrollable = node.original_node.is_actually_scrollable or node.original_node.is_scrollable
should_show_scroll = node.original_node.should_show_scroll_info
if node.interactive_index is not None or is_any_scrollable or node.original_node.tag_name.upper() == 'IFRAME':
if (
node.interactive_index is not None
or is_any_scrollable
or node.original_node.tag_name.upper() == 'IFRAME'
or node.original_node.tag_name.upper() == 'FRAME'
):
next_depth += 1
# Build attributes string
@@ -453,6 +458,9 @@ class DOMTreeSerializer:
elif node.original_node.tag_name.upper() == 'IFRAME':
# Iframe element (not interactive)
line = f'{depth_str}|IFRAME|<{node.original_node.tag_name}'
elif node.original_node.tag_name.upper() == 'FRAME':
# Frame element (not interactive)
line = f'{depth_str}|FRAME|<{node.original_node.tag_name}'
else:
line = f'{depth_str}<{node.original_node.tag_name}'

View File

@@ -187,7 +187,7 @@ class DomService:
for frame in reversed(html_frames):
if (
frame.node_type == NodeType.ELEMENT_NODE
and frame.node_name.upper() == 'IFRAME'
and (frame.node_name.upper() == 'IFRAME' or frame.node_name.upper() == 'FRAME')
and frame.snapshot_node
and frame.snapshot_node.bounds
):
@@ -561,7 +561,11 @@ class DomService:
)
# Calculate new iframe offset for content documents, accounting for iframe scroll
if node['nodeName'].upper() == 'IFRAME' and snapshot_data and snapshot_data.bounds:
if (
(node['nodeName'].upper() == 'IFRAME' or node['nodeName'].upper() == 'FRAME')
and snapshot_data
and snapshot_data.bounds
):
if snapshot_data.bounds:
updated_html_frames.append(dom_tree_node)

View File

@@ -12,6 +12,7 @@ from cdp_use.cdp.target.types import SessionID, TargetID, TargetInfo
from uuid_extensions import uuid7str
from browser_use.dom.utils import cap_text_length
from browser_use.observability import observe_debug
# Serializer types
DEFAULT_INCLUDE_ATTRIBUTES = [
@@ -91,14 +92,28 @@ class SimplifiedNode:
is_new: bool = False
excluded_by_parent: bool = False # New field for bbox filtering
def _clean_original_node_json(self, node_json: dict) -> dict:
"""Recursively remove children_nodes and shadow_roots from original_node JSON."""
# Remove the fields we don't want in SimplifiedNode serialization
if 'children_nodes' in node_json:
del node_json['children_nodes']
if 'shadow_roots' in node_json:
del node_json['shadow_roots']
# Clean nested content_document if it exists
if node_json.get('content_document'):
node_json['content_document'] = self._clean_original_node_json(node_json['content_document'])
return node_json
def __json__(self) -> dict:
original_node_json = self.original_node.__json__()
del original_node_json['children_nodes']
del original_node_json['shadow_roots']
# Remove children_nodes and shadow_roots to avoid duplication with SimplifiedNode.children
cleaned_original_node_json = self._clean_original_node_json(original_node_json)
return {
'should_display': self.should_display,
'interactive_index': self.interactive_index,
'original_node': original_node_json,
'original_node': cleaned_original_node_json,
'children': [c.__json__() for c in self.children],
}
@@ -412,6 +427,25 @@ class EnhancedDOMTreeNode:
return f'<{self.tag_name}>{cap_text_length(self.get_all_children_text(), max_text_length) or ""}'
def get_meaningful_text_for_llm(self) -> str:
    """Return the text the LLM actually sees for this element.

    Matches the DOMTreeSerializer output exactly: the first non-empty
    attribute in priority order wins; otherwise the element's aggregated
    child text is used. The result is whitespace-stripped.
    """
    chosen = ''
    attributes = getattr(self, 'attributes', None)
    if attributes:
        # Priority order: value, aria-label, title, placeholder, alt, text content
        for name in ('value', 'aria-label', 'title', 'placeholder', 'alt'):
            candidate = attributes.get(name)
            if candidate:
                chosen = candidate
                break
    # Fallback to text content if no meaningful attributes
    if not chosen:
        chosen = self.get_all_children_text()
    return chosen.strip()
@property
def is_actually_scrollable(self) -> bool:
"""
@@ -677,6 +711,7 @@ class SerializedDOMState:
selector_map: DOMSelectorMap
@observe_debug(ignore_input=True, ignore_output=True, name='llm_representation')
def llm_representation(
self,
include_attributes: list[str] | None = None,

View File

@@ -379,6 +379,8 @@ class Registry(Generic[Context]):
raise RuntimeError(str(e)) from e
else:
raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e
except TimeoutError as e:
raise RuntimeError(f'Error executing action {action_name} due to timeout.') from e
except Exception as e:
raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e

View File

@@ -65,26 +65,19 @@ Context = TypeVar('Context')
T = TypeVar('T', bound=BaseModel)
def extract_llm_error_message(error: Exception) -> str:
    """Pull the LLM-facing message out of an exception.

    Exceptions raised deeper in the stack may wrap their user-facing message
    in ``<llm_error_msg>...</llm_error_msg>`` tags; when such tags are found,
    only the inner text (stripped) is returned. Otherwise the exception's
    full string form is returned unchanged.
    """
    import re

    raw_text = str(error)
    # DOTALL so the tagged message may span multiple lines.
    tagged = re.search(r'<llm_error_msg>(.*?)</llm_error_msg>', raw_text, re.DOTALL)
    return tagged.group(1).strip() if tagged else raw_text
def handle_browser_error(e: BrowserError) -> ActionResult:
    """Convert a structured BrowserError into an ActionResult for the agent.

    ``long_term_memory`` becomes the persistent error text; when
    ``short_term_memory`` is also set, it is surfaced once as extracted
    content. A BrowserError lacking ``long_term_memory`` is considered
    malformed and re-raised after a warning.
    """
    # Guard clause: without long_term_memory we cannot build a useful result.
    if e.long_term_memory is None:
        logger.warning(
            '⚠️ A BrowserError was raised without long_term_memory - always set long_term_memory when raising BrowserError to propagate right messages to LLM.'
        )
        raise e
    if e.short_term_memory is None:
        return ActionResult(error=e.long_term_memory)
    return ActionResult(
        extracted_content=e.short_term_memory, error=e.long_term_memory, include_extracted_content_only_once=True
    )
class Tools(Generic[Context]):
@@ -177,11 +170,10 @@ class Tools(Generic[Context]):
memory = f"Searched Google for '{params.query}'"
msg = f'🔍 {memory}'
logger.info(msg)
return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory)
return ActionResult(extracted_content=memory, long_term_memory=memory)
except Exception as e:
logger.error(f'Failed to search Google: {e}')
clean_msg = extract_llm_error_message(e)
return ActionResult(error=f'Failed to search Google for "{params.query}": {clean_msg}')
return ActionResult(error=f'Failed to search Google for "{params.query}": {str(e)}')
@self.registry.action(
'Navigate to URL, set new_tab=True to open in new tab, False to navigate in current tab', param_model=GoToUrlAction
@@ -201,12 +193,11 @@ class Tools(Generic[Context]):
msg = f'🔗 {memory}'
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=memory)
return ActionResult(extracted_content=msg, long_term_memory=memory)
except Exception as e:
error_msg = str(e)
# Always log the actual error first for debugging
browser_session.logger.error(f'❌ Navigation failed: {error_msg}')
clean_msg = extract_llm_error_message(e)
# Check if it's specifically a RuntimeError about CDP client
if isinstance(e, RuntimeError) and 'CDP client not initialized' in error_msg:
@@ -223,12 +214,12 @@ class Tools(Generic[Context]):
'net::',
]
):
site_unavailable_msg = f'Site unavailable: {params.url} - {error_msg}'
browser_session.logger.warning(f'⚠️ {site_unavailable_msg}')
site_unavailable_msg = f'Navigation failed - site unavailable: {params.url}'
browser_session.logger.warning(f'⚠️ {site_unavailable_msg} - {error_msg}')
return ActionResult(error=site_unavailable_msg)
else:
# Return error in ActionResult instead of re-raising
return ActionResult(error=f'Navigation failed: {clean_msg}')
return ActionResult(error=f'Navigation failed: {str(e)}')
@self.registry.action('Go back', param_model=NoParamsAction)
async def go_back(_: NoParamsAction, browser_session: BrowserSession):
@@ -241,8 +232,7 @@ class Tools(Generic[Context]):
return ActionResult(extracted_content=memory)
except Exception as e:
logger.error(f'Failed to dispatch GoBackEvent: {type(e).__name__}: {e}')
clean_msg = extract_llm_error_message(e)
error_msg = f'Failed to go back: {clean_msg}'
error_msg = f'Failed to go back: {str(e)}'
return ActionResult(error=error_msg)
@self.registry.action(
@@ -285,23 +275,18 @@ class Tools(Generic[Context]):
# Wait for handler to complete and get any exception or metadata
click_metadata = await event.event_result(raise_if_any=True, raise_if_none=False)
memory = f'Clicked element with index {params.index}'
if params.while_holding_ctrl:
memory += ' and opened in new tab'
msg = f'🖱️ {memory}'
logger.info(msg)
# Include click coordinates in metadata if available
return ActionResult(
extracted_content=memory,
include_in_memory=True,
long_term_memory=memory,
metadata=click_metadata if isinstance(click_metadata, dict) else None,
)
except Exception as e:
logger.error(f'Failed to execute ClickElementEvent: {type(e).__name__}: {e}')
clean_msg = extract_llm_error_message(e)
error_msg = f'Failed to click element {params.index}: {clean_msg}'
# If it's a select dropdown error, automatically get the dropdown options
if 'dropdown' in str(e) and node:
except BrowserError as e:
if 'Cannot click on <select> elements.' in str(e):
try:
return await get_dropdown_options(
params=GetDropdownOptionsAction(index=params.index), browser_session=browser_session
@@ -311,6 +296,9 @@ class Tools(Generic[Context]):
f'Failed to get dropdown options as shortcut during click_element_by_index on dropdown: {type(dropdown_error).__name__}: {dropdown_error}'
)
return handle_browser_error(e)
except Exception as e:
error_msg = f'Failed to click element {params.index}: {str(e)}'
return ActionResult(error=error_msg)
@self.registry.action(
@@ -336,10 +324,11 @@ class Tools(Generic[Context]):
# Include input coordinates in metadata if available
return ActionResult(
extracted_content=msg,
include_in_memory=True,
long_term_memory=f"Input '{params.text}' into element {params.index}.",
metadata=input_metadata if isinstance(input_metadata, dict) else None,
)
except BrowserError as e:
return handle_browser_error(e)
except Exception as e:
# Log the full error for debugging
logger.error(f'Failed to dispatch TypeTextEvent: {type(e).__name__}: {e}')
@@ -370,27 +359,28 @@ class Tools(Generic[Context]):
if not browser_session.is_local:
pass
else:
raise BrowserError(
f'File path {params.path} is not available. Must be in available_file_paths, downloaded_files, or a file managed by file_system.'
)
msg = f'File path {params.path} is not available. Upload files must be in available_file_paths, downloaded_files, or a file managed by file_system.'
logger.error(f'{msg}')
return ActionResult(error=msg)
else:
# If browser is remote, allow passing a remote-accessible absolute path
if not browser_session.is_local:
pass
else:
raise BrowserError(
f'File path {params.path} is not available. Must be in available_file_paths or downloaded_files.'
)
msg = f'File path {params.path} is not available. Upload files must be in available_file_paths, downloaded_files, or a file managed by file_system.'
raise BrowserError(message=msg, long_term_memory=msg)
# For local browsers, ensure the file exists on the local filesystem
if browser_session.is_local:
if not os.path.exists(params.path):
raise BrowserError(f'File {params.path} does not exist')
msg = f'File {params.path} does not exist'
return ActionResult(error=msg)
# Get the selector map to find the node
selector_map = await browser_session.get_selector_map()
if params.index not in selector_map:
raise BrowserError(f'Element with index {params.index} not found in selector map')
msg = f'Element with index {params.index} does not exist.'
return ActionResult(error=msg)
node = selector_map[params.index]
@@ -486,7 +476,6 @@ class Tools(Generic[Context]):
logger.info(f'📁 {msg}')
return ActionResult(
extracted_content=msg,
include_in_memory=True,
long_term_memory=f'Uploaded file {params.path} to element {params.index}',
)
except Exception as e:
@@ -499,12 +488,7 @@ class Tools(Generic[Context]):
async def switch_tab(params: SwitchTabAction, browser_session: BrowserSession):
# Dispatch switch tab event
try:
if params.tab_id:
target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
elif params.url:
target_id = await browser_session.get_target_id_from_url(params.url)
else:
target_id = await browser_session.get_most_recently_opened_target_id()
target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
event = browser_session.event_bus.dispatch(SwitchTabEvent(target_id=target_id))
await event
@@ -512,11 +496,10 @@ class Tools(Generic[Context]):
assert new_target_id, 'SwitchTabEvent did not return a TargetID for the new tab that was switched to'
memory = f'Switched to Tab with ID {new_target_id[-4:]}'
logger.info(f'🔄 {memory}')
return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory)
return ActionResult(extracted_content=memory, long_term_memory=memory)
except Exception as e:
logger.error(f'Failed to switch tab: {type(e).__name__}: {e}')
clean_msg = extract_llm_error_message(e)
return ActionResult(error=f'Failed to switch to tab {params.tab_id or params.url}: {clean_msg}')
return ActionResult(error=f'Failed to switch to tab {params.tab_id}.')
@self.registry.action('Close an existing tab', param_model=CloseTabAction)
async def close_tab(params: CloseTabAction, browser_session: BrowserSession):
@@ -535,13 +518,11 @@ class Tools(Generic[Context]):
logger.info(f'🗑️ {memory}')
return ActionResult(
extracted_content=memory,
include_in_memory=True,
long_term_memory=memory,
)
except Exception as e:
logger.error(f'Failed to close tab: {e}')
clean_msg = extract_llm_error_message(e)
return ActionResult(error=f'Failed to close tab {params.tab_id}: {clean_msg}')
return ActionResult(error=f'Failed to close tab {params.tab_id}.')
# Content Actions
@@ -697,11 +678,10 @@ Provide the extracted information in a clear, structured format."""
msg = f'🔍 {long_term_memory}'
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=long_term_memory)
return ActionResult(extracted_content=msg, long_term_memory=long_term_memory)
except Exception as e:
logger.error(f'Failed to dispatch ScrollEvent: {type(e).__name__}: {e}')
clean_msg = extract_llm_error_message(e)
error_msg = f'Failed to scroll: {clean_msg}'
error_msg = 'Failed to execute scroll action.'
return ActionResult(error=error_msg)
@self.registry.action(
@@ -717,11 +697,10 @@ Provide the extracted information in a clear, structured format."""
memory = f'Sent keys: {params.keys}'
msg = f'⌨️ {memory}'
logger.info(msg)
return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory)
return ActionResult(extracted_content=memory, long_term_memory=memory)
except Exception as e:
logger.error(f'Failed to dispatch SendKeysEvent: {type(e).__name__}: {e}')
clean_msg = extract_llm_error_message(e)
error_msg = f'Failed to send keys: {clean_msg}'
error_msg = f'Failed to send keys: {str(e)}'
return ActionResult(error=error_msg)
@self.registry.action(
@@ -737,14 +716,13 @@ Provide the extracted information in a clear, structured format."""
memory = f'Scrolled to text: {text}'
msg = f'🔍 {memory}'
logger.info(msg)
return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory)
return ActionResult(extracted_content=memory, long_term_memory=memory)
except Exception as e:
# Text not found
msg = f"Text '{text}' not found or not visible on page"
logger.info(msg)
return ActionResult(
extracted_content=msg,
include_in_memory=True,
long_term_memory=f"Tried scrolling to text '{text}' but it was not found",
)
@@ -762,7 +740,6 @@ Provide the extracted information in a clear, structured format."""
raise ValueError(f'Element index {params.index} not found in DOM')
# Dispatch GetDropdownOptionsEvent to the event handler
import json
event = browser_session.event_bus.dispatch(GetDropdownOptionsEvent(node=node))
dropdown_data = await event.event_result(timeout=3.0, raise_if_none=True, raise_if_any=True)
@@ -770,14 +747,10 @@ Provide the extracted information in a clear, structured format."""
if not dropdown_data:
raise ValueError('Failed to get dropdown options - no data returned')
# Extract the message from the returned data
msg = dropdown_data.get('message', '')
options_count = len(json.loads(dropdown_data.get('options', '[]'))) # Parse the string back to list to get count
# Use structured memory from the handler
return ActionResult(
extracted_content=msg,
include_in_memory=True,
long_term_memory=f'Found {options_count} dropdown options for index {params.index}',
extracted_content=dropdown_data['short_term_memory'],
long_term_memory=dropdown_data['long_term_memory'],
include_extracted_content_only_once=True,
)
@@ -801,14 +774,28 @@ Provide the extracted information in a clear, structured format."""
if not selection_data:
raise ValueError('Failed to select dropdown option - no data returned')
# Extract the message from the returned data
msg = selection_data.get('message', f'Selected option: {params.text}')
return ActionResult(
extracted_content=msg,
include_in_memory=True,
long_term_memory=f"Selected dropdown option '{params.text}' at index {params.index}",
)
# Check if the selection was successful
if selection_data.get('success') == 'true':
# Extract the message from the returned data
msg = selection_data.get('message', f'Selected option: {params.text}')
return ActionResult(
extracted_content=msg,
include_in_memory=True,
long_term_memory=f"Selected dropdown option '{params.text}' at index {params.index}",
)
else:
# Handle structured error response
# TODO: raise BrowserError instead of returning ActionResult
if 'short_term_memory' in selection_data and 'long_term_memory' in selection_data:
return ActionResult(
extracted_content=selection_data['short_term_memory'],
long_term_memory=selection_data['long_term_memory'],
include_extracted_content_only_once=True,
)
else:
# Fallback to regular error
error_msg = selection_data.get('error', f'Failed to select option: {params.text}')
return ActionResult(error=error_msg)
# File System Actions
@self.registry.action(
@@ -831,7 +818,7 @@ Provide the extracted information in a clear, structured format."""
else:
result = await file_system.write_file(file_name, content)
logger.info(f'💾 {result}')
return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result)
return ActionResult(extracted_content=result, long_term_memory=result)
@self.registry.action(
'Replace old_str with new_str in file_name. old_str must exactly match the string to replace in original text. Recommended tool to mark completed items in todo.md or change specific contents in a file.'
@@ -839,7 +826,7 @@ Provide the extracted information in a clear, structured format."""
async def replace_file_str(file_name: str, old_str: str, new_str: str, file_system: FileSystem):
result = await file_system.replace_file_str(file_name, old_str, new_str)
logger.info(f'💾 {result}')
return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result)
return ActionResult(extracted_content=result, long_term_memory=result)
@self.registry.action('Read file_name from file system')
async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem):
@@ -866,7 +853,6 @@ Provide the extracted information in a clear, structured format."""
logger.info(f'💾 {memory}')
return ActionResult(
extracted_content=result,
include_in_memory=True,
long_term_memory=memory,
include_extracted_content_only_once=True,
)
@@ -1001,12 +987,16 @@ Provide the extracted information in a clear, structured format."""
sensitive_data=sensitive_data,
available_file_paths=available_file_paths,
)
except BrowserError as e:
logger.error(f'❌ Action {action_name} failed with BrowserError: {str(e)}')
result = handle_browser_error(e)
except TimeoutError as e:
logger.error(f'❌ Action {action_name} failed with TimeoutError: {str(e)}')
result = ActionResult(error=f'{action_name} was not executed due to timeout.')
except Exception as e:
# Log the original exception with traceback for observability
logger.error(f"Action '{action_name}' failed")
# Extract clean error message from llm_error_msg tags if present
clean_msg = extract_llm_error_message(e)
result = ActionResult(error=clean_msg)
logger.error(f"Action '{action_name}' failed with error: {str(e)}")
result = ActionResult(error=str(e))
if Laminar is not None:
Laminar.set_span_output(result)

View File

@@ -43,15 +43,10 @@ class StructuredOutputAction(BaseModel, Generic[T]):
class SwitchTabAction(BaseModel):
url: str | None = Field(
default=None,
description='URL or URL substring of the tab to switch to, if not provided, the tab_id or most recently opened tab will be used',
)
tab_id: str | None = Field(
default=None,
tab_id: str = Field(
min_length=4,
max_length=4,
description='exact 4 character Tab ID to match instead of URL, prefer using this if known',
description='Last 4 chars of TargetID',
) # last 4 chars of TargetID

View File

@@ -129,31 +129,6 @@
{
"tab": "Cloud",
"versions": [
{
"version": "v2",
"groups": [
{
"group": "Get Started",
"pages": [
"cloud/v2/quickstart",
"cloud/v2/python-quickstart",
"cloud/v2/node-quickstart"
]
},
{
"group": "Platform",
"pages": [
"cloud/v1/pricing",
"cloud/v1/n8n-browser-use-integration",
"cloud/v1/search"
]
},
{
"group": "REST API reference",
"openapi": "https://app.stainless.com/api/spec/documented/browser-use/openapi.documented.yml"
}
]
},
{
"version": "v1",
"groups": [
@@ -180,6 +155,27 @@
"openapi": "https://api.browser-use.com/api/v1/openapi.json"
}
]
},
{
"version": "v2",
"groups": [
{
"group": "Get Started",
"pages": [
"cloud/v2/quickstart",
"cloud/v2/python-quickstart",
"cloud/v2/node-quickstart"
]
},
{
"group": "Platform",
"pages": [
"cloud/v1/pricing",
"cloud/v1/n8n-browser-use-integration",
"cloud/v1/search"
]
}
]
}
]
}

View File

@@ -14,7 +14,7 @@ dependencies = [
"aiofiles>=24.1.0",
"aiohttp==3.12.15",
"anyio>=4.9.0",
"bubus>=1.5.4",
"bubus>=1.5.6",
"google-api-core>=2.25.0",
"httpx>=0.28.1",
"markdownify==1.1.0",

View File

@@ -17,11 +17,7 @@ import aiofiles
import yaml
from pydantic import BaseModel
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.browser.profile import BrowserProfile
from browser_use.browser.session import BrowserSession
from browser_use.llm import ChatOpenAI
from browser_use import Agent, AgentHistoryList, BrowserProfile, BrowserSession, ChatOpenAI
from browser_use.llm.messages import UserMessage
# --- CONFIG ---

View File

@@ -185,11 +185,11 @@ class TestClickElementEvent:
# Verify the result structure
assert isinstance(result, ActionResult), 'Result should be an ActionResult instance'
assert result.error is None, f'Expected no error but got: {result.error}'
result_text = result.extracted_content or result.long_term_memory
# Core logic validation: Verify click was successful
assert result.extracted_content is not None
assert f'Clicked element with index {button_index}' in result.extracted_content, (
f'Expected click confirmation in result content, got: {result.extracted_content}'
assert result_text is not None
assert f'Clicked element with index {button_index}' in result_text, (
f'Expected click confirmation in result content, got: {result_text}'
)
# Note: The click action doesn't include button text in the result, only the index
@@ -260,7 +260,11 @@ class TestClickElementEvent:
# Verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
result_text = result.extracted_content or result.long_term_memory
assert result_text is not None
assert f'Clicked element with index {link_index}' in result_text, (
f'Expected click confirmation in result content, got: {result_text}'
)
# Verify that a new tab was opened
tabs = await browser_session.get_tabs()

View File

@@ -104,7 +104,6 @@ class TestScrollActions:
assert result.extracted_content is not None
assert 'Scrolled down' in result.extracted_content
assert 'the page' in result.extracted_content
assert result.include_in_memory is True
# Test 2: Basic page scroll up
scroll_up_action = {'scroll': ScrollAction(down=False, num_pages=0.5)}
@@ -123,7 +122,7 @@ class TestScrollActions:
# This should fail with error about element not found
assert isinstance(result, ActionResult)
assert result.error is not None, 'Expected error for invalid element index'
assert 'Element index 999 not found' in result.error or 'Failed to scroll' in result.error
assert 'Element index 999 not found' in result.error or 'Failed to execute scroll' in result.error
# Test 4: Model parameter validation
scroll_with_index = ScrollAction(down=True, num_pages=1.0, frame_element_index=5)

View File

@@ -394,10 +394,10 @@ class TestScreenshotEventSystem:
# Test the NEW event-driven path: direct event dispatching
event = browser_session.event_bus.dispatch(ScreenshotEvent(full_page=False))
screenshot_result = (await event.event_result()) or {}
assert screenshot_result.get('screenshot')
assert isinstance(screenshot_result['screenshot'], str)
assert len(base64.b64decode(screenshot_result['screenshot'])) > 5000
screenshot_b64 = await event.event_result()
assert screenshot_b64 is not None
assert isinstance(screenshot_b64, str)
assert len(base64.b64decode(screenshot_b64)) > 5000
finally:
await browser_session.kill()