mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
Merge branch 'main' into local-remote-split
This commit is contained in:
@@ -37,7 +37,7 @@ class UpdateAgentTaskEvent(BaseEvent):
|
||||
if not hasattr(agent, '_task_start_time'):
|
||||
raise ValueError('Agent must have _task_start_time attribute')
|
||||
|
||||
done_output = agent.state.history.final_result() if agent.state.history else None
|
||||
done_output = agent.history.final_result() if agent.history else None
|
||||
return cls(
|
||||
id=str(agent.task_id),
|
||||
user_id='', # To be filled by cloud handler
|
||||
@@ -47,7 +47,7 @@ class UpdateAgentTaskEvent(BaseEvent):
|
||||
stopped=agent.state.stopped if hasattr(agent.state, 'stopped') else False,
|
||||
paused=agent.state.paused if hasattr(agent.state, 'paused') else False,
|
||||
done_output=done_output,
|
||||
finished_at=datetime.now(timezone.utc) if agent.state.history and agent.state.history.is_done() else None,
|
||||
finished_at=datetime.now(timezone.utc) if agent.history and agent.history.is_done() else None,
|
||||
agent_state=agent.state.model_dump() if hasattr(agent.state, 'model_dump') else {},
|
||||
user_feedback_type=None,
|
||||
user_comment=None,
|
||||
|
||||
@@ -61,28 +61,22 @@ def create_history_gif(
|
||||
logger.warning('No history to create GIF from')
|
||||
return
|
||||
|
||||
# Get all screenshots from history (including None placeholders)
|
||||
screenshots = history.screenshots(return_none_if_not_screenshot=True)
|
||||
|
||||
if not screenshots:
|
||||
logger.warning('No screenshots found in history')
|
||||
return
|
||||
|
||||
# Find the first non-placeholder screenshot
|
||||
# A screenshot is considered a placeholder if:
|
||||
# 1. It's the exact 4px placeholder for about:blank pages, OR
|
||||
# 2. It comes from a new tab page (chrome://newtab/, about:blank, etc.)
|
||||
first_real_screenshot = None
|
||||
for item in history.history:
|
||||
if not item.state.screenshot:
|
||||
continue
|
||||
|
||||
# Skip exact placeholder screenshots
|
||||
if item.state.screenshot == PLACEHOLDER_4PX_SCREENSHOT:
|
||||
continue
|
||||
|
||||
# Skip screenshots from new tab pages
|
||||
from browser_use.utils import is_new_tab_page
|
||||
|
||||
if is_new_tab_page(item.state.url):
|
||||
continue
|
||||
|
||||
# This is a real screenshot from actual web content
|
||||
first_real_screenshot = item.state.screenshot
|
||||
break
|
||||
for screenshot in screenshots:
|
||||
if screenshot and screenshot != PLACEHOLDER_4PX_SCREENSHOT:
|
||||
first_real_screenshot = screenshot
|
||||
break
|
||||
|
||||
if not first_real_screenshot:
|
||||
logger.warning('No valid screenshots found (all are placeholders or from new tab pages)')
|
||||
@@ -145,8 +139,9 @@ def create_history_gif(
|
||||
# Find the first non-placeholder screenshot for the task frame
|
||||
first_real_screenshot = None
|
||||
for item in history.history:
|
||||
if item.state.screenshot and item.state.screenshot != PLACEHOLDER_4PX_SCREENSHOT:
|
||||
first_real_screenshot = item.state.screenshot
|
||||
screenshot_b64 = item.state.get_screenshot()
|
||||
if screenshot_b64 and screenshot_b64 != PLACEHOLDER_4PX_SCREENSHOT:
|
||||
first_real_screenshot = screenshot_b64
|
||||
break
|
||||
|
||||
if first_real_screenshot:
|
||||
@@ -162,14 +157,14 @@ def create_history_gif(
|
||||
else:
|
||||
logger.warning('No real screenshots found for task frame, skipping task frame')
|
||||
|
||||
# Process each history item
|
||||
for i, item in enumerate(history.history, 1):
|
||||
if not item.state.screenshot:
|
||||
# Process each history item with its corresponding screenshot
|
||||
for i, (item, screenshot) in enumerate(zip(history.history, screenshots), 1):
|
||||
if not screenshot:
|
||||
continue
|
||||
|
||||
# Skip placeholder screenshots from about:blank pages
|
||||
# These are 4x4 white PNGs encoded as a specific base64 string
|
||||
if item.state.screenshot == PLACEHOLDER_4PX_SCREENSHOT:
|
||||
if screenshot == PLACEHOLDER_4PX_SCREENSHOT:
|
||||
logger.debug(f'Skipping placeholder screenshot from about:blank page at step {i}')
|
||||
continue
|
||||
|
||||
@@ -181,7 +176,7 @@ def create_history_gif(
|
||||
continue
|
||||
|
||||
# Convert base64 screenshot to PIL Image
|
||||
img_data = base64.b64decode(item.state.screenshot)
|
||||
img_data = base64.b64decode(screenshot)
|
||||
image = Image.open(io.BytesIO(img_data))
|
||||
|
||||
if show_goals and item.model_output:
|
||||
|
||||
@@ -9,7 +9,6 @@ from browser_use.agent.message_manager.views import (
|
||||
from browser_use.agent.prompts import AgentMessagePrompt
|
||||
from browser_use.agent.views import (
|
||||
ActionResult,
|
||||
AgentHistoryList,
|
||||
AgentOutput,
|
||||
AgentStepInfo,
|
||||
MessageManagerState,
|
||||
@@ -104,10 +103,8 @@ class MessageManager:
|
||||
state: MessageManagerState = MessageManagerState(),
|
||||
use_thinking: bool = True,
|
||||
include_attributes: list[str] | None = None,
|
||||
message_context: str | None = None,
|
||||
sensitive_data: dict[str, str | dict[str, str]] | None = None,
|
||||
max_history_items: int | None = None,
|
||||
images_per_step: int = 1,
|
||||
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
|
||||
include_tool_call_examples: bool = False,
|
||||
):
|
||||
@@ -118,7 +115,6 @@ class MessageManager:
|
||||
self.sensitive_data_description = ''
|
||||
self.use_thinking = use_thinking
|
||||
self.max_history_items = max_history_items
|
||||
self.images_per_step = images_per_step
|
||||
self.vision_detail_level = vision_detail_level
|
||||
self.include_tool_call_examples = include_tool_call_examples
|
||||
|
||||
@@ -126,7 +122,6 @@ class MessageManager:
|
||||
|
||||
# Store settings as direct attributes instead of in a settings object
|
||||
self.include_attributes = include_attributes or []
|
||||
self.message_context = message_context
|
||||
self.sensitive_data = sensitive_data
|
||||
self.last_input_messages = []
|
||||
# Only initialize messages if state is empty
|
||||
@@ -260,7 +255,6 @@ class MessageManager:
|
||||
use_vision=True,
|
||||
page_filtered_actions: str | None = None,
|
||||
sensitive_data=None,
|
||||
agent_history_list: AgentHistoryList | None = None, # Pass AgentHistoryList from agent
|
||||
available_file_paths: list[str] | None = None, # Always pass current available_file_paths
|
||||
) -> None:
|
||||
"""Add browser state as human message"""
|
||||
@@ -269,14 +263,8 @@ class MessageManager:
|
||||
if sensitive_data:
|
||||
self.sensitive_data_description = self._get_sensitive_data_description(browser_state_summary.url)
|
||||
|
||||
# Extract previous screenshots if we need more than 1 image and have agent history
|
||||
# Use only the current screenshot
|
||||
screenshots = []
|
||||
if agent_history_list and self.images_per_step > 1:
|
||||
# Get previous screenshots and filter out None values
|
||||
raw_screenshots = agent_history_list.screenshots(n_last=self.images_per_step - 1, return_none_if_not_screenshot=False)
|
||||
screenshots = [s for s in raw_screenshots if s is not None]
|
||||
|
||||
# add current screenshot to the end
|
||||
if browser_state_summary.screenshot:
|
||||
screenshots.append(browser_state_summary.screenshot)
|
||||
|
||||
|
||||
@@ -108,36 +108,6 @@ class AgentMessagePrompt:
|
||||
self.vision_detail_level = vision_detail_level
|
||||
assert self.browser_state
|
||||
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='_deduplicate_screenshots')
|
||||
def _deduplicate_screenshots(self, screenshots: list[str]) -> list[str]:
|
||||
"""
|
||||
Remove consecutive duplicate screenshots, keeping only the most recent of each.
|
||||
|
||||
Args:
|
||||
screenshots: List of base64-encoded screenshot strings in chronological order (oldest first)
|
||||
|
||||
Returns:
|
||||
List of screenshots with consecutive duplicates removed, maintaining chronological order
|
||||
"""
|
||||
if not screenshots:
|
||||
return []
|
||||
|
||||
if len(screenshots) == 1:
|
||||
return screenshots
|
||||
|
||||
# Keep track of unique screenshots by comparing each with the next one
|
||||
unique_screenshots = []
|
||||
|
||||
for i in range(len(screenshots)):
|
||||
# Always keep the last screenshot
|
||||
if i == len(screenshots) - 1:
|
||||
unique_screenshots.append(screenshots[i])
|
||||
# Only keep screenshot if it's different from the next one
|
||||
elif screenshots[i] != screenshots[i + 1]:
|
||||
unique_screenshots.append(screenshots[i])
|
||||
|
||||
return unique_screenshots
|
||||
|
||||
@observe_debug(ignore_input=True, ignore_output=True, name='_get_browser_state_description')
|
||||
def _get_browser_state_description(self) -> str:
|
||||
elements_text = self.browser_state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)
|
||||
@@ -277,12 +247,9 @@ Available tabs:
|
||||
# Start with text description
|
||||
content_parts: list[ContentPartTextParam | ContentPartImageParam] = [ContentPartTextParam(text=state_description)]
|
||||
|
||||
# Deduplicate screenshots, keeping only the most recent of each unique image
|
||||
unique_screenshots = self._deduplicate_screenshots(self.screenshots)
|
||||
|
||||
# Add screenshots with labels
|
||||
for i, screenshot in enumerate(unique_screenshots):
|
||||
if i == len(unique_screenshots) - 1:
|
||||
for i, screenshot in enumerate(self.screenshots):
|
||||
if i == len(self.screenshots) - 1:
|
||||
label = 'Current screenshot:'
|
||||
else:
|
||||
# Use simple, accurate labeling since we don't have actual step timing info
|
||||
@@ -302,6 +269,6 @@ Available tabs:
|
||||
)
|
||||
)
|
||||
|
||||
return UserMessage(content=content_parts)
|
||||
return UserMessage(content=content_parts, cache=True)
|
||||
|
||||
return UserMessage(content=state_description)
|
||||
return UserMessage(content=state_description, cache=True)
|
||||
|
||||
@@ -3,7 +3,6 @@ import gc
|
||||
import inspect
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import tempfile
|
||||
@@ -163,7 +162,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
override_system_message: str | None = None,
|
||||
extend_system_message: str | None = None,
|
||||
validate_output: bool = False,
|
||||
message_context: str | None = None,
|
||||
generate_gif: bool | str = False,
|
||||
available_file_paths: list[str] | None = None,
|
||||
include_attributes: list[str] = DEFAULT_INCLUDE_ATTRIBUTES,
|
||||
@@ -171,7 +169,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
use_thinking: bool = True,
|
||||
flash_mode: bool = False,
|
||||
max_history_items: int = 40,
|
||||
images_per_step: int = 1,
|
||||
page_extraction_llm: BaseChatModel | None = None,
|
||||
planner_llm: BaseChatModel | None = None, # Deprecated
|
||||
planner_interval: int = 1, # Deprecated
|
||||
@@ -253,14 +250,12 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
override_system_message=override_system_message,
|
||||
extend_system_message=extend_system_message,
|
||||
validate_output=validate_output,
|
||||
message_context=message_context,
|
||||
generate_gif=generate_gif,
|
||||
include_attributes=include_attributes,
|
||||
max_actions_per_step=max_actions_per_step,
|
||||
use_thinking=use_thinking,
|
||||
flash_mode=flash_mode,
|
||||
max_history_items=max_history_items,
|
||||
images_per_step=images_per_step,
|
||||
page_extraction_llm=page_extraction_llm,
|
||||
planner_llm=None, # Always None now (deprecated)
|
||||
planner_interval=1, # Always 1 now (deprecated)
|
||||
@@ -281,8 +276,19 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
# Initialize state
|
||||
self.state = injected_agent_state or AgentState()
|
||||
|
||||
# Initialize file system
|
||||
# Initialize history
|
||||
self.history = AgentHistoryList(history=[], usage=None)
|
||||
|
||||
# Initialize agent directory
|
||||
import time
|
||||
|
||||
timestamp = int(time.time())
|
||||
base_tmp = Path(tempfile.gettempdir())
|
||||
self.agent_directory = base_tmp / f'browser_use_agent_{self.id}_{timestamp}'
|
||||
|
||||
# Initialize file system and screenshot service
|
||||
self._set_file_system(file_system_path)
|
||||
self._set_screenshot_service()
|
||||
|
||||
# Action setup
|
||||
self._setup_action_models()
|
||||
@@ -334,10 +340,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
use_thinking=self.settings.use_thinking,
|
||||
# Settings that were previously in MessageManagerSettings
|
||||
include_attributes=self.settings.include_attributes,
|
||||
message_context=self.settings.message_context,
|
||||
sensitive_data=sensitive_data,
|
||||
max_history_items=self.settings.max_history_items,
|
||||
images_per_step=self.settings.images_per_step,
|
||||
vision_detail_level=self.settings.vision_detail_level,
|
||||
include_tool_call_examples=self.settings.include_tool_call_examples,
|
||||
)
|
||||
@@ -562,10 +566,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
self.file_system = FileSystem(file_system_path)
|
||||
self.file_system_path = file_system_path
|
||||
else:
|
||||
# create a temporary file system using agent ID
|
||||
base_tmp = tempfile.gettempdir() # e.g., /tmp on Unix
|
||||
self.file_system_path = os.path.join(base_tmp, f'browser_use_agent_{self.id}')
|
||||
self.file_system = FileSystem(self.file_system_path)
|
||||
# Use the agent directory for file system
|
||||
self.file_system = FileSystem(self.agent_directory)
|
||||
self.file_system_path = str(self.agent_directory)
|
||||
except Exception as e:
|
||||
logger.error(f'💾 Failed to initialize file system: {e}.')
|
||||
raise e
|
||||
@@ -575,6 +578,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
|
||||
logger.info(f'💾 File system path: {self.file_system_path}')
|
||||
|
||||
def _set_screenshot_service(self) -> None:
    """Create the screenshot service for this agent, rooted at ``self.agent_directory``.

    Any exception raised while importing or constructing the service is logged
    and then re-raised to the caller.
    """
    try:
        # Imported inside the method rather than at module import time.
        from browser_use.screenshots.service import ScreenshotService

        service = ScreenshotService(self.agent_directory)
        self.screenshot_service = service
        logger.info(f'📸 Screenshot service initialized in: {self.agent_directory}/screenshots')
    except Exception as exc:
        logger.error(f'📸 Failed to initialize screenshot service: {exc}.')
        raise exc
|
||||
|
||||
def save_file_system_state(self) -> None:
|
||||
"""Save current file system state to agent state"""
|
||||
if self.file_system:
|
||||
@@ -583,9 +597,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
logger.error('💾 File system is not set up. Cannot save state.')
|
||||
raise ValueError('File system is not set up. Cannot save state.')
|
||||
|
||||
def _set_message_context(self) -> str | None:
|
||||
return self.settings.message_context
|
||||
|
||||
def _set_browser_use_version_and_source(self, source_override: str | None = None) -> None:
|
||||
"""Get the version from pyproject.toml and determine the source of the browser-use package"""
|
||||
# Use the helper function for version detection
|
||||
@@ -696,20 +707,20 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
|
||||
assert self.browser_session is not None, 'BrowserSession is not set up'
|
||||
|
||||
self.logger.debug(f'🌐 Step {self.state.n_steps + 1}: Getting browser state...')
|
||||
self.logger.debug(f'🌐 Step {self.state.n_steps}: Getting browser state...')
|
||||
browser_state_summary = await self.browser_session.get_browser_state_with_recovery(
|
||||
cache_clickable_elements_hashes=True, include_screenshot=self.settings.use_vision
|
||||
)
|
||||
current_page = await self.browser_session.get_current_page()
|
||||
|
||||
# Check for new downloads after getting browser state (catches PDF auto-downloads and previous step downloads)
|
||||
await self._check_and_update_downloads(f'Step {self.state.n_steps + 1}: after getting browser state')
|
||||
await self._check_and_update_downloads(f'Step {self.state.n_steps}: after getting browser state')
|
||||
|
||||
self._log_step_context(current_page, browser_state_summary)
|
||||
await self._raise_if_stopped_or_paused()
|
||||
|
||||
# Update action models with page-specific actions
|
||||
self.logger.debug(f'📝 Step {self.state.n_steps + 1}: Updating action models...')
|
||||
self.logger.debug(f'📝 Step {self.state.n_steps}: Updating action models...')
|
||||
await self._update_action_models_for_page(current_page)
|
||||
|
||||
# Get page-specific filtered actions
|
||||
@@ -720,7 +731,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
page_action_message = f'For this page, these additional actions are available:\n{page_filtered_actions}'
|
||||
self._message_manager._add_message_with_type(UserMessage(content=page_action_message), 'consistent')
|
||||
|
||||
self.logger.debug(f'💬 Step {self.state.n_steps + 1}: Adding state message to context...')
|
||||
self.logger.debug(f'💬 Step {self.state.n_steps}: Adding state message to context...')
|
||||
self._message_manager.add_state_message(
|
||||
browser_state_summary=browser_state_summary,
|
||||
model_output=self.state.last_model_output,
|
||||
@@ -729,7 +740,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
use_vision=self.settings.use_vision,
|
||||
page_filtered_actions=page_filtered_actions if page_filtered_actions else None,
|
||||
sensitive_data=self.sensitive_data,
|
||||
agent_history_list=self.state.history, # Pass AgentHistoryList for screenshots
|
||||
available_file_paths=self.available_file_paths, # Always pass current available_file_paths
|
||||
)
|
||||
|
||||
@@ -741,7 +751,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
"""Execute LLM interaction with retry logic and handle callbacks"""
|
||||
input_messages = self._message_manager.get_messages()
|
||||
self.logger.debug(
|
||||
f'🤖 Step {self.state.n_steps + 1}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...'
|
||||
f'🤖 Step {self.state.n_steps}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...'
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -758,9 +768,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
# Check again for paused/stopped state after getting model output
|
||||
await self._raise_if_stopped_or_paused()
|
||||
|
||||
# Increment step counter at the start of each step
|
||||
self.state.n_steps += 1
|
||||
|
||||
# Handle callbacks and conversation saving
|
||||
await self._handle_post_llm_processing(browser_state_summary, input_messages)
|
||||
|
||||
@@ -854,7 +861,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
)
|
||||
|
||||
# Use _make_history_item like main branch
|
||||
self._make_history_item(self.state.last_model_output, browser_state_summary, self.state.last_result, metadata)
|
||||
await self._make_history_item(self.state.last_model_output, browser_state_summary, self.state.last_result, metadata)
|
||||
|
||||
# Log step completion summary
|
||||
self._log_step_completion_summary(self.step_start_time, self.state.last_result)
|
||||
@@ -877,6 +884,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
)
|
||||
self.eventbus.dispatch(step_event)
|
||||
|
||||
# Increment step counter after step is fully completed
|
||||
self.state.n_steps += 1
|
||||
|
||||
async def _handle_final_step(self, step_info: AgentStepInfo | None = None) -> None:
|
||||
"""Handle special processing for the last step"""
|
||||
if step_info and step_info.is_last_step():
|
||||
@@ -893,7 +903,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
"""Get model output with retry logic for empty actions"""
|
||||
model_output = await self.get_model_output(input_messages)
|
||||
self.logger.debug(
|
||||
f'✅ Step {self.state.n_steps + 1}: Got LLM response with {len(model_output.action) if model_output.action else 0} actions'
|
||||
f'✅ Step {self.state.n_steps}: Got LLM response with {len(model_output.action) if model_output.action else 0} actions'
|
||||
)
|
||||
|
||||
if (
|
||||
@@ -947,7 +957,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
self.settings.save_conversation_path_encoding,
|
||||
)
|
||||
|
||||
def _make_history_item(
|
||||
async def _make_history_item(
|
||||
self,
|
||||
model_output: AgentOutput | None,
|
||||
browser_state_summary: BrowserStateSummary,
|
||||
@@ -961,12 +971,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
else:
|
||||
interacted_elements = [None]
|
||||
|
||||
# Store screenshot and get path
|
||||
screenshot_path = None
|
||||
if browser_state_summary.screenshot:
|
||||
screenshot_path = await self.screenshot_service.store_screenshot(browser_state_summary.screenshot, self.state.n_steps)
|
||||
|
||||
state_history = BrowserStateHistory(
|
||||
url=browser_state_summary.url,
|
||||
title=browser_state_summary.title,
|
||||
tabs=browser_state_summary.tabs,
|
||||
interacted_element=interacted_elements,
|
||||
screenshot=browser_state_summary.screenshot,
|
||||
screenshot_path=screenshot_path,
|
||||
)
|
||||
|
||||
history_item = AgentHistory(
|
||||
@@ -976,7 +991,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
self.state.history.history.append(history_item)
|
||||
self.history.add_item(history_item)
|
||||
|
||||
def _remove_think_tags(self, text: str) -> str:
|
||||
THINK_TAGS = re.compile(r'<think>.*?</think>', re.DOTALL)
|
||||
@@ -1021,7 +1036,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
url_short = current_page.url[:50] + '...' if len(current_page.url) > 50 else current_page.url
|
||||
interactive_count = len(browser_state_summary.selector_map) if browser_state_summary else 0
|
||||
self.logger.info(
|
||||
f'📍 Step {self.state.n_steps + 1}: Evaluating page with {interactive_count} interactive elements on: {url_short}'
|
||||
f'📍 Step {self.state.n_steps}: Evaluating page with {interactive_count} interactive elements on: {url_short}'
|
||||
)
|
||||
|
||||
def _log_next_action_summary(self, parsed: 'AgentOutput') -> None:
|
||||
@@ -1094,7 +1109,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
|
||||
# Prepare action_history data correctly
|
||||
action_history_data = []
|
||||
for item in self.state.history.history:
|
||||
for item in self.history.history:
|
||||
if item.model_output and item.model_output.action:
|
||||
# Convert each ActionModel in the step to its dictionary representation
|
||||
step_actions = [
|
||||
@@ -1107,7 +1122,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
# Append None or [] if a step had no actions or no model output
|
||||
action_history_data.append(None)
|
||||
|
||||
final_res = self.state.history.final_result()
|
||||
final_res = self.history.final_result()
|
||||
final_result_str = json.dumps(final_res) if final_res is not None else None
|
||||
|
||||
self.telemetry.capture(
|
||||
@@ -1125,13 +1140,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
cdp_url=urlparse(self.browser_session.cdp_url).hostname
|
||||
if self.browser_session and self.browser_session.cdp_url
|
||||
else None,
|
||||
action_errors=self.state.history.errors(),
|
||||
action_errors=self.history.errors(),
|
||||
action_history=action_history_data,
|
||||
urls_visited=self.state.history.urls(),
|
||||
urls_visited=self.history.urls(),
|
||||
steps=self.state.n_steps,
|
||||
total_input_tokens=token_summary.prompt_tokens,
|
||||
total_duration_seconds=self.state.history.total_duration_seconds(),
|
||||
success=self.state.history.is_successful(),
|
||||
total_duration_seconds=self.history.total_duration_seconds(),
|
||||
success=self.history.is_successful(),
|
||||
final_result_response=final_result_str,
|
||||
error_message=agent_run_error,
|
||||
)
|
||||
@@ -1145,13 +1160,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
"""
|
||||
await self.step(step_info)
|
||||
|
||||
if self.state.history.is_done():
|
||||
if self.history.is_done():
|
||||
await self.log_completion()
|
||||
if self.register_done_callback:
|
||||
if inspect.iscoroutinefunction(self.register_done_callback):
|
||||
await self.register_done_callback(self.state.history)
|
||||
await self.register_done_callback(self.history)
|
||||
else:
|
||||
self.register_done_callback(self.state.history)
|
||||
self.register_done_callback(self.history)
|
||||
return True, True
|
||||
|
||||
return False, False
|
||||
@@ -1271,22 +1286,22 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
if on_step_end is not None:
|
||||
await on_step_end(self)
|
||||
|
||||
if self.state.history.is_done():
|
||||
if self.history.is_done():
|
||||
self.logger.debug(f'🎯 Task completed after {step + 1} steps!')
|
||||
await self.log_completion()
|
||||
|
||||
if self.register_done_callback:
|
||||
if inspect.iscoroutinefunction(self.register_done_callback):
|
||||
await self.register_done_callback(self.state.history)
|
||||
await self.register_done_callback(self.history)
|
||||
else:
|
||||
self.register_done_callback(self.state.history)
|
||||
self.register_done_callback(self.history)
|
||||
|
||||
# Task completed
|
||||
break
|
||||
else:
|
||||
agent_run_error = 'Failed to complete task in maximum steps'
|
||||
|
||||
self.state.history.history.append(
|
||||
self.history.add_item(
|
||||
AgentHistory(
|
||||
model_output=None,
|
||||
result=[ActionResult(error=agent_run_error, include_in_memory=True)],
|
||||
@@ -1295,7 +1310,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
title='',
|
||||
tabs=[],
|
||||
interacted_element=[],
|
||||
screenshot=None,
|
||||
screenshot_path=None,
|
||||
),
|
||||
metadata=None,
|
||||
)
|
||||
@@ -1304,23 +1319,23 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
self.logger.info(f'❌ {agent_run_error}')
|
||||
|
||||
self.logger.debug('📊 Collecting usage summary...')
|
||||
self.state.history.usage = await self.token_cost_service.get_usage_summary()
|
||||
self.history.usage = await self.token_cost_service.get_usage_summary()
|
||||
|
||||
# set the model output schema and call it on the fly
|
||||
if self.state.history._output_model_schema is None and self.output_model_schema is not None:
|
||||
self.state.history._output_model_schema = self.output_model_schema
|
||||
if self.history._output_model_schema is None and self.output_model_schema is not None:
|
||||
self.history._output_model_schema = self.output_model_schema
|
||||
|
||||
self.logger.debug('🏁 Agent.run() completed successfully')
|
||||
return self.state.history
|
||||
return self.history
|
||||
|
||||
except KeyboardInterrupt:
|
||||
# Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
|
||||
self.logger.info('Got KeyboardInterrupt during execution, returning current history')
|
||||
agent_run_error = 'KeyboardInterrupt'
|
||||
|
||||
self.state.history.usage = await self.token_cost_service.get_usage_summary()
|
||||
self.history.usage = await self.token_cost_service.get_usage_summary()
|
||||
|
||||
return self.state.history
|
||||
return self.history
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f'Agent run failed with exception: {e}', exc_info=True)
|
||||
@@ -1359,7 +1374,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
# Lazy import gif module to avoid heavy startup cost
|
||||
from browser_use.agent.gif import create_history_gif
|
||||
|
||||
create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
|
||||
create_history_gif(task=self.task, history=self.history, output_path=output_path)
|
||||
|
||||
# Only emit output file event if GIF was actually created
|
||||
if Path(output_path).exists():
|
||||
@@ -1484,7 +1499,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
|
||||
async def log_completion(self) -> None:
|
||||
"""Log the completion of the task"""
|
||||
if self.state.history.is_successful():
|
||||
if self.history.is_successful():
|
||||
self.logger.info('✅ Task completed successfully')
|
||||
else:
|
||||
self.logger.info('❌ Task completed without success')
|
||||
@@ -1618,7 +1633,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
"""Save the history to a file"""
|
||||
if not file_path:
|
||||
file_path = 'AgentHistory.json'
|
||||
self.state.history.save_to_file(file_path)
|
||||
self.history.save_to_file(file_path)
|
||||
|
||||
async def wait_until_resumed(self):
|
||||
await self._external_pause_event.wait()
|
||||
@@ -1756,14 +1771,14 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
timestamp = datetime.now().isoformat()
|
||||
|
||||
# Only declare variables that are used multiple times
|
||||
structured_output = self.state.history.structured_output
|
||||
structured_output = self.history.structured_output
|
||||
structured_output_json = json.dumps(structured_output.model_dump()) if structured_output else None
|
||||
final_result = self.state.history.final_result()
|
||||
final_result = self.history.final_result()
|
||||
git_info = get_git_info()
|
||||
action_history = self.state.history.action_history()
|
||||
action_errors = self.state.history.errors()
|
||||
urls = self.state.history.urls()
|
||||
usage = self.state.history.usage
|
||||
action_history = self.history.action_history()
|
||||
action_errors = self.history.errors()
|
||||
urls = self.history.urls()
|
||||
usage = self.history.usage
|
||||
|
||||
return {
|
||||
'trace': {
|
||||
@@ -1790,10 +1805,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
'final_result_response_truncated': (
|
||||
final_result[:20000] if final_result and len(final_result) > 20000 else final_result
|
||||
),
|
||||
'self_report_completed': 1 if self.state.history.is_done() else 0,
|
||||
'self_report_success': 1 if self.state.history.is_successful() else 0,
|
||||
'duration': self.state.history.total_duration_seconds(),
|
||||
'steps_taken': self.state.history.number_of_steps(),
|
||||
'self_report_completed': 1 if self.history.is_done() else 0,
|
||||
'self_report_success': 1 if self.history.is_successful() else 0,
|
||||
'duration': self.history.total_duration_seconds(),
|
||||
'steps_taken': self.history.number_of_steps(),
|
||||
'usage': json.dumps(usage.model_dump()) if usage else None,
|
||||
},
|
||||
'trace_details': {
|
||||
@@ -1805,6 +1820,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
|
||||
# AgentHistoryList methods
|
||||
'structured_output': structured_output_json,
|
||||
'final_result_response': final_result,
|
||||
'complete_history': _get_complete_history_without_screenshots(self.state.history.model_dump()),
|
||||
'complete_history': _get_complete_history_without_screenshots(self.history.model_dump()),
|
||||
},
|
||||
}
|
||||
|
||||
@@ -36,7 +36,6 @@ class AgentSettings(BaseModel):
|
||||
max_failures: int = 3
|
||||
retry_delay: int = 10
|
||||
validate_output: bool = False
|
||||
message_context: str | None = None
|
||||
generate_gif: bool | str = False
|
||||
override_system_message: str | None = None
|
||||
extend_system_message: str | None = None
|
||||
@@ -56,7 +55,6 @@ class AgentSettings(BaseModel):
|
||||
use_thinking: bool = True
|
||||
flash_mode: bool = False # If enabled, disables evaluation_previous_goal and next_goal, and sets use_thinking = False
|
||||
max_history_items: int = 40
|
||||
images_per_step: int = 1
|
||||
|
||||
page_extraction_llm: BaseChatModel | None = None
|
||||
planner_llm: BaseChatModel | None = None
|
||||
@@ -76,7 +74,6 @@ class AgentState(BaseModel):
|
||||
n_steps: int = 1
|
||||
consecutive_failures: int = 0
|
||||
last_result: list[ActionResult] | None = None
|
||||
history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[], usage=None))
|
||||
last_plan: str | None = None
|
||||
last_model_output: AgentOutput | None = None
|
||||
paused: bool = False
|
||||
@@ -329,6 +326,10 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
|
||||
"""Representation of the AgentHistoryList object"""
|
||||
return f'AgentHistoryList(all_results={self.action_results()}, all_model_outputs={self.model_actions()})'
|
||||
|
||||
def add_item(self, history_item: AgentHistory) -> None:
    """Append ``history_item`` to the end of this list's ``history``."""
    entries = self.history
    entries.append(history_item)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""Representation of the AgentHistoryList object"""
|
||||
return self.__str__()
|
||||
@@ -443,20 +444,39 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
|
||||
"""Get all unique URLs from history"""
|
||||
return [h.state.url if h.state.url is not None else None for h in self.history]
|
||||
|
||||
def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
|
||||
"""Get all screenshots from history"""
|
||||
def screenshot_paths(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
|
||||
"""Get all screenshot paths from history"""
|
||||
if n_last == 0:
|
||||
return []
|
||||
if n_last is None:
|
||||
if return_none_if_not_screenshot:
|
||||
return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history]
|
||||
return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self.history]
|
||||
else:
|
||||
return [h.state.screenshot for h in self.history if h.state.screenshot is not None]
|
||||
return [h.state.screenshot_path for h in self.history if h.state.screenshot_path is not None]
|
||||
else:
|
||||
if return_none_if_not_screenshot:
|
||||
return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history[-n_last:]]
|
||||
return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self.history[-n_last:]]
|
||||
else:
|
||||
return [h.state.screenshot for h in self.history[-n_last:] if h.state.screenshot is not None]
|
||||
return [h.state.screenshot_path for h in self.history[-n_last:] if h.state.screenshot_path is not None]
|
||||
|
||||
def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
    """Return the screenshots of the history items as base64 strings.

    Args:
        n_last: Only look at the last ``n_last`` history items; ``None`` means
            all items and ``0`` yields an empty list.
        return_none_if_not_screenshot: When True, items whose
            ``state.get_screenshot()`` result is falsy contribute ``None``;
            when False, those items are left out of the result.
    """
    if n_last == 0:
        return []

    window = self.history if n_last is None else self.history[-n_last:]

    # get_screenshot() is called exactly once per item, in history order;
    # any falsy result (None or an empty string) is normalised to None.
    loaded = [entry.state.get_screenshot() or None for entry in window]

    if return_none_if_not_screenshot:
        return loaded
    return [shot for shot in loaded if shot is not None]
|
||||
|
||||
def action_names(self) -> list[str]:
|
||||
"""Get all action names from history"""
|
||||
|
||||
@@ -81,12 +81,31 @@ class BrowserStateHistory:
|
||||
title: str
|
||||
tabs: list[TabInfo]
|
||||
interacted_element: list[DOMHistoryElement | None] | list[None]
|
||||
screenshot: str | None = None
|
||||
screenshot_path: str | None = None
|
||||
|
||||
def get_screenshot(self) -> str | None:
    """Load the screenshot file from disk and return it as a base64 string.

    Returns:
        The file contents base64-encoded and decoded as UTF-8 text, or
        ``None`` when ``screenshot_path`` is unset/empty or the file cannot
        be read (missing, a directory, permission denied, invalid path).
    """
    if not self.screenshot_path:
        return None

    import base64
    from pathlib import Path

    path_obj = Path(self.screenshot_path)

    try:
        # Read directly rather than checking exists() first: the file can
        # disappear between the check and the open, and a missing file is
        # reported as FileNotFoundError (an OSError) anyway.
        screenshot_data = path_obj.read_bytes()
    except (OSError, ValueError):
        # OSError covers missing/unreadable files and directories;
        # ValueError covers paths the OS rejects outright (embedded NUL byte).
        return None

    return base64.b64encode(screenshot_data).decode('utf-8')
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
data = {}
|
||||
data['tabs'] = [tab.model_dump() for tab in self.tabs]
|
||||
data['screenshot'] = self.screenshot
|
||||
data['screenshot_path'] = self.screenshot_path
|
||||
data['interacted_element'] = [el.to_dict() if el else None for el in self.interacted_element]
|
||||
data['url'] = self.url
|
||||
data['title'] = self.title
|
||||
|
||||
@@ -815,18 +815,18 @@ class BrowserUseApp(App):
|
||||
# Show token usage statistics if agent exists and has history
|
||||
if self.agent and hasattr(self.agent, 'state') and hasattr(self.agent.state, 'history'):
|
||||
# Get total tokens used
|
||||
# total_tokens = self.agent.state.history.total_input_tokens()
|
||||
# total_tokens = self.agent.history.total_input_tokens()
|
||||
# model_info.write(f'[white]Input tokens:[/] [green]{total_tokens:,}[/]')
|
||||
|
||||
# Calculate tokens per step
|
||||
num_steps = len(self.agent.state.history.history)
|
||||
num_steps = len(self.agent.history.history)
|
||||
# if num_steps > 0:
|
||||
# avg_tokens_per_step = total_tokens / num_steps
|
||||
# model_info.write(f'[white]Avg tokens/step:[/] [green]{avg_tokens_per_step:,.1f}[/]')
|
||||
|
||||
# Get the last step metadata to show the most recent LLM response time
|
||||
if num_steps > 0 and self.agent.state.history.history[-1].metadata:
|
||||
last_step = self.agent.state.history.history[-1]
|
||||
if num_steps > 0 and self.agent.history.history[-1].metadata:
|
||||
last_step = self.agent.history.history[-1]
|
||||
if last_step.metadata:
|
||||
step_duration = last_step.metadata.duration_seconds
|
||||
else:
|
||||
@@ -838,7 +838,7 @@ class BrowserUseApp(App):
|
||||
# model_info.write(f'[white]Avg tokens/sec:[/] [magenta]{tokens_per_second:.1f}[/]')
|
||||
|
||||
# Show total duration
|
||||
total_duration = self.agent.state.history.total_duration_seconds()
|
||||
total_duration = self.agent.history.total_duration_seconds()
|
||||
if total_duration > 0:
|
||||
model_info.write(f'[white]Total Duration:[/] [magenta]{total_duration:.2f}s[/]')
|
||||
|
||||
@@ -891,7 +891,7 @@ class BrowserUseApp(App):
|
||||
# Get all agent history items
|
||||
history_items = []
|
||||
if hasattr(self.agent, 'state') and hasattr(self.agent.state, 'history'):
|
||||
history_items = self.agent.state.history.history
|
||||
history_items = self.agent.history.history
|
||||
|
||||
if history_items:
|
||||
tasks_info.write('[bold yellow]STEPS:[/]')
|
||||
|
||||
@@ -371,6 +371,7 @@ class ChatGoogle(BaseChatModel):
|
||||
key == 'properties'
|
||||
and isinstance(cleaned_value, dict)
|
||||
and len(cleaned_value) == 0
|
||||
and isinstance(obj.get('type', ''), str)
|
||||
and obj.get('type', '').upper() == 'OBJECT'
|
||||
):
|
||||
# Convert empty object to have at least one property
|
||||
@@ -380,7 +381,8 @@ class ChatGoogle(BaseChatModel):
|
||||
|
||||
# If this is an object type with empty properties, add a placeholder
|
||||
if (
|
||||
cleaned.get('type', '').upper() == 'OBJECT'
|
||||
isinstance(cleaned.get('type', ''), str)
|
||||
and cleaned.get('type', '').upper() == 'OBJECT'
|
||||
and 'properties' in cleaned
|
||||
and isinstance(cleaned['properties'], dict)
|
||||
and len(cleaned['properties']) == 0
|
||||
|
||||
1
browser_use/screenshots/__init__.py
Normal file
1
browser_use/screenshots/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Screenshots package for browser-use
|
||||
48
browser_use/screenshots/service.py
Normal file
48
browser_use/screenshots/service.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
Screenshot storage service for browser-use agents.
|
||||
"""
|
||||
|
||||
import base64
|
||||
from pathlib import Path
|
||||
|
||||
import anyio
|
||||
|
||||
|
||||
class ScreenshotService:
    """Persist agent screenshots as PNG files on disk and read them back as base64."""

    def __init__(self, agent_directory: str | Path):
        """Remember the agent directory and make sure its ``screenshots`` folder exists."""
        root = agent_directory
        if isinstance(root, str):
            root = Path(root)
        self.agent_directory = root

        # Every screenshot is written into this one subdirectory
        self.screenshots_dir = self.agent_directory / 'screenshots'
        self.screenshots_dir.mkdir(parents=True, exist_ok=True)

    async def store_screenshot(self, screenshot_b64: str, step_number: int) -> str:
        """Decode a base64 screenshot, write it as ``step_<step_number>.png`` and return the path as a string."""
        target = self.screenshots_dir / f'step_{step_number}.png'
        raw_bytes = base64.b64decode(screenshot_b64)

        async with await anyio.open_file(target, 'wb') as out_file:
            await out_file.write(raw_bytes)

        return str(target)

    async def get_screenshot(self, screenshot_path: str) -> str | None:
        """Return the file at ``screenshot_path`` base64-encoded, or None if the path is empty or does not exist."""
        if not screenshot_path:
            return None

        source = Path(screenshot_path)
        if not source.exists():
            return None

        async with await anyio.open_file(source, 'rb') as in_file:
            raw_bytes = await in_file.read()

        return base64.b64encode(raw_bytes).decode('utf-8')
|
||||
@@ -133,7 +133,7 @@ history = await agent.run()
|
||||
|
||||
# Access (some) useful information
|
||||
history.urls() # List of visited URLs
|
||||
history.screenshots() # List of screenshot paths
|
||||
history.screenshot_paths() # List of screenshot paths
|
||||
history.action_names() # Names of executed actions
|
||||
history.extracted_content() # Content extracted during execution
|
||||
history.errors() # Any errors that occurred
|
||||
@@ -173,60 +173,12 @@ agent = Agent(
|
||||
)
|
||||
```
|
||||
|
||||
## Run with message context
|
||||
|
||||
You can configure the agent and provide a separate message to help the LLM understand the task better.
|
||||
|
||||
```python
|
||||
from browser_use.llm import ChatOpenAI
|
||||
|
||||
agent = Agent(
|
||||
task="your task",
|
||||
message_context="Additional information about the task",
|
||||
llm = ChatOpenAI(model='gpt-4o')
|
||||
)
|
||||
```
|
||||
|
||||
## Run with planner model
|
||||
|
||||
You can configure the agent to use a separate planner model for high-level task planning:
|
||||
|
||||
```python
|
||||
from browser_use.llm import ChatOpenAI
|
||||
|
||||
# Initialize models
|
||||
llm = ChatOpenAI(model='gpt-4o')
|
||||
planner_llm = ChatOpenAI(model='o3-mini')
|
||||
|
||||
agent = Agent(
|
||||
task="your task",
|
||||
llm=llm,
|
||||
planner_llm=planner_llm, # Separate model for planning
|
||||
use_vision_for_planner=False, # Disable vision for planner
|
||||
planner_interval=4 # Plan every 4 steps
|
||||
)
|
||||
```
|
||||
|
||||
### Planner Parameters
|
||||
|
||||
- `planner_llm`: A chat model instance used for high-level task planning. Can be a smaller/cheaper model than the main LLM.
|
||||
- `use_vision_for_planner`: Enable/disable vision capabilities for the planner model. Defaults to `True`.
|
||||
- `planner_interval`: Number of steps between planning phases. Defaults to `1`.
|
||||
|
||||
Using a separate planner model can help:
|
||||
|
||||
- Reduce costs by using a smaller model for high-level planning
|
||||
- Improve task decomposition and strategic thinking
|
||||
- Better handle complex, multi-step tasks
|
||||
|
||||
<Note>
|
||||
The planner model is optional. If not specified, the agent will not use the
|
||||
planner model.
|
||||
</Note>
|
||||
|
||||
### Optional Parameters
|
||||
|
||||
- `message_context`: Additional information about the task to help the LLM understand the task better.
|
||||
- `initial_actions`: List of initial actions to run before the main task.
|
||||
- `max_actions_per_step`: Maximum number of actions to run in a step. Defaults to `10`.
|
||||
- `max_failures`: Maximum number of failures before giving up. Defaults to `3`.
|
||||
|
||||
@@ -41,7 +41,7 @@ async def my_step_hook(agent: Agent):
|
||||
# https://playwright.dev/python/docs/api/class-page
|
||||
|
||||
current_url = page.url
|
||||
visit_log = agent.state.history.urls()
|
||||
visit_log = agent.history.urls()
|
||||
previous_url = visit_log[-2] if len(visit_log) >= 2 else None
|
||||
print(f"Agent was last on URL: {previous_url} and is now on {current_url}")
|
||||
|
||||
@@ -91,11 +91,12 @@ When working with agent hooks, you have access to the entire `Agent` instance. H
|
||||
- `agent.settings` contains all the configuration options passed to the `Agent(...)` at init time
|
||||
- `agent.llm` gives direct access to the main LLM object (e.g. `ChatOpenAI`)
|
||||
- `agent.state` gives access to lots of internal state, including agent thoughts, outputs, actions, etc.
|
||||
- `agent.state.history.model_thoughts()`: Reasoning from Browser Use's model.
|
||||
- `agent.state.history.model_outputs()`: Raw outputs from the Browsre Use's model.
|
||||
- `agent.state.history.model_actions()`: Actions taken by the agent
|
||||
- `agent.state.history.extracted_content()`: Content extracted from web pages
|
||||
- `agent.state.history.urls()`: URLs visited by the agent
|
||||
- `agent.history` gives access to historical data from the agent's execution:
|
||||
- `agent.history.model_thoughts()`: Reasoning from Browser Use's model.
|
||||
- `agent.history.model_outputs()`: Raw outputs from Browser Use's model.
|
||||
- `agent.history.model_actions()`: Actions taken by the agent
|
||||
- `agent.history.extracted_content()`: Content extracted from web pages
|
||||
- `agent.history.urls()`: URLs visited by the agent
|
||||
- `agent.browser_session` gives direct access to the `BrowserSession()` and playwright objects
|
||||
- `agent.browser_session.get_current_page()`: Get the current playwright `Page` object the agent is focused on
|
||||
- `agent.browser_session.browser_context`: Get the current playwright `BrowserContext` object
|
||||
|
||||
@@ -154,7 +154,7 @@ async def record_activity(agent_obj):
|
||||
print('--> History:')
|
||||
# Assert agent has state to satisfy type checker
|
||||
assert hasattr(agent_obj, 'state'), 'Agent must have state attribute'
|
||||
history = agent_obj.state.history
|
||||
history = agent_obj.history
|
||||
|
||||
model_thoughts = obj_to_json(obj=history.model_thoughts(), check_circular=False)
|
||||
|
||||
@@ -164,7 +164,7 @@ async def record_activity(agent_obj):
|
||||
# prettyprinter.cpprint(model_thoughts_last_elem)
|
||||
|
||||
# print("--- MODEL OUTPUT ACTION ---")
|
||||
model_outputs = agent_obj.state.history.model_outputs()
|
||||
model_outputs = agent_obj.history.model_outputs()
|
||||
model_outputs_json = obj_to_json(obj=model_outputs, check_circular=False)
|
||||
|
||||
if len(model_outputs_json) > 0:
|
||||
@@ -172,7 +172,7 @@ async def record_activity(agent_obj):
|
||||
# prettyprinter.cpprint(model_outputs_json_last_elem)
|
||||
|
||||
# print("--- MODEL INTERACTED ELEM ---")
|
||||
model_actions = agent_obj.state.history.model_actions()
|
||||
model_actions = agent_obj.history.model_actions()
|
||||
model_actions_json = obj_to_json(obj=model_actions, check_circular=False)
|
||||
|
||||
if len(model_actions_json) > 0:
|
||||
@@ -180,14 +180,14 @@ async def record_activity(agent_obj):
|
||||
# prettyprinter.cpprint(model_actions_json_last_elem)
|
||||
|
||||
# print("--- EXTRACTED CONTENT ---")
|
||||
extracted_content = agent_obj.state.history.extracted_content()
|
||||
extracted_content = agent_obj.history.extracted_content()
|
||||
extracted_content_json = obj_to_json(obj=extracted_content, check_circular=False)
|
||||
if len(extracted_content_json) > 0:
|
||||
extracted_content_json_last_elem = extracted_content_json[-1]
|
||||
# prettyprinter.cpprint(extracted_content_json_last_elem)
|
||||
|
||||
# print("--- URLS ---")
|
||||
urls = agent_obj.state.history.urls()
|
||||
urls = agent_obj.history.urls()
|
||||
# prettyprinter.cpprint(urls)
|
||||
urls_json = obj_to_json(obj=urls, check_circular=False)
|
||||
|
||||
|
||||
@@ -47,8 +47,6 @@ async def main():
|
||||
if done and valid:
|
||||
break
|
||||
|
||||
agent_state.history.history = []
|
||||
|
||||
# Save state to file
|
||||
async with await anyio.open_file('agent_state.json', 'w') as f:
|
||||
serialized = agent_state.model_dump_json(exclude={'history'})
|
||||
|
||||
@@ -28,7 +28,7 @@ async def main():
|
||||
task='What should we pay attention to in the recent new rules on tariffs in China-US trade?',
|
||||
llm=llm,
|
||||
use_vision=False,
|
||||
message_context=extend_system_message,
|
||||
extend_system_message=extend_system_message,
|
||||
)
|
||||
await agent.run()
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
name = "browser-use"
|
||||
description = "Make websites accessible for AI agents"
|
||||
authors = [{ name = "Gregor Zunic" }]
|
||||
version = "0.5.7"
|
||||
version = "0.5.9"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11,<4.0"
|
||||
classifiers = [
|
||||
|
||||
107
tests/ci/test_gemini_type_field_fix.py
Normal file
107
tests/ci/test_gemini_type_field_fix.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""
|
||||
Test to reproduce and verify fix for GitHub issue #2470:
|
||||
"Python field with name 'type' handled differently between Gemini and OpenAI GPT"
|
||||
"""
|
||||
|
||||
from browser_use.llm.google.chat import ChatGoogle
|
||||
from browser_use.llm.schema import SchemaOptimizer
|
||||
|
||||
|
||||
class TestGeminiTypeFieldHandling:
    """Regression tests for how ``ChatGoogle._fix_gemini_schema`` treats the schema 'type' key."""

    def _run_fix(self, schema):
        """Build a fresh ChatGoogle client and pass ``schema`` through ``_fix_gemini_schema``."""
        client = ChatGoogle(model='gemini-2.0-flash-exp')
        return client._fix_gemini_schema(schema)

    def test_gemini_schema_with_dict_type_field(self):
        """
        A top-level 'type' that is a dict instead of a string must be handled gracefully.
        Reproduces the AttributeError: 'dict' object has no attribute 'upper'
        """
        # The 'type' value is a dict where a string is normally expected
        schema = {'type': {'malformed': 'dict_type'}, 'properties': {}}

        fixed = self._run_fix(schema)

        assert fixed is not None
        assert isinstance(fixed, dict)
        assert fixed['type'] == {'malformed': 'dict_type'}

    def test_gemini_schema_with_nested_dict_type_field(self):
        """
        A dict-valued 'type' on a nested property must be handled gracefully as well.
        """
        # Same malformed 'type', but one level down inside 'properties'
        schema = {
            'type': 'object',
            'properties': {'nested_field': {'type': {'malformed': 'dict_instead_of_string'}, 'properties': {}}},
        }

        fixed = self._run_fix(schema)

        assert fixed is not None
        assert isinstance(fixed, dict)
        assert fixed['properties']['nested_field']['type'] == {'malformed': 'dict_instead_of_string'}

    def test_gemini_schema_with_none_type_field(self):
        """A nested 'type' of None must not make schema fixing fail."""
        schema = {'type': 'object', 'properties': {'nested_field': {'type': None, 'properties': {}}}}

        fixed = self._run_fix(schema)

        assert fixed is not None

    def test_gemini_schema_with_valid_string_type(self):
        """Well-formed string 'type' values keep working."""
        schema = {'type': 'object', 'properties': {'nested_field': {'type': 'object', 'properties': {}}}}

        # No error expected for a valid schema
        fixed = self._run_fix(schema)

        assert fixed is not None
        assert isinstance(fixed, dict)

    def test_gemini_schema_with_empty_properties_object(self):
        """A nested object with empty 'properties' receives a string '_placeholder' property."""
        schema = {
            'type': 'object',
            'properties': {
                'empty_object': {
                    'type': 'object',
                    'properties': {},  # expected to be filled with a placeholder
                }
            },
        }

        fixed = self._run_fix(schema)

        filled = fixed['properties']['empty_object']['properties']
        assert '_placeholder' in filled
        assert filled['_placeholder']['type'] == 'string'

    def test_consistency_between_providers(self):
        """
        The schema produced for OpenAI must also pass through the Gemini fixer.
        The original issue was that Gemini would fail where OpenAI succeeded.
        """
        from pydantic import BaseModel, Field

        # Model with a single dict-typed field, used to generate the schema
        class TestModel(BaseModel):
            field_with_dict_type: dict = Field(default_factory=dict)

        # OpenAI path: SchemaOptimizer output is used directly
        optimized = SchemaOptimizer.create_optimized_json_schema(TestModel)
        assert optimized is not None

        # Gemini path: the same schema additionally goes through _fix_gemini_schema
        fixed = self._run_fix(optimized)
        assert fixed is not None

        # Neither step raising is the point of this test
|
||||
@@ -9,7 +9,7 @@ from PIL import Image
|
||||
from browser_use import AgentHistoryList
|
||||
from browser_use.agent.gif import create_history_gif
|
||||
from browser_use.agent.views import ActionResult, AgentHistory, AgentOutput
|
||||
from browser_use.browser.views import PLACEHOLDER_4PX_SCREENSHOT, BrowserStateHistory, TabInfo
|
||||
from browser_use.browser.views import BrowserStateHistory, TabInfo
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -49,9 +49,22 @@ def create_test_screenshot(width: int = 800, height: int = 600, color: tuple = (
|
||||
|
||||
async def test_gif_filters_out_placeholder_screenshots(test_dir):
|
||||
"""Test that 4px placeholder screenshots from about:blank pages are filtered out of GIFs."""
|
||||
# Set up screenshot service for testing (still needed to create test files)
|
||||
from browser_use.screenshots.service import ScreenshotService
|
||||
|
||||
screenshot_service = ScreenshotService(test_dir)
|
||||
|
||||
# Helper function to store test screenshots
|
||||
async def store_test_screenshot(screenshot_b64: str, step: int) -> str:
|
||||
return await screenshot_service.store_screenshot(screenshot_b64, step)
|
||||
|
||||
# Create a history with mixed screenshots: real and placeholder
|
||||
history_items = []
|
||||
|
||||
# Store test screenshots
|
||||
real_screenshot_1_path = await store_test_screenshot(create_test_screenshot(800, 600, (100, 150, 200)), 2)
|
||||
real_screenshot_2_path = await store_test_screenshot(create_test_screenshot(800, 600, (200, 100, 50)), 4)
|
||||
|
||||
# First item: about:blank placeholder (should be filtered)
|
||||
history_items.append(
|
||||
AgentHistory(
|
||||
@@ -63,7 +76,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
|
||||
),
|
||||
result=[ActionResult()],
|
||||
state=BrowserStateHistory(
|
||||
screenshot=PLACEHOLDER_4PX_SCREENSHOT,
|
||||
screenshot_path=None, # Placeholder doesn't have a file path
|
||||
url='about:blank',
|
||||
title='New Tab',
|
||||
tabs=[TabInfo(page_id=1, url='about:blank', title='New Tab')],
|
||||
@@ -83,7 +96,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
|
||||
),
|
||||
result=[ActionResult()],
|
||||
state=BrowserStateHistory(
|
||||
screenshot=create_test_screenshot(800, 600, (100, 150, 200)),
|
||||
screenshot_path=real_screenshot_1_path,
|
||||
url='https://example.com',
|
||||
title='Example',
|
||||
tabs=[TabInfo(page_id=1, url='https://example.com', title='Example')],
|
||||
@@ -103,7 +116,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
|
||||
),
|
||||
result=[ActionResult()],
|
||||
state=BrowserStateHistory(
|
||||
screenshot=PLACEHOLDER_4PX_SCREENSHOT,
|
||||
screenshot_path=None, # Placeholder doesn't have a file path
|
||||
url='about:blank',
|
||||
title='New Tab',
|
||||
tabs=[TabInfo(page_id=2, url='about:blank', title='New Tab')],
|
||||
@@ -123,7 +136,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
|
||||
),
|
||||
result=[ActionResult()],
|
||||
state=BrowserStateHistory(
|
||||
screenshot=create_test_screenshot(800, 600, (200, 100, 50)),
|
||||
screenshot_path=real_screenshot_2_path,
|
||||
url='https://example.com/page2',
|
||||
title='Page 2',
|
||||
tabs=[TabInfo(page_id=1, url='https://example.com/page2', title='Page 2')],
|
||||
@@ -190,7 +203,7 @@ async def test_gif_handles_all_placeholders(test_dir):
|
||||
),
|
||||
result=[ActionResult()],
|
||||
state=BrowserStateHistory(
|
||||
screenshot=PLACEHOLDER_4PX_SCREENSHOT,
|
||||
screenshot_path=None, # Placeholder doesn't have a file path
|
||||
url='about:blank',
|
||||
title='New Tab',
|
||||
tabs=[TabInfo(page_id=1, url='about:blank', title='New Tab')],
|
||||
|
||||
@@ -85,9 +85,10 @@ async def test_gif_generation_with_real_navigation(httpserver, tmp_path):
|
||||
# Verify history contains real screenshots (not placeholders)
|
||||
has_real_screenshot = False
|
||||
for item in history.history:
|
||||
screenshot_b64 = item.state.get_screenshot()
|
||||
if (
|
||||
item.state.screenshot
|
||||
and item.state.screenshot
|
||||
screenshot_b64
|
||||
and screenshot_b64
|
||||
!= 'iVBORw0KGgoAAAANSUhEUgAAAAQAAAAECAIAAAAmkwkpAAAAFElEQVR4nGP8//8/AwwwMSAB3BwAlm4DBfIlvvkAAAAASUVORK5CYII='
|
||||
):
|
||||
has_real_screenshot = True
|
||||
|
||||
@@ -88,7 +88,7 @@ class TestAgentEventLifecycle:
|
||||
|
||||
assert isinstance(step_event, CreateAgentStepEvent)
|
||||
assert step_event.agent_task_id == task_event.id
|
||||
assert step_event.step == 2 # Step is incremented before event is emitted
|
||||
assert step_event.step == 1 # Step is incremented before event is emitted
|
||||
assert step_event.url == httpserver.url_for('/')
|
||||
|
||||
assert isinstance(update_event, UpdateAgentTaskEvent)
|
||||
|
||||
@@ -279,6 +279,6 @@ class TestCoreFunctionality:
|
||||
assert final_scroll_position > initial_scroll_position, 'Page did not scroll down'
|
||||
|
||||
# Verify the action was executed
|
||||
history = agent.state.history
|
||||
history = agent.history
|
||||
action_names = history.action_names()
|
||||
assert 'scroll_down' in action_names
|
||||
|
||||
Reference in New Issue
Block a user