Merge branch 'main' into local-remote-split

This commit is contained in:
Nick Sweeting
2025-08-04 19:44:05 -07:00
committed by GitHub
22 changed files with 359 additions and 232 deletions

View File

@@ -37,7 +37,7 @@ class UpdateAgentTaskEvent(BaseEvent):
if not hasattr(agent, '_task_start_time'):
raise ValueError('Agent must have _task_start_time attribute')
done_output = agent.state.history.final_result() if agent.state.history else None
done_output = agent.history.final_result() if agent.history else None
return cls(
id=str(agent.task_id),
user_id='', # To be filled by cloud handler
@@ -47,7 +47,7 @@ class UpdateAgentTaskEvent(BaseEvent):
stopped=agent.state.stopped if hasattr(agent.state, 'stopped') else False,
paused=agent.state.paused if hasattr(agent.state, 'paused') else False,
done_output=done_output,
finished_at=datetime.now(timezone.utc) if agent.state.history and agent.state.history.is_done() else None,
finished_at=datetime.now(timezone.utc) if agent.history and agent.history.is_done() else None,
agent_state=agent.state.model_dump() if hasattr(agent.state, 'model_dump') else {},
user_feedback_type=None,
user_comment=None,

View File

@@ -61,28 +61,22 @@ def create_history_gif(
logger.warning('No history to create GIF from')
return
# Get all screenshots from history (including None placeholders)
screenshots = history.screenshots(return_none_if_not_screenshot=True)
if not screenshots:
logger.warning('No screenshots found in history')
return
# Find the first non-placeholder screenshot
# A screenshot is considered a placeholder if:
# 1. It's the exact 4px placeholder for about:blank pages, OR
# 2. It comes from a new tab page (chrome://newtab/, about:blank, etc.)
first_real_screenshot = None
for item in history.history:
if not item.state.screenshot:
continue
# Skip exact placeholder screenshots
if item.state.screenshot == PLACEHOLDER_4PX_SCREENSHOT:
continue
# Skip screenshots from new tab pages
from browser_use.utils import is_new_tab_page
if is_new_tab_page(item.state.url):
continue
# This is a real screenshot from actual web content
first_real_screenshot = item.state.screenshot
break
for screenshot in screenshots:
if screenshot and screenshot != PLACEHOLDER_4PX_SCREENSHOT:
first_real_screenshot = screenshot
break
if not first_real_screenshot:
logger.warning('No valid screenshots found (all are placeholders or from new tab pages)')
@@ -145,8 +139,9 @@ def create_history_gif(
# Find the first non-placeholder screenshot for the task frame
first_real_screenshot = None
for item in history.history:
if item.state.screenshot and item.state.screenshot != PLACEHOLDER_4PX_SCREENSHOT:
first_real_screenshot = item.state.screenshot
screenshot_b64 = item.state.get_screenshot()
if screenshot_b64 and screenshot_b64 != PLACEHOLDER_4PX_SCREENSHOT:
first_real_screenshot = screenshot_b64
break
if first_real_screenshot:
@@ -162,14 +157,14 @@ def create_history_gif(
else:
logger.warning('No real screenshots found for task frame, skipping task frame')
# Process each history item
for i, item in enumerate(history.history, 1):
if not item.state.screenshot:
# Process each history item with its corresponding screenshot
for i, (item, screenshot) in enumerate(zip(history.history, screenshots), 1):
if not screenshot:
continue
# Skip placeholder screenshots from about:blank pages
# These are 4x4 white PNGs encoded as a specific base64 string
if item.state.screenshot == PLACEHOLDER_4PX_SCREENSHOT:
if screenshot == PLACEHOLDER_4PX_SCREENSHOT:
logger.debug(f'Skipping placeholder screenshot from about:blank page at step {i}')
continue
@@ -181,7 +176,7 @@ def create_history_gif(
continue
# Convert base64 screenshot to PIL Image
img_data = base64.b64decode(item.state.screenshot)
img_data = base64.b64decode(screenshot)
image = Image.open(io.BytesIO(img_data))
if show_goals and item.model_output:

View File

@@ -9,7 +9,6 @@ from browser_use.agent.message_manager.views import (
from browser_use.agent.prompts import AgentMessagePrompt
from browser_use.agent.views import (
ActionResult,
AgentHistoryList,
AgentOutput,
AgentStepInfo,
MessageManagerState,
@@ -104,10 +103,8 @@ class MessageManager:
state: MessageManagerState = MessageManagerState(),
use_thinking: bool = True,
include_attributes: list[str] | None = None,
message_context: str | None = None,
sensitive_data: dict[str, str | dict[str, str]] | None = None,
max_history_items: int | None = None,
images_per_step: int = 1,
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
include_tool_call_examples: bool = False,
):
@@ -118,7 +115,6 @@ class MessageManager:
self.sensitive_data_description = ''
self.use_thinking = use_thinking
self.max_history_items = max_history_items
self.images_per_step = images_per_step
self.vision_detail_level = vision_detail_level
self.include_tool_call_examples = include_tool_call_examples
@@ -126,7 +122,6 @@ class MessageManager:
# Store settings as direct attributes instead of in a settings object
self.include_attributes = include_attributes or []
self.message_context = message_context
self.sensitive_data = sensitive_data
self.last_input_messages = []
# Only initialize messages if state is empty
@@ -260,7 +255,6 @@ class MessageManager:
use_vision=True,
page_filtered_actions: str | None = None,
sensitive_data=None,
agent_history_list: AgentHistoryList | None = None, # Pass AgentHistoryList from agent
available_file_paths: list[str] | None = None, # Always pass current available_file_paths
) -> None:
"""Add browser state as human message"""
@@ -269,14 +263,8 @@ class MessageManager:
if sensitive_data:
self.sensitive_data_description = self._get_sensitive_data_description(browser_state_summary.url)
# Extract previous screenshots if we need more than 1 image and have agent history
# Use only the current screenshot
screenshots = []
if agent_history_list and self.images_per_step > 1:
# Get previous screenshots and filter out None values
raw_screenshots = agent_history_list.screenshots(n_last=self.images_per_step - 1, return_none_if_not_screenshot=False)
screenshots = [s for s in raw_screenshots if s is not None]
# add current screenshot to the end
if browser_state_summary.screenshot:
screenshots.append(browser_state_summary.screenshot)

View File

@@ -108,36 +108,6 @@ class AgentMessagePrompt:
self.vision_detail_level = vision_detail_level
assert self.browser_state
@observe_debug(ignore_input=True, ignore_output=True, name='_deduplicate_screenshots')
def _deduplicate_screenshots(self, screenshots: list[str]) -> list[str]:
    """Collapse runs of identical consecutive screenshots to their last entry.

    Args:
        screenshots: Base64-encoded screenshot strings in chronological
            order (oldest first).

    Returns:
        The screenshots in the same order with consecutive duplicates
        removed. The final screenshot is always kept. An empty input yields
        a new empty list; a single-element input is returned as-is (the same
        list object).
    """
    if not screenshots:
        return []

    if len(screenshots) == 1:
        return screenshots

    # Pair every screenshot with its successor and keep it only when the two
    # differ, so the survivor of each run of duplicates is the most recent one.
    unique_screenshots = [
        current for current, following in zip(screenshots, screenshots[1:]) if current != following
    ]
    # The last screenshot has no successor to compare against and is always kept.
    unique_screenshots.append(screenshots[-1])

    return unique_screenshots
@observe_debug(ignore_input=True, ignore_output=True, name='_get_browser_state_description')
def _get_browser_state_description(self) -> str:
elements_text = self.browser_state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)
@@ -277,12 +247,9 @@ Available tabs:
# Start with text description
content_parts: list[ContentPartTextParam | ContentPartImageParam] = [ContentPartTextParam(text=state_description)]
# Deduplicate screenshots, keeping only the most recent of each unique image
unique_screenshots = self._deduplicate_screenshots(self.screenshots)
# Add screenshots with labels
for i, screenshot in enumerate(unique_screenshots):
if i == len(unique_screenshots) - 1:
for i, screenshot in enumerate(self.screenshots):
if i == len(self.screenshots) - 1:
label = 'Current screenshot:'
else:
# Use simple, accurate labeling since we don't have actual step timing info
@@ -302,6 +269,6 @@ Available tabs:
)
)
return UserMessage(content=content_parts)
return UserMessage(content=content_parts, cache=True)
return UserMessage(content=state_description)
return UserMessage(content=state_description, cache=True)

View File

@@ -3,7 +3,6 @@ import gc
import inspect
import json
import logging
import os
import re
import sys
import tempfile
@@ -163,7 +162,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
override_system_message: str | None = None,
extend_system_message: str | None = None,
validate_output: bool = False,
message_context: str | None = None,
generate_gif: bool | str = False,
available_file_paths: list[str] | None = None,
include_attributes: list[str] = DEFAULT_INCLUDE_ATTRIBUTES,
@@ -171,7 +169,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
use_thinking: bool = True,
flash_mode: bool = False,
max_history_items: int = 40,
images_per_step: int = 1,
page_extraction_llm: BaseChatModel | None = None,
planner_llm: BaseChatModel | None = None, # Deprecated
planner_interval: int = 1, # Deprecated
@@ -253,14 +250,12 @@ class Agent(Generic[Context, AgentStructuredOutput]):
override_system_message=override_system_message,
extend_system_message=extend_system_message,
validate_output=validate_output,
message_context=message_context,
generate_gif=generate_gif,
include_attributes=include_attributes,
max_actions_per_step=max_actions_per_step,
use_thinking=use_thinking,
flash_mode=flash_mode,
max_history_items=max_history_items,
images_per_step=images_per_step,
page_extraction_llm=page_extraction_llm,
planner_llm=None, # Always None now (deprecated)
planner_interval=1, # Always 1 now (deprecated)
@@ -281,8 +276,19 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Initialize state
self.state = injected_agent_state or AgentState()
# Initialize file system
# Initialize history
self.history = AgentHistoryList(history=[], usage=None)
# Initialize agent directory
import time
timestamp = int(time.time())
base_tmp = Path(tempfile.gettempdir())
self.agent_directory = base_tmp / f'browser_use_agent_{self.id}_{timestamp}'
# Initialize file system and screenshot service
self._set_file_system(file_system_path)
self._set_screenshot_service()
# Action setup
self._setup_action_models()
@@ -334,10 +340,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
use_thinking=self.settings.use_thinking,
# Settings that were previously in MessageManagerSettings
include_attributes=self.settings.include_attributes,
message_context=self.settings.message_context,
sensitive_data=sensitive_data,
max_history_items=self.settings.max_history_items,
images_per_step=self.settings.images_per_step,
vision_detail_level=self.settings.vision_detail_level,
include_tool_call_examples=self.settings.include_tool_call_examples,
)
@@ -562,10 +566,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.file_system = FileSystem(file_system_path)
self.file_system_path = file_system_path
else:
# create a temporary file system using agent ID
base_tmp = tempfile.gettempdir() # e.g., /tmp on Unix
self.file_system_path = os.path.join(base_tmp, f'browser_use_agent_{self.id}')
self.file_system = FileSystem(self.file_system_path)
# Use the agent directory for file system
self.file_system = FileSystem(self.agent_directory)
self.file_system_path = str(self.agent_directory)
except Exception as e:
logger.error(f'💾 Failed to initialize file system: {e}.')
raise e
@@ -575,6 +578,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
logger.info(f'💾 File system path: {self.file_system_path}')
def _set_screenshot_service(self) -> None:
    """Initialize the screenshot service using the agent directory.

    Sets ``self.screenshot_service`` to a ``ScreenshotService`` constructed
    from ``self.agent_directory``.

    Raises:
        Exception: Any error raised while importing or constructing the
            service is logged and then re-raised unchanged.
    """
    try:
        # Imported inside the method, as the original code does.
        from browser_use.screenshots.service import ScreenshotService

        self.screenshot_service = ScreenshotService(self.agent_directory)
        logger.info(f'📸 Screenshot service initialized in: {self.agent_directory}/screenshots')
    except Exception as e:
        logger.error(f'📸 Failed to initialize screenshot service: {e}.')
        # Bare ``raise`` re-raises the exception currently being handled.
        raise
def save_file_system_state(self) -> None:
"""Save current file system state to agent state"""
if self.file_system:
@@ -583,9 +597,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
logger.error('💾 File system is not set up. Cannot save state.')
raise ValueError('File system is not set up. Cannot save state.')
def _set_message_context(self) -> str | None:
return self.settings.message_context
def _set_browser_use_version_and_source(self, source_override: str | None = None) -> None:
"""Get the version from pyproject.toml and determine the source of the browser-use package"""
# Use the helper function for version detection
@@ -696,20 +707,20 @@ class Agent(Generic[Context, AgentStructuredOutput]):
assert self.browser_session is not None, 'BrowserSession is not set up'
self.logger.debug(f'🌐 Step {self.state.n_steps + 1}: Getting browser state...')
self.logger.debug(f'🌐 Step {self.state.n_steps}: Getting browser state...')
browser_state_summary = await self.browser_session.get_browser_state_with_recovery(
cache_clickable_elements_hashes=True, include_screenshot=self.settings.use_vision
)
current_page = await self.browser_session.get_current_page()
# Check for new downloads after getting browser state (catches PDF auto-downloads and previous step downloads)
await self._check_and_update_downloads(f'Step {self.state.n_steps + 1}: after getting browser state')
await self._check_and_update_downloads(f'Step {self.state.n_steps}: after getting browser state')
self._log_step_context(current_page, browser_state_summary)
await self._raise_if_stopped_or_paused()
# Update action models with page-specific actions
self.logger.debug(f'📝 Step {self.state.n_steps + 1}: Updating action models...')
self.logger.debug(f'📝 Step {self.state.n_steps}: Updating action models...')
await self._update_action_models_for_page(current_page)
# Get page-specific filtered actions
@@ -720,7 +731,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
page_action_message = f'For this page, these additional actions are available:\n{page_filtered_actions}'
self._message_manager._add_message_with_type(UserMessage(content=page_action_message), 'consistent')
self.logger.debug(f'💬 Step {self.state.n_steps + 1}: Adding state message to context...')
self.logger.debug(f'💬 Step {self.state.n_steps}: Adding state message to context...')
self._message_manager.add_state_message(
browser_state_summary=browser_state_summary,
model_output=self.state.last_model_output,
@@ -729,7 +740,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
use_vision=self.settings.use_vision,
page_filtered_actions=page_filtered_actions if page_filtered_actions else None,
sensitive_data=self.sensitive_data,
agent_history_list=self.state.history, # Pass AgentHistoryList for screenshots
available_file_paths=self.available_file_paths, # Always pass current available_file_paths
)
@@ -741,7 +751,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
"""Execute LLM interaction with retry logic and handle callbacks"""
input_messages = self._message_manager.get_messages()
self.logger.debug(
f'🤖 Step {self.state.n_steps + 1}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...'
f'🤖 Step {self.state.n_steps}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...'
)
try:
@@ -758,9 +768,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Check again for paused/stopped state after getting model output
await self._raise_if_stopped_or_paused()
# Increment step counter at the start of each step
self.state.n_steps += 1
# Handle callbacks and conversation saving
await self._handle_post_llm_processing(browser_state_summary, input_messages)
@@ -854,7 +861,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
)
# Use _make_history_item like main branch
self._make_history_item(self.state.last_model_output, browser_state_summary, self.state.last_result, metadata)
await self._make_history_item(self.state.last_model_output, browser_state_summary, self.state.last_result, metadata)
# Log step completion summary
self._log_step_completion_summary(self.step_start_time, self.state.last_result)
@@ -877,6 +884,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
)
self.eventbus.dispatch(step_event)
# Increment step counter after step is fully completed
self.state.n_steps += 1
async def _handle_final_step(self, step_info: AgentStepInfo | None = None) -> None:
"""Handle special processing for the last step"""
if step_info and step_info.is_last_step():
@@ -893,7 +903,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
"""Get model output with retry logic for empty actions"""
model_output = await self.get_model_output(input_messages)
self.logger.debug(
f'✅ Step {self.state.n_steps + 1}: Got LLM response with {len(model_output.action) if model_output.action else 0} actions'
f'✅ Step {self.state.n_steps}: Got LLM response with {len(model_output.action) if model_output.action else 0} actions'
)
if (
@@ -947,7 +957,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.settings.save_conversation_path_encoding,
)
def _make_history_item(
async def _make_history_item(
self,
model_output: AgentOutput | None,
browser_state_summary: BrowserStateSummary,
@@ -961,12 +971,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
else:
interacted_elements = [None]
# Store screenshot and get path
screenshot_path = None
if browser_state_summary.screenshot:
screenshot_path = await self.screenshot_service.store_screenshot(browser_state_summary.screenshot, self.state.n_steps)
state_history = BrowserStateHistory(
url=browser_state_summary.url,
title=browser_state_summary.title,
tabs=browser_state_summary.tabs,
interacted_element=interacted_elements,
screenshot=browser_state_summary.screenshot,
screenshot_path=screenshot_path,
)
history_item = AgentHistory(
@@ -976,7 +991,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
metadata=metadata,
)
self.state.history.history.append(history_item)
self.history.add_item(history_item)
def _remove_think_tags(self, text: str) -> str:
THINK_TAGS = re.compile(r'<think>.*?</think>', re.DOTALL)
@@ -1021,7 +1036,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
url_short = current_page.url[:50] + '...' if len(current_page.url) > 50 else current_page.url
interactive_count = len(browser_state_summary.selector_map) if browser_state_summary else 0
self.logger.info(
f'📍 Step {self.state.n_steps + 1}: Evaluating page with {interactive_count} interactive elements on: {url_short}'
f'📍 Step {self.state.n_steps}: Evaluating page with {interactive_count} interactive elements on: {url_short}'
)
def _log_next_action_summary(self, parsed: 'AgentOutput') -> None:
@@ -1094,7 +1109,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Prepare action_history data correctly
action_history_data = []
for item in self.state.history.history:
for item in self.history.history:
if item.model_output and item.model_output.action:
# Convert each ActionModel in the step to its dictionary representation
step_actions = [
@@ -1107,7 +1122,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Append None or [] if a step had no actions or no model output
action_history_data.append(None)
final_res = self.state.history.final_result()
final_res = self.history.final_result()
final_result_str = json.dumps(final_res) if final_res is not None else None
self.telemetry.capture(
@@ -1125,13 +1140,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
cdp_url=urlparse(self.browser_session.cdp_url).hostname
if self.browser_session and self.browser_session.cdp_url
else None,
action_errors=self.state.history.errors(),
action_errors=self.history.errors(),
action_history=action_history_data,
urls_visited=self.state.history.urls(),
urls_visited=self.history.urls(),
steps=self.state.n_steps,
total_input_tokens=token_summary.prompt_tokens,
total_duration_seconds=self.state.history.total_duration_seconds(),
success=self.state.history.is_successful(),
total_duration_seconds=self.history.total_duration_seconds(),
success=self.history.is_successful(),
final_result_response=final_result_str,
error_message=agent_run_error,
)
@@ -1145,13 +1160,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
"""
await self.step(step_info)
if self.state.history.is_done():
if self.history.is_done():
await self.log_completion()
if self.register_done_callback:
if inspect.iscoroutinefunction(self.register_done_callback):
await self.register_done_callback(self.state.history)
await self.register_done_callback(self.history)
else:
self.register_done_callback(self.state.history)
self.register_done_callback(self.history)
return True, True
return False, False
@@ -1271,22 +1286,22 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if on_step_end is not None:
await on_step_end(self)
if self.state.history.is_done():
if self.history.is_done():
self.logger.debug(f'🎯 Task completed after {step + 1} steps!')
await self.log_completion()
if self.register_done_callback:
if inspect.iscoroutinefunction(self.register_done_callback):
await self.register_done_callback(self.state.history)
await self.register_done_callback(self.history)
else:
self.register_done_callback(self.state.history)
self.register_done_callback(self.history)
# Task completed
break
else:
agent_run_error = 'Failed to complete task in maximum steps'
self.state.history.history.append(
self.history.add_item(
AgentHistory(
model_output=None,
result=[ActionResult(error=agent_run_error, include_in_memory=True)],
@@ -1295,7 +1310,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
title='',
tabs=[],
interacted_element=[],
screenshot=None,
screenshot_path=None,
),
metadata=None,
)
@@ -1304,23 +1319,23 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.info(f'{agent_run_error}')
self.logger.debug('📊 Collecting usage summary...')
self.state.history.usage = await self.token_cost_service.get_usage_summary()
self.history.usage = await self.token_cost_service.get_usage_summary()
# set the model output schema and call it on the fly
if self.state.history._output_model_schema is None and self.output_model_schema is not None:
self.state.history._output_model_schema = self.output_model_schema
if self.history._output_model_schema is None and self.output_model_schema is not None:
self.history._output_model_schema = self.output_model_schema
self.logger.debug('🏁 Agent.run() completed successfully')
return self.state.history
return self.history
except KeyboardInterrupt:
# Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
self.logger.info('Got KeyboardInterrupt during execution, returning current history')
agent_run_error = 'KeyboardInterrupt'
self.state.history.usage = await self.token_cost_service.get_usage_summary()
self.history.usage = await self.token_cost_service.get_usage_summary()
return self.state.history
return self.history
except Exception as e:
self.logger.error(f'Agent run failed with exception: {e}', exc_info=True)
@@ -1359,7 +1374,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Lazy import gif module to avoid heavy startup cost
from browser_use.agent.gif import create_history_gif
create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
create_history_gif(task=self.task, history=self.history, output_path=output_path)
# Only emit output file event if GIF was actually created
if Path(output_path).exists():
@@ -1484,7 +1499,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
async def log_completion(self) -> None:
"""Log the completion of the task"""
if self.state.history.is_successful():
if self.history.is_successful():
self.logger.info('✅ Task completed successfully')
else:
self.logger.info('❌ Task completed without success')
@@ -1618,7 +1633,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
"""Save the history to a file"""
if not file_path:
file_path = 'AgentHistory.json'
self.state.history.save_to_file(file_path)
self.history.save_to_file(file_path)
async def wait_until_resumed(self):
await self._external_pause_event.wait()
@@ -1756,14 +1771,14 @@ class Agent(Generic[Context, AgentStructuredOutput]):
timestamp = datetime.now().isoformat()
# Only declare variables that are used multiple times
structured_output = self.state.history.structured_output
structured_output = self.history.structured_output
structured_output_json = json.dumps(structured_output.model_dump()) if structured_output else None
final_result = self.state.history.final_result()
final_result = self.history.final_result()
git_info = get_git_info()
action_history = self.state.history.action_history()
action_errors = self.state.history.errors()
urls = self.state.history.urls()
usage = self.state.history.usage
action_history = self.history.action_history()
action_errors = self.history.errors()
urls = self.history.urls()
usage = self.history.usage
return {
'trace': {
@@ -1790,10 +1805,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
'final_result_response_truncated': (
final_result[:20000] if final_result and len(final_result) > 20000 else final_result
),
'self_report_completed': 1 if self.state.history.is_done() else 0,
'self_report_success': 1 if self.state.history.is_successful() else 0,
'duration': self.state.history.total_duration_seconds(),
'steps_taken': self.state.history.number_of_steps(),
'self_report_completed': 1 if self.history.is_done() else 0,
'self_report_success': 1 if self.history.is_successful() else 0,
'duration': self.history.total_duration_seconds(),
'steps_taken': self.history.number_of_steps(),
'usage': json.dumps(usage.model_dump()) if usage else None,
},
'trace_details': {
@@ -1805,6 +1820,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# AgentHistoryList methods
'structured_output': structured_output_json,
'final_result_response': final_result,
'complete_history': _get_complete_history_without_screenshots(self.state.history.model_dump()),
'complete_history': _get_complete_history_without_screenshots(self.history.model_dump()),
},
}

View File

@@ -36,7 +36,6 @@ class AgentSettings(BaseModel):
max_failures: int = 3
retry_delay: int = 10
validate_output: bool = False
message_context: str | None = None
generate_gif: bool | str = False
override_system_message: str | None = None
extend_system_message: str | None = None
@@ -56,7 +55,6 @@ class AgentSettings(BaseModel):
use_thinking: bool = True
flash_mode: bool = False # If enabled, disables evaluation_previous_goal and next_goal, and sets use_thinking = False
max_history_items: int = 40
images_per_step: int = 1
page_extraction_llm: BaseChatModel | None = None
planner_llm: BaseChatModel | None = None
@@ -76,7 +74,6 @@ class AgentState(BaseModel):
n_steps: int = 1
consecutive_failures: int = 0
last_result: list[ActionResult] | None = None
history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[], usage=None))
last_plan: str | None = None
last_model_output: AgentOutput | None = None
paused: bool = False
@@ -329,6 +326,10 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
"""Representation of the AgentHistoryList object"""
return f'AgentHistoryList(all_results={self.action_results()}, all_model_outputs={self.model_actions()})'
def add_item(self, history_item: AgentHistory) -> None:
"""Add a history item to the list"""
self.history.append(history_item)
def __repr__(self) -> str:
"""Representation of the AgentHistoryList object"""
return self.__str__()
@@ -443,20 +444,39 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
"""Get all unique URLs from history"""
return [h.state.url if h.state.url is not None else None for h in self.history]
def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
"""Get all screenshots from history"""
def screenshot_paths(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
"""Get all screenshot paths from history"""
if n_last == 0:
return []
if n_last is None:
if return_none_if_not_screenshot:
return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history]
return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self.history]
else:
return [h.state.screenshot for h in self.history if h.state.screenshot is not None]
return [h.state.screenshot_path for h in self.history if h.state.screenshot_path is not None]
else:
if return_none_if_not_screenshot:
return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history[-n_last:]]
return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self.history[-n_last:]]
else:
return [h.state.screenshot for h in self.history[-n_last:] if h.state.screenshot is not None]
return [h.state.screenshot_path for h in self.history[-n_last:] if h.state.screenshot_path is not None]
def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
"""Get all screenshots from history as base64 strings"""
if n_last == 0:
return []
history_items = self.history if n_last is None else self.history[-n_last:]
screenshots = []
for item in history_items:
screenshot_b64 = item.state.get_screenshot()
if screenshot_b64:
screenshots.append(screenshot_b64)
else:
if return_none_if_not_screenshot:
screenshots.append(None)
# If return_none_if_not_screenshot is False, we skip None values
return screenshots
def action_names(self) -> list[str]:
"""Get all action names from history"""

View File

@@ -81,12 +81,31 @@ class BrowserStateHistory:
title: str
tabs: list[TabInfo]
interacted_element: list[DOMHistoryElement | None] | list[None]
screenshot: str | None = None
screenshot_path: str | None = None
def get_screenshot(self) -> str | None:
"""Load screenshot from disk and return as base64 string"""
if not self.screenshot_path:
return None
import base64
from pathlib import Path
path_obj = Path(self.screenshot_path)
if not path_obj.exists():
return None
try:
with open(path_obj, 'rb') as f:
screenshot_data = f.read()
return base64.b64encode(screenshot_data).decode('utf-8')
except Exception:
return None
def to_dict(self) -> dict[str, Any]:
data = {}
data['tabs'] = [tab.model_dump() for tab in self.tabs]
data['screenshot'] = self.screenshot
data['screenshot_path'] = self.screenshot_path
data['interacted_element'] = [el.to_dict() if el else None for el in self.interacted_element]
data['url'] = self.url
data['title'] = self.title

View File

@@ -815,18 +815,18 @@ class BrowserUseApp(App):
# Show token usage statistics if agent exists and has history
if self.agent and hasattr(self.agent, 'state') and hasattr(self.agent.state, 'history'):
# Get total tokens used
# total_tokens = self.agent.state.history.total_input_tokens()
# total_tokens = self.agent.history.total_input_tokens()
# model_info.write(f'[white]Input tokens:[/] [green]{total_tokens:,}[/]')
# Calculate tokens per step
num_steps = len(self.agent.state.history.history)
num_steps = len(self.agent.history.history)
# if num_steps > 0:
# avg_tokens_per_step = total_tokens / num_steps
# model_info.write(f'[white]Avg tokens/step:[/] [green]{avg_tokens_per_step:,.1f}[/]')
# Get the last step metadata to show the most recent LLM response time
if num_steps > 0 and self.agent.state.history.history[-1].metadata:
last_step = self.agent.state.history.history[-1]
if num_steps > 0 and self.agent.history.history[-1].metadata:
last_step = self.agent.history.history[-1]
if last_step.metadata:
step_duration = last_step.metadata.duration_seconds
else:
@@ -838,7 +838,7 @@ class BrowserUseApp(App):
# model_info.write(f'[white]Avg tokens/sec:[/] [magenta]{tokens_per_second:.1f}[/]')
# Show total duration
total_duration = self.agent.state.history.total_duration_seconds()
total_duration = self.agent.history.total_duration_seconds()
if total_duration > 0:
model_info.write(f'[white]Total Duration:[/] [magenta]{total_duration:.2f}s[/]')
@@ -891,7 +891,7 @@ class BrowserUseApp(App):
# Get all agent history items
history_items = []
if hasattr(self.agent, 'state') and hasattr(self.agent.state, 'history'):
history_items = self.agent.state.history.history
history_items = self.agent.history.history
if history_items:
tasks_info.write('[bold yellow]STEPS:[/]')

View File

@@ -371,6 +371,7 @@ class ChatGoogle(BaseChatModel):
key == 'properties'
and isinstance(cleaned_value, dict)
and len(cleaned_value) == 0
and isinstance(obj.get('type', ''), str)
and obj.get('type', '').upper() == 'OBJECT'
):
# Convert empty object to have at least one property
@@ -380,7 +381,8 @@ class ChatGoogle(BaseChatModel):
# If this is an object type with empty properties, add a placeholder
if (
cleaned.get('type', '').upper() == 'OBJECT'
isinstance(cleaned.get('type', ''), str)
and cleaned.get('type', '').upper() == 'OBJECT'
and 'properties' in cleaned
and isinstance(cleaned['properties'], dict)
and len(cleaned['properties']) == 0

View File

@@ -0,0 +1 @@
# Screenshots package for browser-use

View File

@@ -0,0 +1,48 @@
"""
Screenshot storage service for browser-use agents.
"""
import base64
from pathlib import Path
import anyio
class ScreenshotService:
"""Simple screenshot storage service that saves screenshots to disk"""
def __init__(self, agent_directory: str | Path):
"""Initialize with agent directory path"""
self.agent_directory = Path(agent_directory) if isinstance(agent_directory, str) else agent_directory
# Create screenshots subdirectory
self.screenshots_dir = self.agent_directory / 'screenshots'
self.screenshots_dir.mkdir(parents=True, exist_ok=True)
async def store_screenshot(self, screenshot_b64: str, step_number: int) -> str:
"""Store screenshot to disk and return the full path as string"""
screenshot_filename = f'step_{step_number}.png'
screenshot_path = self.screenshots_dir / screenshot_filename
# Decode base64 and save to disk
screenshot_data = base64.b64decode(screenshot_b64)
async with await anyio.open_file(screenshot_path, 'wb') as f:
await f.write(screenshot_data)
return str(screenshot_path)
async def get_screenshot(self, screenshot_path: str) -> str | None:
"""Load screenshot from disk path and return as base64"""
if not screenshot_path:
return None
path = Path(screenshot_path)
if not path.exists():
return None
# Load from disk and encode to base64
async with await anyio.open_file(path, 'rb') as f:
screenshot_data = await f.read()
return base64.b64encode(screenshot_data).decode('utf-8')

View File

@@ -133,7 +133,7 @@ history = await agent.run()
# Access (some) useful information
history.urls() # List of visited URLs
history.screenshots() # List of screenshot paths
history.screenshot_paths() # List of screenshot paths
history.action_names() # Names of executed actions
history.extracted_content() # Content extracted during execution
history.errors() # Any errors that occurred
@@ -173,60 +173,12 @@ agent = Agent(
)
```
## Run with message context
You can configure the agent and provide a separate message to help the LLM understand the task better.
```python
from browser_use.llm import ChatOpenAI
agent = Agent(
task="your task",
message_context="Additional information about the task",
llm = ChatOpenAI(model='gpt-4o')
)
```
## Run with planner model
You can configure the agent to use a separate planner model for high-level task planning:
```python
from browser_use.llm import ChatOpenAI
# Initialize models
llm = ChatOpenAI(model='gpt-4o')
planner_llm = ChatOpenAI(model='o3-mini')
agent = Agent(
task="your task",
llm=llm,
planner_llm=planner_llm, # Separate model for planning
use_vision_for_planner=False, # Disable vision for planner
planner_interval=4 # Plan every 4 steps
)
```
### Planner Parameters
- `planner_llm`: A chat model instance used for high-level task planning. Can be a smaller/cheaper model than the main LLM.
- `use_vision_for_planner`: Enable/disable vision capabilities for the planner model. Defaults to `True`.
- `planner_interval`: Number of steps between planning phases. Defaults to `1`.
Using a separate planner model can help:
- Reduce costs by using a smaller model for high-level planning
- Improve task decomposition and strategic thinking
- Better handle complex, multi-step tasks
<Note>
The planner model is optional. If not specified, the agent will not use the
planner model.
</Note>
### Optional Parameters
- `message_context`: Additional information about the task to help the LLM understand the task better.
- `initial_actions`: List of initial actions to run before the main task.
- `max_actions_per_step`: Maximum number of actions to run in a step. Defaults to `10`.
- `max_failures`: Maximum number of failures before giving up. Defaults to `3`.

View File

@@ -41,7 +41,7 @@ async def my_step_hook(agent: Agent):
# https://playwright.dev/python/docs/api/class-page
current_url = page.url
visit_log = agent.state.history.urls()
visit_log = agent.history.urls()
previous_url = visit_log[-2] if len(visit_log) >= 2 else None
print(f"Agent was last on URL: {previous_url} and is now on {current_url}")
@@ -91,11 +91,12 @@ When working with agent hooks, you have access to the entire `Agent` instance. H
- `agent.settings` contains all the configuration options passed to the `Agent(...)` at init time
- `agent.llm` gives direct access to the main LLM object (e.g. `ChatOpenAI`)
- `agent.state` gives access to lots of internal state, including agent thoughts, outputs, actions, etc.
- `agent.state.history.model_thoughts()`: Reasoning from Browser Use's model.
- `agent.state.history.model_outputs()`: Raw outputs from the Browsre Use's model.
- `agent.state.history.model_actions()`: Actions taken by the agent
- `agent.state.history.extracted_content()`: Content extracted from web pages
- `agent.state.history.urls()`: URLs visited by the agent
- `agent.history` gives access to historical data from the agent's execution:
- `agent.history.model_thoughts()`: Reasoning from Browser Use's model.
- `agent.history.model_outputs()`: Raw outputs from Browser Use's model.
- `agent.history.model_actions()`: Actions taken by the agent
- `agent.history.extracted_content()`: Content extracted from web pages
- `agent.history.urls()`: URLs visited by the agent
- `agent.browser_session` gives direct access to the `BrowserSession()` and playwright objects
- `agent.browser_session.get_current_page()`: Get the current playwright `Page` object the agent is focused on
- `agent.browser_session.browser_context`: Get the current playwright `BrowserContext` object

View File

@@ -154,7 +154,7 @@ async def record_activity(agent_obj):
print('--> History:')
# Assert agent has state to satisfy type checker
assert hasattr(agent_obj, 'state'), 'Agent must have state attribute'
history = agent_obj.state.history
history = agent_obj.history
model_thoughts = obj_to_json(obj=history.model_thoughts(), check_circular=False)
@@ -164,7 +164,7 @@ async def record_activity(agent_obj):
# prettyprinter.cpprint(model_thoughts_last_elem)
# print("--- MODEL OUTPUT ACTION ---")
model_outputs = agent_obj.state.history.model_outputs()
model_outputs = agent_obj.history.model_outputs()
model_outputs_json = obj_to_json(obj=model_outputs, check_circular=False)
if len(model_outputs_json) > 0:
@@ -172,7 +172,7 @@ async def record_activity(agent_obj):
# prettyprinter.cpprint(model_outputs_json_last_elem)
# print("--- MODEL INTERACTED ELEM ---")
model_actions = agent_obj.state.history.model_actions()
model_actions = agent_obj.history.model_actions()
model_actions_json = obj_to_json(obj=model_actions, check_circular=False)
if len(model_actions_json) > 0:
@@ -180,14 +180,14 @@ async def record_activity(agent_obj):
# prettyprinter.cpprint(model_actions_json_last_elem)
# print("--- EXTRACTED CONTENT ---")
extracted_content = agent_obj.state.history.extracted_content()
extracted_content = agent_obj.history.extracted_content()
extracted_content_json = obj_to_json(obj=extracted_content, check_circular=False)
if len(extracted_content_json) > 0:
extracted_content_json_last_elem = extracted_content_json[-1]
# prettyprinter.cpprint(extracted_content_json_last_elem)
# print("--- URLS ---")
urls = agent_obj.state.history.urls()
urls = agent_obj.history.urls()
# prettyprinter.cpprint(urls)
urls_json = obj_to_json(obj=urls, check_circular=False)

View File

@@ -47,8 +47,6 @@ async def main():
if done and valid:
break
agent_state.history.history = []
# Save state to file
async with await anyio.open_file('agent_state.json', 'w') as f:
serialized = agent_state.model_dump_json(exclude={'history'})

View File

@@ -28,7 +28,7 @@ async def main():
task='What should we pay attention to in the recent new rules on tariffs in China-US trade?',
llm=llm,
use_vision=False,
message_context=extend_system_message,
extend_system_message=extend_system_message,
)
await agent.run()

View File

@@ -2,7 +2,7 @@
name = "browser-use"
description = "Make websites accessible for AI agents"
authors = [{ name = "Gregor Zunic" }]
version = "0.5.7"
version = "0.5.9"
readme = "README.md"
requires-python = ">=3.11,<4.0"
classifiers = [

View File

@@ -0,0 +1,107 @@
"""
Test to reproduce and verify fix for GitHub issue #2470:
"Python field with name 'type' handled differently between Gemini and OpenAI GPT"
"""
from browser_use.llm.google.chat import ChatGoogle
from browser_use.llm.schema import SchemaOptimizer
class TestGeminiTypeFieldHandling:
    """Regression tests for ChatGoogle._fix_gemini_schema when a schema's 'type' is not a plain string."""

    @staticmethod
    def _fix_with_gemini(schema):
        """Run *schema* through _fix_gemini_schema on a freshly constructed ChatGoogle."""
        return ChatGoogle(model='gemini-2.0-flash-exp')._fix_gemini_schema(schema)

    def test_gemini_schema_with_dict_type_field(self):
        """
        A dict-valued top-level 'type' must come back unchanged rather than raise.

        Guards against AttributeError: 'dict' object has no attribute 'upper'.
        """
        # 'type' holds a dict where a string would normally be
        fixed = self._fix_with_gemini({'type': {'malformed': 'dict_type'}, 'properties': {}})

        assert fixed is not None
        assert isinstance(fixed, dict)
        assert fixed['type'] == {'malformed': 'dict_type'}

    def test_gemini_schema_with_nested_dict_type_field(self):
        """
        A dict-valued 'type' on a nested property must come back unchanged as well.
        """
        inner = {'type': {'malformed': 'dict_instead_of_string'}, 'properties': {}}
        outer = {
            'type': 'object',
            'properties': {'nested_field': inner},
        }

        fixed = self._fix_with_gemini(outer)

        assert fixed is not None
        assert isinstance(fixed, dict)
        assert fixed['properties']['nested_field']['type'] == {'malformed': 'dict_instead_of_string'}

    def test_gemini_schema_with_none_type_field(self):
        """A nested 'type' of None must not break schema processing."""
        outer = {'type': 'object', 'properties': {'nested_field': {'type': None, 'properties': {}}}}

        assert self._fix_with_gemini(outer) is not None

    def test_gemini_schema_with_valid_string_type(self):
        """Ordinary string 'type' values keep working."""
        outer = {'type': 'object', 'properties': {'nested_field': {'type': 'object', 'properties': {}}}}

        # Expected to complete without raising
        fixed = self._fix_with_gemini(outer)

        assert fixed is not None
        assert isinstance(fixed, dict)

    def test_gemini_schema_with_empty_properties_object(self):
        """An object-typed property with empty 'properties' receives a string '_placeholder' entry."""
        empty_object = {
            'type': 'object',
            'properties': {},  # empty on purpose: this is what should gain the placeholder
        }
        outer = {
            'type': 'object',
            'properties': {'empty_object': empty_object},
        }

        fixed = self._fix_with_gemini(outer)

        placeholder_holder = fixed['properties']['empty_object']['properties']
        assert '_placeholder' in placeholder_holder
        assert placeholder_holder['_placeholder']['type'] == 'string'

    def test_consistency_between_providers(self):
        """
        A schema produced for OpenAI must also be processable by the Gemini fixer.

        The original report was that Gemini failed on input that OpenAI accepted.
        """
        from pydantic import BaseModel, Field

        # Model with a dict-typed field, used to generate the schema under test
        class TestModel(BaseModel):
            field_with_dict_type: dict = Field(default_factory=dict)

        # OpenAI path: the optimized schema is used directly
        openai_schema = SchemaOptimizer.create_optimized_json_schema(TestModel)
        assert openai_schema is not None

        # Gemini path: the same schema additionally goes through _fix_gemini_schema
        gemini_result = self._fix_with_gemini(openai_schema)
        assert gemini_result is not None

        # Reaching this point without an exception on either path is the
        # consistency being checked here.

View File

@@ -9,7 +9,7 @@ from PIL import Image
from browser_use import AgentHistoryList
from browser_use.agent.gif import create_history_gif
from browser_use.agent.views import ActionResult, AgentHistory, AgentOutput
from browser_use.browser.views import PLACEHOLDER_4PX_SCREENSHOT, BrowserStateHistory, TabInfo
from browser_use.browser.views import BrowserStateHistory, TabInfo
@pytest.fixture
@@ -49,9 +49,22 @@ def create_test_screenshot(width: int = 800, height: int = 600, color: tuple = (
async def test_gif_filters_out_placeholder_screenshots(test_dir):
"""Test that 4px placeholder screenshots from about:blank pages are filtered out of GIFs."""
# Set up screenshot service for testing (still needed to create test files)
from browser_use.screenshots.service import ScreenshotService
screenshot_service = ScreenshotService(test_dir)
# Helper function to store test screenshots
async def store_test_screenshot(screenshot_b64: str, step: int) -> str:
return await screenshot_service.store_screenshot(screenshot_b64, step)
# Create a history with mixed screenshots: real and placeholder
history_items = []
# Store test screenshots
real_screenshot_1_path = await store_test_screenshot(create_test_screenshot(800, 600, (100, 150, 200)), 2)
real_screenshot_2_path = await store_test_screenshot(create_test_screenshot(800, 600, (200, 100, 50)), 4)
# First item: about:blank placeholder (should be filtered)
history_items.append(
AgentHistory(
@@ -63,7 +76,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
),
result=[ActionResult()],
state=BrowserStateHistory(
screenshot=PLACEHOLDER_4PX_SCREENSHOT,
screenshot_path=None, # Placeholder doesn't have a file path
url='about:blank',
title='New Tab',
tabs=[TabInfo(page_id=1, url='about:blank', title='New Tab')],
@@ -83,7 +96,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
),
result=[ActionResult()],
state=BrowserStateHistory(
screenshot=create_test_screenshot(800, 600, (100, 150, 200)),
screenshot_path=real_screenshot_1_path,
url='https://example.com',
title='Example',
tabs=[TabInfo(page_id=1, url='https://example.com', title='Example')],
@@ -103,7 +116,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
),
result=[ActionResult()],
state=BrowserStateHistory(
screenshot=PLACEHOLDER_4PX_SCREENSHOT,
screenshot_path=None, # Placeholder doesn't have a file path
url='about:blank',
title='New Tab',
tabs=[TabInfo(page_id=2, url='about:blank', title='New Tab')],
@@ -123,7 +136,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
),
result=[ActionResult()],
state=BrowserStateHistory(
screenshot=create_test_screenshot(800, 600, (200, 100, 50)),
screenshot_path=real_screenshot_2_path,
url='https://example.com/page2',
title='Page 2',
tabs=[TabInfo(page_id=1, url='https://example.com/page2', title='Page 2')],
@@ -190,7 +203,7 @@ async def test_gif_handles_all_placeholders(test_dir):
),
result=[ActionResult()],
state=BrowserStateHistory(
screenshot=PLACEHOLDER_4PX_SCREENSHOT,
screenshot_path=None, # Placeholder doesn't have a file path
url='about:blank',
title='New Tab',
tabs=[TabInfo(page_id=1, url='about:blank', title='New Tab')],

View File

@@ -85,9 +85,10 @@ async def test_gif_generation_with_real_navigation(httpserver, tmp_path):
# Verify history contains real screenshots (not placeholders)
has_real_screenshot = False
for item in history.history:
screenshot_b64 = item.state.get_screenshot()
if (
item.state.screenshot
and item.state.screenshot
screenshot_b64
and screenshot_b64
!= 'iVBORw0KGgoAAAANSUhEUgAAAAQAAAAECAIAAAAmkwkpAAAAFElEQVR4nGP8//8/AwwwMSAB3BwAlm4DBfIlvvkAAAAASUVORK5CYII='
):
has_real_screenshot = True

View File

@@ -88,7 +88,7 @@ class TestAgentEventLifecycle:
assert isinstance(step_event, CreateAgentStepEvent)
assert step_event.agent_task_id == task_event.id
assert step_event.step == 2 # Step is incremented before event is emitted
assert step_event.step == 1 # Step is incremented before event is emitted
assert step_event.url == httpserver.url_for('/')
assert isinstance(update_event, UpdateAgentTaskEvent)

View File

@@ -279,6 +279,6 @@ class TestCoreFunctionality:
assert final_scroll_position > initial_scroll_position, 'Page did not scroll down'
# Verify the action was executed
history = agent.state.history
history = agent.history
action_names = history.action_names()
assert 'scroll_down' in action_names