diff --git a/browser_use/agent/cloud_events.py b/browser_use/agent/cloud_events.py
index adca25a5f..7c9060e67 100644
--- a/browser_use/agent/cloud_events.py
+++ b/browser_use/agent/cloud_events.py
@@ -37,7 +37,7 @@ class UpdateAgentTaskEvent(BaseEvent):
if not hasattr(agent, '_task_start_time'):
raise ValueError('Agent must have _task_start_time attribute')
- done_output = agent.state.history.final_result() if agent.state.history else None
+ done_output = agent.history.final_result() if agent.history else None
return cls(
id=str(agent.task_id),
user_id='', # To be filled by cloud handler
@@ -47,7 +47,7 @@ class UpdateAgentTaskEvent(BaseEvent):
stopped=agent.state.stopped if hasattr(agent.state, 'stopped') else False,
paused=agent.state.paused if hasattr(agent.state, 'paused') else False,
done_output=done_output,
- finished_at=datetime.now(timezone.utc) if agent.state.history and agent.state.history.is_done() else None,
+ finished_at=datetime.now(timezone.utc) if agent.history and agent.history.is_done() else None,
agent_state=agent.state.model_dump() if hasattr(agent.state, 'model_dump') else {},
user_feedback_type=None,
user_comment=None,
diff --git a/browser_use/agent/gif.py b/browser_use/agent/gif.py
index 9d5798fdd..31826efb6 100644
--- a/browser_use/agent/gif.py
+++ b/browser_use/agent/gif.py
@@ -61,28 +61,22 @@ def create_history_gif(
logger.warning('No history to create GIF from')
return
+ # Get all screenshots from history (including None placeholders)
+ screenshots = history.screenshots(return_none_if_not_screenshot=True)
+
+ if not screenshots:
+ logger.warning('No screenshots found in history')
+ return
+
# Find the first non-placeholder screenshot
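- # A screenshot is considered a placeholder if:
- # 1. It's the exact 4px placeholder for about:blank pages, OR
- # 2. It comes from a new tab page (chrome://newtab/, about:blank, etc.)
+ # A screenshot is considered a placeholder only if it is the exact 4px
+ # placeholder for about:blank pages; this loop does not look at the page
+ # URL, so screenshots taken on new tab pages are not skipped here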
first_real_screenshot = None
- for item in history.history:
- if not item.state.screenshot:
- continue
-
- # Skip exact placeholder screenshots
- if item.state.screenshot == PLACEHOLDER_4PX_SCREENSHOT:
- continue
-
- # Skip screenshots from new tab pages
- from browser_use.utils import is_new_tab_page
-
- if is_new_tab_page(item.state.url):
- continue
-
- # This is a real screenshot from actual web content
- first_real_screenshot = item.state.screenshot
- break
+ for screenshot in screenshots:
+ if screenshot and screenshot != PLACEHOLDER_4PX_SCREENSHOT:
+ first_real_screenshot = screenshot
+ break
if not first_real_screenshot:
logger.warning('No valid screenshots found (all are placeholders or from new tab pages)')
@@ -145,8 +139,9 @@ def create_history_gif(
# Find the first non-placeholder screenshot for the task frame
first_real_screenshot = None
for item in history.history:
- if item.state.screenshot and item.state.screenshot != PLACEHOLDER_4PX_SCREENSHOT:
- first_real_screenshot = item.state.screenshot
+ screenshot_b64 = item.state.get_screenshot()
+ if screenshot_b64 and screenshot_b64 != PLACEHOLDER_4PX_SCREENSHOT:
+ first_real_screenshot = screenshot_b64
break
if first_real_screenshot:
@@ -162,14 +157,14 @@ def create_history_gif(
else:
logger.warning('No real screenshots found for task frame, skipping task frame')
- # Process each history item
- for i, item in enumerate(history.history, 1):
- if not item.state.screenshot:
+ # Process each history item with its corresponding screenshot
+ for i, (item, screenshot) in enumerate(zip(history.history, screenshots), 1):
+ if not screenshot:
continue
# Skip placeholder screenshots from about:blank pages
# These are 4x4 white PNGs encoded as a specific base64 string
- if item.state.screenshot == PLACEHOLDER_4PX_SCREENSHOT:
+ if screenshot == PLACEHOLDER_4PX_SCREENSHOT:
logger.debug(f'Skipping placeholder screenshot from about:blank page at step {i}')
continue
@@ -181,7 +176,7 @@ def create_history_gif(
continue
# Convert base64 screenshot to PIL Image
- img_data = base64.b64decode(item.state.screenshot)
+ img_data = base64.b64decode(screenshot)
image = Image.open(io.BytesIO(img_data))
if show_goals and item.model_output:
diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py
index 1aa96ce86..8126b1c4f 100644
--- a/browser_use/agent/message_manager/service.py
+++ b/browser_use/agent/message_manager/service.py
@@ -9,7 +9,6 @@ from browser_use.agent.message_manager.views import (
from browser_use.agent.prompts import AgentMessagePrompt
from browser_use.agent.views import (
ActionResult,
- AgentHistoryList,
AgentOutput,
AgentStepInfo,
MessageManagerState,
@@ -104,10 +103,8 @@ class MessageManager:
state: MessageManagerState = MessageManagerState(),
use_thinking: bool = True,
include_attributes: list[str] | None = None,
- message_context: str | None = None,
sensitive_data: dict[str, str | dict[str, str]] | None = None,
max_history_items: int | None = None,
- images_per_step: int = 1,
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
include_tool_call_examples: bool = False,
):
@@ -118,7 +115,6 @@ class MessageManager:
self.sensitive_data_description = ''
self.use_thinking = use_thinking
self.max_history_items = max_history_items
- self.images_per_step = images_per_step
self.vision_detail_level = vision_detail_level
self.include_tool_call_examples = include_tool_call_examples
@@ -126,7 +122,6 @@ class MessageManager:
# Store settings as direct attributes instead of in a settings object
self.include_attributes = include_attributes or []
- self.message_context = message_context
self.sensitive_data = sensitive_data
self.last_input_messages = []
# Only initialize messages if state is empty
@@ -260,7 +255,6 @@ class MessageManager:
use_vision=True,
page_filtered_actions: str | None = None,
sensitive_data=None,
- agent_history_list: AgentHistoryList | None = None, # Pass AgentHistoryList from agent
available_file_paths: list[str] | None = None, # Always pass current available_file_paths
) -> None:
"""Add browser state as human message"""
@@ -269,14 +263,8 @@ class MessageManager:
if sensitive_data:
self.sensitive_data_description = self._get_sensitive_data_description(browser_state_summary.url)
- # Extract previous screenshots if we need more than 1 image and have agent history
+ # Use only the current screenshot
screenshots = []
- if agent_history_list and self.images_per_step > 1:
- # Get previous screenshots and filter out None values
- raw_screenshots = agent_history_list.screenshots(n_last=self.images_per_step - 1, return_none_if_not_screenshot=False)
- screenshots = [s for s in raw_screenshots if s is not None]
-
- # add current screenshot to the end
if browser_state_summary.screenshot:
screenshots.append(browser_state_summary.screenshot)
diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py
index be646b576..e31ffd045 100644
--- a/browser_use/agent/prompts.py
+++ b/browser_use/agent/prompts.py
@@ -108,36 +108,6 @@ class AgentMessagePrompt:
self.vision_detail_level = vision_detail_level
assert self.browser_state
- @observe_debug(ignore_input=True, ignore_output=True, name='_deduplicate_screenshots')
- def _deduplicate_screenshots(self, screenshots: list[str]) -> list[str]:
- """
- Remove consecutive duplicate screenshots, keeping only the most recent of each.
-
- Args:
- screenshots: List of base64-encoded screenshot strings in chronological order (oldest first)
-
- Returns:
- List of screenshots with consecutive duplicates removed, maintaining chronological order
- """
- if not screenshots:
- return []
-
- if len(screenshots) == 1:
- return screenshots
-
- # Keep track of unique screenshots by comparing each with the next one
- unique_screenshots = []
-
- for i in range(len(screenshots)):
- # Always keep the last screenshot
- if i == len(screenshots) - 1:
- unique_screenshots.append(screenshots[i])
- # Only keep screenshot if it's different from the next one
- elif screenshots[i] != screenshots[i + 1]:
- unique_screenshots.append(screenshots[i])
-
- return unique_screenshots
-
@observe_debug(ignore_input=True, ignore_output=True, name='_get_browser_state_description')
def _get_browser_state_description(self) -> str:
elements_text = self.browser_state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)
@@ -277,12 +247,9 @@ Available tabs:
# Start with text description
content_parts: list[ContentPartTextParam | ContentPartImageParam] = [ContentPartTextParam(text=state_description)]
- # Deduplicate screenshots, keeping only the most recent of each unique image
- unique_screenshots = self._deduplicate_screenshots(self.screenshots)
-
# Add screenshots with labels
- for i, screenshot in enumerate(unique_screenshots):
- if i == len(unique_screenshots) - 1:
+ for i, screenshot in enumerate(self.screenshots):
+ if i == len(self.screenshots) - 1:
label = 'Current screenshot:'
else:
# Use simple, accurate labeling since we don't have actual step timing info
@@ -302,6 +269,6 @@ Available tabs:
)
)
- return UserMessage(content=content_parts)
+ return UserMessage(content=content_parts, cache=True)
- return UserMessage(content=state_description)
+ return UserMessage(content=state_description, cache=True)
diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py
index eb5a418c1..540079040 100644
--- a/browser_use/agent/service.py
+++ b/browser_use/agent/service.py
@@ -3,7 +3,6 @@ import gc
import inspect
import json
import logging
-import os
import re
import sys
import tempfile
@@ -163,7 +162,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
override_system_message: str | None = None,
extend_system_message: str | None = None,
validate_output: bool = False,
- message_context: str | None = None,
generate_gif: bool | str = False,
available_file_paths: list[str] | None = None,
include_attributes: list[str] = DEFAULT_INCLUDE_ATTRIBUTES,
@@ -171,7 +169,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
use_thinking: bool = True,
flash_mode: bool = False,
max_history_items: int = 40,
- images_per_step: int = 1,
page_extraction_llm: BaseChatModel | None = None,
planner_llm: BaseChatModel | None = None, # Deprecated
planner_interval: int = 1, # Deprecated
@@ -253,14 +250,12 @@ class Agent(Generic[Context, AgentStructuredOutput]):
override_system_message=override_system_message,
extend_system_message=extend_system_message,
validate_output=validate_output,
- message_context=message_context,
generate_gif=generate_gif,
include_attributes=include_attributes,
max_actions_per_step=max_actions_per_step,
use_thinking=use_thinking,
flash_mode=flash_mode,
max_history_items=max_history_items,
- images_per_step=images_per_step,
page_extraction_llm=page_extraction_llm,
planner_llm=None, # Always None now (deprecated)
planner_interval=1, # Always 1 now (deprecated)
@@ -281,8 +276,19 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Initialize state
self.state = injected_agent_state or AgentState()
- # Initialize file system
+ # Initialize history
+ self.history = AgentHistoryList(history=[], usage=None)
+
+ # Initialize agent directory
+ import time
+
+ timestamp = int(time.time())
+ base_tmp = Path(tempfile.gettempdir())
+ self.agent_directory = base_tmp / f'browser_use_agent_{self.id}_{timestamp}'
+
+ # Initialize file system and screenshot service
self._set_file_system(file_system_path)
+ self._set_screenshot_service()
# Action setup
self._setup_action_models()
@@ -334,10 +340,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
use_thinking=self.settings.use_thinking,
# Settings that were previously in MessageManagerSettings
include_attributes=self.settings.include_attributes,
- message_context=self.settings.message_context,
sensitive_data=sensitive_data,
max_history_items=self.settings.max_history_items,
- images_per_step=self.settings.images_per_step,
vision_detail_level=self.settings.vision_detail_level,
include_tool_call_examples=self.settings.include_tool_call_examples,
)
@@ -562,10 +566,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.file_system = FileSystem(file_system_path)
self.file_system_path = file_system_path
else:
- # create a temporary file system using agent ID
- base_tmp = tempfile.gettempdir() # e.g., /tmp on Unix
- self.file_system_path = os.path.join(base_tmp, f'browser_use_agent_{self.id}')
- self.file_system = FileSystem(self.file_system_path)
+ # Use the agent directory for file system
+ self.file_system = FileSystem(self.agent_directory)
+ self.file_system_path = str(self.agent_directory)
except Exception as e:
logger.error(f'💾 Failed to initialize file system: {e}.')
raise e
@@ -575,6 +578,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
logger.info(f'💾 File system path: {self.file_system_path}')
+	def _set_screenshot_service(self) -> None:
+		"""Create the ScreenshotService for self.agent_directory; failures are logged and re-raised"""
+		try:
+			# The import sits inside the try, so an import failure is logged as well
+			from browser_use.screenshots.service import ScreenshotService
+			self.screenshot_service = ScreenshotService(self.agent_directory)
+			logger.info(f'📸 Screenshot service initialized in: {self.agent_directory}/screenshots')
+		except Exception as e:
+			logger.error(f'📸 Failed to initialize screenshot service: {e}.')
+			raise
+
def save_file_system_state(self) -> None:
"""Save current file system state to agent state"""
if self.file_system:
@@ -583,9 +597,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
logger.error('💾 File system is not set up. Cannot save state.')
raise ValueError('File system is not set up. Cannot save state.')
- def _set_message_context(self) -> str | None:
- return self.settings.message_context
-
def _set_browser_use_version_and_source(self, source_override: str | None = None) -> None:
"""Get the version from pyproject.toml and determine the source of the browser-use package"""
# Use the helper function for version detection
@@ -696,20 +707,20 @@ class Agent(Generic[Context, AgentStructuredOutput]):
assert self.browser_session is not None, 'BrowserSession is not set up'
- self.logger.debug(f'🌐 Step {self.state.n_steps + 1}: Getting browser state...')
+ self.logger.debug(f'🌐 Step {self.state.n_steps}: Getting browser state...')
browser_state_summary = await self.browser_session.get_browser_state_with_recovery(
cache_clickable_elements_hashes=True, include_screenshot=self.settings.use_vision
)
current_page = await self.browser_session.get_current_page()
# Check for new downloads after getting browser state (catches PDF auto-downloads and previous step downloads)
- await self._check_and_update_downloads(f'Step {self.state.n_steps + 1}: after getting browser state')
+ await self._check_and_update_downloads(f'Step {self.state.n_steps}: after getting browser state')
self._log_step_context(current_page, browser_state_summary)
await self._raise_if_stopped_or_paused()
# Update action models with page-specific actions
- self.logger.debug(f'📝 Step {self.state.n_steps + 1}: Updating action models...')
+ self.logger.debug(f'📝 Step {self.state.n_steps}: Updating action models...')
await self._update_action_models_for_page(current_page)
# Get page-specific filtered actions
@@ -720,7 +731,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
page_action_message = f'For this page, these additional actions are available:\n{page_filtered_actions}'
self._message_manager._add_message_with_type(UserMessage(content=page_action_message), 'consistent')
- self.logger.debug(f'💬 Step {self.state.n_steps + 1}: Adding state message to context...')
+ self.logger.debug(f'💬 Step {self.state.n_steps}: Adding state message to context...')
self._message_manager.add_state_message(
browser_state_summary=browser_state_summary,
model_output=self.state.last_model_output,
@@ -729,7 +740,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
use_vision=self.settings.use_vision,
page_filtered_actions=page_filtered_actions if page_filtered_actions else None,
sensitive_data=self.sensitive_data,
- agent_history_list=self.state.history, # Pass AgentHistoryList for screenshots
available_file_paths=self.available_file_paths, # Always pass current available_file_paths
)
@@ -741,7 +751,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
"""Execute LLM interaction with retry logic and handle callbacks"""
input_messages = self._message_manager.get_messages()
self.logger.debug(
- f'🤖 Step {self.state.n_steps + 1}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...'
+ f'🤖 Step {self.state.n_steps}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...'
)
try:
@@ -758,9 +768,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Check again for paused/stopped state after getting model output
await self._raise_if_stopped_or_paused()
- # Increment step counter at the start of each step
- self.state.n_steps += 1
-
# Handle callbacks and conversation saving
await self._handle_post_llm_processing(browser_state_summary, input_messages)
@@ -854,7 +861,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
)
# Use _make_history_item like main branch
- self._make_history_item(self.state.last_model_output, browser_state_summary, self.state.last_result, metadata)
+ await self._make_history_item(self.state.last_model_output, browser_state_summary, self.state.last_result, metadata)
# Log step completion summary
self._log_step_completion_summary(self.step_start_time, self.state.last_result)
@@ -877,6 +884,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
)
self.eventbus.dispatch(step_event)
+ # Increment step counter after step is fully completed
+ self.state.n_steps += 1
+
async def _handle_final_step(self, step_info: AgentStepInfo | None = None) -> None:
"""Handle special processing for the last step"""
if step_info and step_info.is_last_step():
@@ -893,7 +903,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
"""Get model output with retry logic for empty actions"""
model_output = await self.get_model_output(input_messages)
self.logger.debug(
- f'✅ Step {self.state.n_steps + 1}: Got LLM response with {len(model_output.action) if model_output.action else 0} actions'
+ f'✅ Step {self.state.n_steps}: Got LLM response with {len(model_output.action) if model_output.action else 0} actions'
)
if (
@@ -947,7 +957,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.settings.save_conversation_path_encoding,
)
- def _make_history_item(
+ async def _make_history_item(
self,
model_output: AgentOutput | None,
browser_state_summary: BrowserStateSummary,
@@ -961,12 +971,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
else:
interacted_elements = [None]
+ # Store screenshot and get path
+ screenshot_path = None
+ if browser_state_summary.screenshot:
+ screenshot_path = await self.screenshot_service.store_screenshot(browser_state_summary.screenshot, self.state.n_steps)
+
state_history = BrowserStateHistory(
url=browser_state_summary.url,
title=browser_state_summary.title,
tabs=browser_state_summary.tabs,
interacted_element=interacted_elements,
- screenshot=browser_state_summary.screenshot,
+ screenshot_path=screenshot_path,
)
history_item = AgentHistory(
@@ -976,7 +991,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
metadata=metadata,
)
- self.state.history.history.append(history_item)
+ self.history.add_item(history_item)
def _remove_think_tags(self, text: str) -> str:
THINK_TAGS = re.compile(r'.*?', re.DOTALL)
@@ -1021,7 +1036,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
url_short = current_page.url[:50] + '...' if len(current_page.url) > 50 else current_page.url
interactive_count = len(browser_state_summary.selector_map) if browser_state_summary else 0
self.logger.info(
- f'📍 Step {self.state.n_steps + 1}: Evaluating page with {interactive_count} interactive elements on: {url_short}'
+ f'📍 Step {self.state.n_steps}: Evaluating page with {interactive_count} interactive elements on: {url_short}'
)
def _log_next_action_summary(self, parsed: 'AgentOutput') -> None:
@@ -1094,7 +1109,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Prepare action_history data correctly
action_history_data = []
- for item in self.state.history.history:
+ for item in self.history.history:
if item.model_output and item.model_output.action:
# Convert each ActionModel in the step to its dictionary representation
step_actions = [
@@ -1107,7 +1122,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Append None or [] if a step had no actions or no model output
action_history_data.append(None)
- final_res = self.state.history.final_result()
+ final_res = self.history.final_result()
final_result_str = json.dumps(final_res) if final_res is not None else None
self.telemetry.capture(
@@ -1125,13 +1140,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
cdp_url=urlparse(self.browser_session.cdp_url).hostname
if self.browser_session and self.browser_session.cdp_url
else None,
- action_errors=self.state.history.errors(),
+ action_errors=self.history.errors(),
action_history=action_history_data,
- urls_visited=self.state.history.urls(),
+ urls_visited=self.history.urls(),
steps=self.state.n_steps,
total_input_tokens=token_summary.prompt_tokens,
- total_duration_seconds=self.state.history.total_duration_seconds(),
- success=self.state.history.is_successful(),
+ total_duration_seconds=self.history.total_duration_seconds(),
+ success=self.history.is_successful(),
final_result_response=final_result_str,
error_message=agent_run_error,
)
@@ -1145,13 +1160,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
"""
await self.step(step_info)
- if self.state.history.is_done():
+ if self.history.is_done():
await self.log_completion()
if self.register_done_callback:
if inspect.iscoroutinefunction(self.register_done_callback):
- await self.register_done_callback(self.state.history)
+ await self.register_done_callback(self.history)
else:
- self.register_done_callback(self.state.history)
+ self.register_done_callback(self.history)
return True, True
return False, False
@@ -1271,22 +1286,22 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if on_step_end is not None:
await on_step_end(self)
- if self.state.history.is_done():
+ if self.history.is_done():
self.logger.debug(f'🎯 Task completed after {step + 1} steps!')
await self.log_completion()
if self.register_done_callback:
if inspect.iscoroutinefunction(self.register_done_callback):
- await self.register_done_callback(self.state.history)
+ await self.register_done_callback(self.history)
else:
- self.register_done_callback(self.state.history)
+ self.register_done_callback(self.history)
# Task completed
break
else:
agent_run_error = 'Failed to complete task in maximum steps'
- self.state.history.history.append(
+ self.history.add_item(
AgentHistory(
model_output=None,
result=[ActionResult(error=agent_run_error, include_in_memory=True)],
@@ -1295,7 +1310,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
title='',
tabs=[],
interacted_element=[],
- screenshot=None,
+ screenshot_path=None,
),
metadata=None,
)
@@ -1304,23 +1319,23 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.info(f'❌ {agent_run_error}')
self.logger.debug('📊 Collecting usage summary...')
- self.state.history.usage = await self.token_cost_service.get_usage_summary()
+ self.history.usage = await self.token_cost_service.get_usage_summary()
# set the model output schema and call it on the fly
- if self.state.history._output_model_schema is None and self.output_model_schema is not None:
- self.state.history._output_model_schema = self.output_model_schema
+ if self.history._output_model_schema is None and self.output_model_schema is not None:
+ self.history._output_model_schema = self.output_model_schema
self.logger.debug('🏁 Agent.run() completed successfully')
- return self.state.history
+ return self.history
except KeyboardInterrupt:
# Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
self.logger.info('Got KeyboardInterrupt during execution, returning current history')
agent_run_error = 'KeyboardInterrupt'
- self.state.history.usage = await self.token_cost_service.get_usage_summary()
+ self.history.usage = await self.token_cost_service.get_usage_summary()
- return self.state.history
+ return self.history
except Exception as e:
self.logger.error(f'Agent run failed with exception: {e}', exc_info=True)
@@ -1359,7 +1374,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Lazy import gif module to avoid heavy startup cost
from browser_use.agent.gif import create_history_gif
- create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
+ create_history_gif(task=self.task, history=self.history, output_path=output_path)
# Only emit output file event if GIF was actually created
if Path(output_path).exists():
@@ -1484,7 +1499,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
async def log_completion(self) -> None:
"""Log the completion of the task"""
- if self.state.history.is_successful():
+ if self.history.is_successful():
self.logger.info('✅ Task completed successfully')
else:
self.logger.info('❌ Task completed without success')
@@ -1618,7 +1633,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
"""Save the history to a file"""
if not file_path:
file_path = 'AgentHistory.json'
- self.state.history.save_to_file(file_path)
+ self.history.save_to_file(file_path)
async def wait_until_resumed(self):
await self._external_pause_event.wait()
@@ -1756,14 +1771,14 @@ class Agent(Generic[Context, AgentStructuredOutput]):
timestamp = datetime.now().isoformat()
# Only declare variables that are used multiple times
- structured_output = self.state.history.structured_output
+ structured_output = self.history.structured_output
structured_output_json = json.dumps(structured_output.model_dump()) if structured_output else None
- final_result = self.state.history.final_result()
+ final_result = self.history.final_result()
git_info = get_git_info()
- action_history = self.state.history.action_history()
- action_errors = self.state.history.errors()
- urls = self.state.history.urls()
- usage = self.state.history.usage
+ action_history = self.history.action_history()
+ action_errors = self.history.errors()
+ urls = self.history.urls()
+ usage = self.history.usage
return {
'trace': {
@@ -1790,10 +1805,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
'final_result_response_truncated': (
final_result[:20000] if final_result and len(final_result) > 20000 else final_result
),
- 'self_report_completed': 1 if self.state.history.is_done() else 0,
- 'self_report_success': 1 if self.state.history.is_successful() else 0,
- 'duration': self.state.history.total_duration_seconds(),
- 'steps_taken': self.state.history.number_of_steps(),
+ 'self_report_completed': 1 if self.history.is_done() else 0,
+ 'self_report_success': 1 if self.history.is_successful() else 0,
+ 'duration': self.history.total_duration_seconds(),
+ 'steps_taken': self.history.number_of_steps(),
'usage': json.dumps(usage.model_dump()) if usage else None,
},
'trace_details': {
@@ -1805,6 +1820,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# AgentHistoryList methods
'structured_output': structured_output_json,
'final_result_response': final_result,
- 'complete_history': _get_complete_history_without_screenshots(self.state.history.model_dump()),
+ 'complete_history': _get_complete_history_without_screenshots(self.history.model_dump()),
},
}
diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py
index 044b23af0..e0a82621f 100644
--- a/browser_use/agent/views.py
+++ b/browser_use/agent/views.py
@@ -36,7 +36,6 @@ class AgentSettings(BaseModel):
max_failures: int = 3
retry_delay: int = 10
validate_output: bool = False
- message_context: str | None = None
generate_gif: bool | str = False
override_system_message: str | None = None
extend_system_message: str | None = None
@@ -56,7 +55,6 @@ class AgentSettings(BaseModel):
use_thinking: bool = True
flash_mode: bool = False # If enabled, disables evaluation_previous_goal and next_goal, and sets use_thinking = False
max_history_items: int = 40
- images_per_step: int = 1
page_extraction_llm: BaseChatModel | None = None
planner_llm: BaseChatModel | None = None
@@ -76,7 +74,6 @@ class AgentState(BaseModel):
n_steps: int = 1
consecutive_failures: int = 0
last_result: list[ActionResult] | None = None
- history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[], usage=None))
last_plan: str | None = None
last_model_output: AgentOutput | None = None
paused: bool = False
@@ -329,6 +326,10 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
"""Representation of the AgentHistoryList object"""
return f'AgentHistoryList(all_results={self.action_results()}, all_model_outputs={self.model_actions()})'
+	def add_item(self, history_item: AgentHistory) -> None:
+		"""Append a single AgentHistory entry to the end of ``self.history`` (in place)"""
+		self.history.extend([history_item])
+
def __repr__(self) -> str:
"""Representation of the AgentHistoryList object"""
return self.__str__()
@@ -443,20 +444,39 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
"""Get all unique URLs from history"""
return [h.state.url if h.state.url is not None else None for h in self.history]
- def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
- """Get all screenshots from history"""
+ def screenshot_paths(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
+ """Get all screenshot paths from history"""
if n_last == 0:
return []
if n_last is None:
if return_none_if_not_screenshot:
- return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history]
+ return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self.history]
else:
- return [h.state.screenshot for h in self.history if h.state.screenshot is not None]
+ return [h.state.screenshot_path for h in self.history if h.state.screenshot_path is not None]
else:
if return_none_if_not_screenshot:
- return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history[-n_last:]]
+ return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self.history[-n_last:]]
else:
- return [h.state.screenshot for h in self.history[-n_last:] if h.state.screenshot is not None]
+ return [h.state.screenshot_path for h in self.history[-n_last:] if h.state.screenshot_path is not None]
+
+	def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
+		"""Load the screenshots recorded in history as base64 strings
+
+		Args:
+			n_last: Only look at the last ``n_last`` history items; None means all, 0 returns [].
+			return_none_if_not_screenshot: Keep a None entry for every item whose
+				get_screenshot() returns nothing (default); when False those items are left out.
+		"""
+		if n_last == 0:
+			return []
+
+		selected = self.history[-n_last:] if n_last is not None else self.history
+		# An empty string counts as "no screenshot", the same as None
+		loaded = [entry.state.get_screenshot() or None for entry in selected]
+
+		if return_none_if_not_screenshot:
+			return loaded
+		return [shot for shot in loaded if shot is not None]
def action_names(self) -> list[str]:
"""Get all action names from history"""
diff --git a/browser_use/browser/views.py b/browser_use/browser/views.py
index 01c4f21f8..4d4a6a5b0 100644
--- a/browser_use/browser/views.py
+++ b/browser_use/browser/views.py
@@ -81,12 +81,31 @@ class BrowserStateHistory:
title: str
tabs: list[TabInfo]
interacted_element: list[DOMHistoryElement | None] | list[None]
- screenshot: str | None = None
+ screenshot_path: str | None = None
+
+	def get_screenshot(self) -> str | None:
+		"""Return the screenshot file's bytes as a base64 string, or None
+
+		None is returned when ``screenshot_path`` is unset, or when the file is
+		missing or cannot be read.
+		"""
+		if not self.screenshot_path:
+			return None
+
+		import base64
+		from pathlib import Path
+
+		try:
+			# Read directly instead of checking exists() first (no check-then-open race)
+			raw = Path(self.screenshot_path).read_bytes()
+		except (OSError, ValueError):  # missing/unreadable file, or an invalid path string
+			return None
+		return base64.b64encode(raw).decode('utf-8')
def to_dict(self) -> dict[str, Any]:
data = {}
data['tabs'] = [tab.model_dump() for tab in self.tabs]
- data['screenshot'] = self.screenshot
+ data['screenshot_path'] = self.screenshot_path
data['interacted_element'] = [el.to_dict() if el else None for el in self.interacted_element]
data['url'] = self.url
data['title'] = self.title
diff --git a/browser_use/cli.py b/browser_use/cli.py
index 6fe29f8e8..56fd8ff23 100644
--- a/browser_use/cli.py
+++ b/browser_use/cli.py
@@ -815,18 +815,18 @@ class BrowserUseApp(App):
# Show token usage statistics if agent exists and has history
if self.agent and hasattr(self.agent, 'state') and hasattr(self.agent.state, 'history'):
# Get total tokens used
- # total_tokens = self.agent.state.history.total_input_tokens()
+ # total_tokens = self.agent.history.total_input_tokens()
# model_info.write(f'[white]Input tokens:[/] [green]{total_tokens:,}[/]')
# Calculate tokens per step
- num_steps = len(self.agent.state.history.history)
+ num_steps = len(self.agent.history.history)
# if num_steps > 0:
# avg_tokens_per_step = total_tokens / num_steps
# model_info.write(f'[white]Avg tokens/step:[/] [green]{avg_tokens_per_step:,.1f}[/]')
# Get the last step metadata to show the most recent LLM response time
- if num_steps > 0 and self.agent.state.history.history[-1].metadata:
- last_step = self.agent.state.history.history[-1]
+ if num_steps > 0 and self.agent.history.history[-1].metadata:
+ last_step = self.agent.history.history[-1]
if last_step.metadata:
step_duration = last_step.metadata.duration_seconds
else:
@@ -838,7 +838,7 @@ class BrowserUseApp(App):
# model_info.write(f'[white]Avg tokens/sec:[/] [magenta]{tokens_per_second:.1f}[/]')
# Show total duration
- total_duration = self.agent.state.history.total_duration_seconds()
+ total_duration = self.agent.history.total_duration_seconds()
if total_duration > 0:
model_info.write(f'[white]Total Duration:[/] [magenta]{total_duration:.2f}s[/]')
@@ -891,7 +891,7 @@ class BrowserUseApp(App):
# Get all agent history items
history_items = []
if hasattr(self.agent, 'state') and hasattr(self.agent.state, 'history'):
- history_items = self.agent.state.history.history
+ history_items = self.agent.history.history
if history_items:
tasks_info.write('[bold yellow]STEPS:[/]')
diff --git a/browser_use/llm/google/chat.py b/browser_use/llm/google/chat.py
index 95423f584..66ba74b90 100644
--- a/browser_use/llm/google/chat.py
+++ b/browser_use/llm/google/chat.py
@@ -371,6 +371,7 @@ class ChatGoogle(BaseChatModel):
key == 'properties'
and isinstance(cleaned_value, dict)
and len(cleaned_value) == 0
+ and isinstance(obj.get('type', ''), str)
and obj.get('type', '').upper() == 'OBJECT'
):
# Convert empty object to have at least one property
@@ -380,7 +381,8 @@ class ChatGoogle(BaseChatModel):
# If this is an object type with empty properties, add a placeholder
if (
- cleaned.get('type', '').upper() == 'OBJECT'
+ isinstance(cleaned.get('type', ''), str)
+ and cleaned.get('type', '').upper() == 'OBJECT'
and 'properties' in cleaned
and isinstance(cleaned['properties'], dict)
and len(cleaned['properties']) == 0
diff --git a/browser_use/screenshots/__init__.py b/browser_use/screenshots/__init__.py
new file mode 100644
index 000000000..0e721412c
--- /dev/null
+++ b/browser_use/screenshots/__init__.py
@@ -0,0 +1 @@
+# Screenshots package for browser-use
diff --git a/browser_use/screenshots/service.py b/browser_use/screenshots/service.py
new file mode 100644
index 000000000..b6929d83b
--- /dev/null
+++ b/browser_use/screenshots/service.py
@@ -0,0 +1,48 @@
+"""
+Screenshot storage service for browser-use agents.
+"""
+
+import base64
+from pathlib import Path
+
+import anyio
+
+
+class ScreenshotService:
+ """Simple screenshot storage service that saves screenshots to disk"""
+
+ def __init__(self, agent_directory: str | Path):
+ """Initialize with agent directory path"""
+ self.agent_directory = Path(agent_directory) if isinstance(agent_directory, str) else agent_directory  # str is normalized to Path
+
+ # Create screenshots subdirectory
+ self.screenshots_dir = self.agent_directory / 'screenshots'
+ self.screenshots_dir.mkdir(parents=True, exist_ok=True)  # creates missing parents; no error if it already exists
+
+ async def store_screenshot(self, screenshot_b64: str, step_number: int) -> str:
+ """Store screenshot to disk and return the full path as string"""
+ screenshot_filename = f'step_{step_number}.png'  # a repeated step_number maps to the same file name
+ screenshot_path = self.screenshots_dir / screenshot_filename
+
+ # Decode base64 and save to disk
+ screenshot_data = base64.b64decode(screenshot_b64)
+
+ async with await anyio.open_file(screenshot_path, 'wb') as f:  # 'wb' truncates: an existing file for this step is overwritten
+ await f.write(screenshot_data)
+
+ return str(screenshot_path)  # absolute only if agent_directory was absolute
+
+ async def get_screenshot(self, screenshot_path: str) -> str | None:
+ """Load screenshot from disk path and return as base64"""
+ if not screenshot_path:  # empty path returns None before touching the filesystem
+ return None
+
+ path = Path(screenshot_path)
+ if not path.exists():  # a missing file yields None instead of raising
+ return None
+
+ # Load from disk and encode to base64
+ async with await anyio.open_file(path, 'rb') as f:
+ screenshot_data = await f.read()
+
+ return base64.b64encode(screenshot_data).decode('utf-8')
diff --git a/docs/customize/agent-settings.mdx b/docs/customize/agent-settings.mdx
index 77f7b8dc1..754e01b4f 100644
--- a/docs/customize/agent-settings.mdx
+++ b/docs/customize/agent-settings.mdx
@@ -133,7 +133,7 @@ history = await agent.run()
# Access (some) useful information
history.urls() # List of visited URLs
-history.screenshots() # List of screenshot paths
+history.screenshot_paths() # List of screenshot paths
history.action_names() # Names of executed actions
history.extracted_content() # Content extracted during execution
history.errors() # Any errors that occurred
@@ -173,60 +173,12 @@ agent = Agent(
)
```
-## Run with message context
-You can configure the agent and provide a separate message to help the LLM understand the task better.
-```python
-from browser_use.llm import ChatOpenAI
-agent = Agent(
- task="your task",
- message_context="Additional information about the task",
- llm = ChatOpenAI(model='gpt-4o')
-)
-```
-
-## Run with planner model
-
-You can configure the agent to use a separate planner model for high-level task planning:
-
-```python
-from browser_use.llm import ChatOpenAI
-
-# Initialize models
-llm = ChatOpenAI(model='gpt-4o')
-planner_llm = ChatOpenAI(model='o3-mini')
-
-agent = Agent(
- task="your task",
- llm=llm,
- planner_llm=planner_llm, # Separate model for planning
- use_vision_for_planner=False, # Disable vision for planner
- planner_interval=4 # Plan every 4 steps
-)
-```
-
-### Planner Parameters
-
-- `planner_llm`: A chat model instance used for high-level task planning. Can be a smaller/cheaper model than the main LLM.
-- `use_vision_for_planner`: Enable/disable vision capabilities for the planner model. Defaults to `True`.
-- `planner_interval`: Number of steps between planning phases. Defaults to `1`.
-
-Using a separate planner model can help:
-
-- Reduce costs by using a smaller model for high-level planning
-- Improve task decomposition and strategic thinking
-- Better handle complex, multi-step tasks
-
-
- The planner model is optional. If not specified, the agent will not use the
- planner model.
-
### Optional Parameters
-- `message_context`: Additional information about the task to help the LLM understand the task better.
- `initial_actions`: List of initial actions to run before the main task.
- `max_actions_per_step`: Maximum number of actions to run in a step. Defaults to `10`.
- `max_failures`: Maximum number of failures before giving up. Defaults to `3`.
diff --git a/docs/customize/hooks.mdx b/docs/customize/hooks.mdx
index d27673a58..92c0b007b 100644
--- a/docs/customize/hooks.mdx
+++ b/docs/customize/hooks.mdx
@@ -41,7 +41,7 @@ async def my_step_hook(agent: Agent):
# https://playwright.dev/python/docs/api/class-page
current_url = page.url
- visit_log = agent.state.history.urls()
+ visit_log = agent.history.urls()
previous_url = visit_log[-2] if len(visit_log) >= 2 else None
print(f"Agent was last on URL: {previous_url} and is now on {current_url}")
@@ -91,11 +91,12 @@ When working with agent hooks, you have access to the entire `Agent` instance. H
- `agent.settings` contains all the configuration options passed to the `Agent(...)` at init time
- `agent.llm` gives direct access to the main LLM object (e.g. `ChatOpenAI`)
- `agent.state` gives access to lots of internal state, including agent thoughts, outputs, actions, etc.
- - `agent.state.history.model_thoughts()`: Reasoning from Browser Use's model.
- - `agent.state.history.model_outputs()`: Raw outputs from the Browsre Use's model.
- - `agent.state.history.model_actions()`: Actions taken by the agent
- - `agent.state.history.extracted_content()`: Content extracted from web pages
- - `agent.state.history.urls()`: URLs visited by the agent
+- `agent.history` gives access to historical data from the agent's execution:
+ - `agent.history.model_thoughts()`: Reasoning from Browser Use's model.
+ - `agent.history.model_outputs()`: Raw outputs from Browser Use's model.
+ - `agent.history.model_actions()`: Actions taken by the agent
+ - `agent.history.extracted_content()`: Content extracted from web pages
+ - `agent.history.urls()`: URLs visited by the agent
- `agent.browser_session` gives direct access to the `BrowserSession()` and playwright objects
- `agent.browser_session.get_current_page()`: Get the current playwright `Page` object the agent is focused on
- `agent.browser_session.browser_context`: Get the current playwright `BrowserContext` object
diff --git a/examples/custom-functions/custom_hooks_before_after_step.py b/examples/custom-functions/custom_hooks_before_after_step.py
index bd40acbbf..9053248ff 100644
--- a/examples/custom-functions/custom_hooks_before_after_step.py
+++ b/examples/custom-functions/custom_hooks_before_after_step.py
@@ -154,7 +154,7 @@ async def record_activity(agent_obj):
print('--> History:')
# Assert agent has state to satisfy type checker
assert hasattr(agent_obj, 'state'), 'Agent must have state attribute'
- history = agent_obj.state.history
+ history = agent_obj.history
model_thoughts = obj_to_json(obj=history.model_thoughts(), check_circular=False)
@@ -164,7 +164,7 @@ async def record_activity(agent_obj):
# prettyprinter.cpprint(model_thoughts_last_elem)
# print("--- MODEL OUTPUT ACTION ---")
- model_outputs = agent_obj.state.history.model_outputs()
+ model_outputs = agent_obj.history.model_outputs()
model_outputs_json = obj_to_json(obj=model_outputs, check_circular=False)
if len(model_outputs_json) > 0:
@@ -172,7 +172,7 @@ async def record_activity(agent_obj):
# prettyprinter.cpprint(model_outputs_json_last_elem)
# print("--- MODEL INTERACTED ELEM ---")
- model_actions = agent_obj.state.history.model_actions()
+ model_actions = agent_obj.history.model_actions()
model_actions_json = obj_to_json(obj=model_actions, check_circular=False)
if len(model_actions_json) > 0:
@@ -180,14 +180,14 @@ async def record_activity(agent_obj):
# prettyprinter.cpprint(model_actions_json_last_elem)
# print("--- EXTRACTED CONTENT ---")
- extracted_content = agent_obj.state.history.extracted_content()
+ extracted_content = agent_obj.history.extracted_content()
extracted_content_json = obj_to_json(obj=extracted_content, check_circular=False)
if len(extracted_content_json) > 0:
extracted_content_json_last_elem = extracted_content_json[-1]
# prettyprinter.cpprint(extracted_content_json_last_elem)
# print("--- URLS ---")
- urls = agent_obj.state.history.urls()
+ urls = agent_obj.history.urls()
# prettyprinter.cpprint(urls)
urls_json = obj_to_json(obj=urls, check_circular=False)
diff --git a/examples/features/outsource_state.py b/examples/features/outsource_state.py
index a030c8b91..d99dbc632 100644
--- a/examples/features/outsource_state.py
+++ b/examples/features/outsource_state.py
@@ -47,8 +47,6 @@ async def main():
if done and valid:
break
- agent_state.history.history = []
-
# Save state to file
async with await anyio.open_file('agent_state.json', 'w') as f:
serialized = agent_state.model_dump_json(exclude={'history'})
diff --git a/examples/models/deepseek-chat.py b/examples/models/deepseek-chat.py
index 0d3b86c22..cf05ceace 100644
--- a/examples/models/deepseek-chat.py
+++ b/examples/models/deepseek-chat.py
@@ -28,7 +28,7 @@ async def main():
task='What should we pay attention to in the recent new rules on tariffs in China-US trade?',
llm=llm,
use_vision=False,
- message_context=extend_system_message,
+ extend_system_message=extend_system_message,
)
await agent.run()
diff --git a/pyproject.toml b/pyproject.toml
index d23fb5b81..c8aecbf34 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
name = "browser-use"
description = "Make websites accessible for AI agents"
authors = [{ name = "Gregor Zunic" }]
-version = "0.5.7"
+version = "0.5.9"
readme = "README.md"
requires-python = ">=3.11,<4.0"
classifiers = [
diff --git a/tests/ci/test_gemini_type_field_fix.py b/tests/ci/test_gemini_type_field_fix.py
new file mode 100644
index 000000000..bca1ef716
--- /dev/null
+++ b/tests/ci/test_gemini_type_field_fix.py
@@ -0,0 +1,107 @@
+"""
+Test to reproduce and verify fix for GitHub issue #2470:
+"Python field with name 'type' handled differently between Gemini and OpenAI GPT"
+"""
+
+from browser_use.llm.google.chat import ChatGoogle
+from browser_use.llm.schema import SchemaOptimizer
+
+
+class TestGeminiTypeFieldHandling:
+ """Test class for reproducing the type field issue with Gemini schema processing."""
+
+ def test_gemini_schema_with_dict_type_field(self):
+ """
+ Test that Gemini schema processing handles dict 'type' field gracefully.
+ Reproduces the AttributeError: 'dict' object has no attribute 'upper'
+ """
+ chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+
+ # Schema with dict instead of string in type field
+ problematic_schema = {'type': {'malformed': 'dict_type'}, 'properties': {}}
+
+ result = chat_google._fix_gemini_schema(problematic_schema)
+ assert result is not None
+ assert isinstance(result, dict)
+ assert result['type'] == {'malformed': 'dict_type'}  # the non-string type must come back unchanged
+
+ def test_gemini_schema_with_nested_dict_type_field(self):
+ """
+ Test that nested dict 'type' fields are handled gracefully.
+ """
+ chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+
+ # Schema with nested dict type field
+ problematic_schema = {
+ 'type': 'object',
+ 'properties': {'nested_field': {'type': {'malformed': 'dict_instead_of_string'}, 'properties': {}}},
+ }
+
+ result = chat_google._fix_gemini_schema(problematic_schema)
+ assert result is not None
+ assert isinstance(result, dict)
+ nested_type = result['properties']['nested_field']['type']
+ assert nested_type == {'malformed': 'dict_instead_of_string'}  # nested non-string type is also left untouched
+
+ def test_gemini_schema_with_none_type_field(self):
+ """Test handling of None type field."""
+ chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+
+ problematic_schema = {'type': 'object', 'properties': {'nested_field': {'type': None, 'properties': {}}}}
+
+ result = chat_google._fix_gemini_schema(problematic_schema)
+ assert result is not None  # only verifies that a None type does not raise
+
+ def test_gemini_schema_with_valid_string_type(self):
+ """Test that valid string type fields work correctly."""
+ chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+
+ valid_schema = {'type': 'object', 'properties': {'nested_field': {'type': 'object', 'properties': {}}}}
+
+ # Should work without issues
+ result = chat_google._fix_gemini_schema(valid_schema)
+ assert result is not None
+ assert isinstance(result, dict)
+
+ def test_gemini_schema_with_empty_properties_object(self):
+ """Test handling of empty properties in object type."""
+ chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+
+ schema_with_empty_props = {
+ 'type': 'object',
+ 'properties': {
+ 'empty_object': {
+ 'type': 'object',
+ 'properties': {}, # Empty properties should get placeholder
+ }
+ },
+ }
+
+ result = chat_google._fix_gemini_schema(schema_with_empty_props)
+
+ nested_props = result['properties']['empty_object']['properties']
+ assert '_placeholder' in nested_props  # the empty nested object gained a '_placeholder' property
+ assert nested_props['_placeholder']['type'] == 'string'
+
+ def test_consistency_between_providers(self):
+ """
+ Test that both Gemini and OpenAI handle schemas consistently.
+ The original issue was that Gemini would fail where OpenAI succeeded.
+ """
+ from pydantic import BaseModel, Field
+
+ # Create a test model that generates a schema with dict type
+ class TestModel(BaseModel):
+ field_with_dict_type: dict = Field(default_factory=dict)
+
+ # OpenAI uses SchemaOptimizer directly
+ openai_schema = SchemaOptimizer.create_optimized_json_schema(TestModel)
+ assert openai_schema is not None
+
+ # Gemini processes the schema through _fix_gemini_schema
+ chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+ gemini_result = chat_google._fix_gemini_schema(openai_schema)
+ assert gemini_result is not None  # smoke check only: the two schemas are not compared with each other
+
+ # Both should handle the schema without errors
+ # This demonstrates that the fix makes Gemini consistent with OpenAI
diff --git a/tests/ci/test_gif_filtering.py b/tests/ci/test_gif_filtering.py
index 8c8450354..ecb6ab6dd 100644
--- a/tests/ci/test_gif_filtering.py
+++ b/tests/ci/test_gif_filtering.py
@@ -9,7 +9,7 @@ from PIL import Image
from browser_use import AgentHistoryList
from browser_use.agent.gif import create_history_gif
from browser_use.agent.views import ActionResult, AgentHistory, AgentOutput
-from browser_use.browser.views import PLACEHOLDER_4PX_SCREENSHOT, BrowserStateHistory, TabInfo
+from browser_use.browser.views import BrowserStateHistory, TabInfo
@pytest.fixture
@@ -49,9 +49,22 @@ def create_test_screenshot(width: int = 800, height: int = 600, color: tuple = (
async def test_gif_filters_out_placeholder_screenshots(test_dir):
"""Test that 4px placeholder screenshots from about:blank pages are filtered out of GIFs."""
+ # Set up screenshot service for testing (still needed to create test files)
+ from browser_use.screenshots.service import ScreenshotService
+
+ screenshot_service = ScreenshotService(test_dir)
+
+ # Helper function to store test screenshots
+ async def store_test_screenshot(screenshot_b64: str, step: int) -> str:
+ return await screenshot_service.store_screenshot(screenshot_b64, step)
+
# Create a history with mixed screenshots: real and placeholder
history_items = []
+ # Store test screenshots
+ real_screenshot_1_path = await store_test_screenshot(create_test_screenshot(800, 600, (100, 150, 200)), 2)
+ real_screenshot_2_path = await store_test_screenshot(create_test_screenshot(800, 600, (200, 100, 50)), 4)
+
# First item: about:blank placeholder (should be filtered)
history_items.append(
AgentHistory(
@@ -63,7 +76,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
),
result=[ActionResult()],
state=BrowserStateHistory(
- screenshot=PLACEHOLDER_4PX_SCREENSHOT,
+ screenshot_path=None, # Placeholder doesn't have a file path
url='about:blank',
title='New Tab',
tabs=[TabInfo(page_id=1, url='about:blank', title='New Tab')],
@@ -83,7 +96,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
),
result=[ActionResult()],
state=BrowserStateHistory(
- screenshot=create_test_screenshot(800, 600, (100, 150, 200)),
+ screenshot_path=real_screenshot_1_path,
url='https://example.com',
title='Example',
tabs=[TabInfo(page_id=1, url='https://example.com', title='Example')],
@@ -103,7 +116,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
),
result=[ActionResult()],
state=BrowserStateHistory(
- screenshot=PLACEHOLDER_4PX_SCREENSHOT,
+ screenshot_path=None, # Placeholder doesn't have a file path
url='about:blank',
title='New Tab',
tabs=[TabInfo(page_id=2, url='about:blank', title='New Tab')],
@@ -123,7 +136,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
),
result=[ActionResult()],
state=BrowserStateHistory(
- screenshot=create_test_screenshot(800, 600, (200, 100, 50)),
+ screenshot_path=real_screenshot_2_path,
url='https://example.com/page2',
title='Page 2',
tabs=[TabInfo(page_id=1, url='https://example.com/page2', title='Page 2')],
@@ -190,7 +203,7 @@ async def test_gif_handles_all_placeholders(test_dir):
),
result=[ActionResult()],
state=BrowserStateHistory(
- screenshot=PLACEHOLDER_4PX_SCREENSHOT,
+ screenshot_path=None, # Placeholder doesn't have a file path
url='about:blank',
title='New Tab',
tabs=[TabInfo(page_id=1, url='about:blank', title='New Tab')],
diff --git a/tests/ci/test_gif_generation_with_navigation.py b/tests/ci/test_gif_generation_with_navigation.py
index fae6bc32e..31ff6dde0 100644
--- a/tests/ci/test_gif_generation_with_navigation.py
+++ b/tests/ci/test_gif_generation_with_navigation.py
@@ -85,9 +85,10 @@ async def test_gif_generation_with_real_navigation(httpserver, tmp_path):
# Verify history contains real screenshots (not placeholders)
has_real_screenshot = False
for item in history.history:
+ screenshot_b64 = item.state.get_screenshot()
if (
- item.state.screenshot
- and item.state.screenshot
+ screenshot_b64
+ and screenshot_b64
!= 'iVBORw0KGgoAAAANSUhEUgAAAAQAAAAECAIAAAAmkwkpAAAAFElEQVR4nGP8//8/AwwwMSAB3BwAlm4DBfIlvvkAAAAASUVORK5CYII='
):
has_real_screenshot = True
diff --git a/tests/ci/test_sync_agent_events.py b/tests/ci/test_sync_agent_events.py
index fb6500048..0dc941157 100644
--- a/tests/ci/test_sync_agent_events.py
+++ b/tests/ci/test_sync_agent_events.py
@@ -88,7 +88,7 @@ class TestAgentEventLifecycle:
assert isinstance(step_event, CreateAgentStepEvent)
assert step_event.agent_task_id == task_event.id
- assert step_event.step == 2 # Step is incremented before event is emitted
+ assert step_event.step == 1 # Step is incremented before event is emitted
assert step_event.url == httpserver.url_for('/')
assert isinstance(update_event, UpdateAgentTaskEvent)
diff --git a/tests/old/test_core_functionality.py b/tests/old/test_core_functionality.py
index 73cd82d42..8da0a134c 100644
--- a/tests/old/test_core_functionality.py
+++ b/tests/old/test_core_functionality.py
@@ -279,6 +279,6 @@ class TestCoreFunctionality:
assert final_scroll_position > initial_scroll_position, 'Page did not scroll down'
# Verify the action was executed
- history = agent.state.history
+ history = agent.history
action_names = history.action_names()
assert 'scroll_down' in action_names