From 54050248898223dcb26848f48a72b39af14e25a4 Mon Sep 17 00:00:00 2001 From: Jai <89634744+jaiganeshs21@users.noreply.github.com> Date: Sun, 27 Jul 2025 10:55:47 +0530 Subject: [PATCH 01/13] fix: handle non-string type fields in Gemini schema processing --- browser_use/llm/google/chat.py | 4 +- tests/ci/test_gemini_type_field_fix.py | 133 +++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 tests/ci/test_gemini_type_field_fix.py diff --git a/browser_use/llm/google/chat.py b/browser_use/llm/google/chat.py index 7a2a7329b..a8159c6e9 100644 --- a/browser_use/llm/google/chat.py +++ b/browser_use/llm/google/chat.py @@ -363,6 +363,7 @@ class ChatGoogle(BaseChatModel): key == 'properties' and isinstance(cleaned_value, dict) and len(cleaned_value) == 0 + and isinstance(obj.get('type', ''), str) and obj.get('type', '').upper() == 'OBJECT' ): # Convert empty object to have at least one property @@ -372,7 +373,8 @@ class ChatGoogle(BaseChatModel): # If this is an object type with empty properties, add a placeholder if ( - cleaned.get('type', '').upper() == 'OBJECT' + isinstance(cleaned.get('type', ''), str) + and cleaned.get('type', '').upper() == 'OBJECT' and 'properties' in cleaned and isinstance(cleaned['properties'], dict) and len(cleaned['properties']) == 0 diff --git a/tests/ci/test_gemini_type_field_fix.py b/tests/ci/test_gemini_type_field_fix.py new file mode 100644 index 000000000..61f855281 --- /dev/null +++ b/tests/ci/test_gemini_type_field_fix.py @@ -0,0 +1,133 @@ +""" +Test to reproduce and verify fix for GitHub issue #2470: +"Python field with name 'type' handled differently between Gemini and OpenAI GPT" +""" + +import pytest +from browser_use.llm.google.chat import ChatGoogle +from browser_use.llm.openai.chat import ChatOpenAI +from browser_use.llm.schema import SchemaOptimizer + + +class TestGeminiTypeFieldHandling: + """Test class for reproducing the type field issue with Gemini schema processing.""" + + def 
test_gemini_schema_with_dict_type_field(self): + """ + Test that Gemini schema processing handles dict 'type' field gracefully. + Reproduces the AttributeError: 'dict' object has no attribute 'upper' + """ + chat_google = ChatGoogle(model='gemini-2.0-flash-exp') + + # Schema with dict instead of string in type field + problematic_schema = { + 'type': {'malformed': 'dict_type'}, + 'properties': {} + } + + result = chat_google._fix_gemini_schema(problematic_schema) + assert result is not None + assert isinstance(result, dict) + assert result['type'] == {'malformed': 'dict_type'} + + def test_gemini_schema_with_nested_dict_type_field(self): + """ + Test that nested dict 'type' fields are handled gracefully. + """ + chat_google = ChatGoogle(model='gemini-2.0-flash-exp') + + # Schema with nested dict type field + problematic_schema = { + 'type': 'object', + 'properties': { + 'nested_field': { + 'type': {'malformed': 'dict_instead_of_string'}, + 'properties': {} + } + } + } + + result = chat_google._fix_gemini_schema(problematic_schema) + assert result is not None + assert isinstance(result, dict) + nested_type = result['properties']['nested_field']['type'] + assert nested_type == {'malformed': 'dict_instead_of_string'} + + def test_gemini_schema_with_none_type_field(self): + """Test handling of None type field.""" + chat_google = ChatGoogle(model='gemini-2.0-flash-exp') + + problematic_schema = { + 'type': 'object', + 'properties': { + 'nested_field': { + 'type': None, + 'properties': {} + } + } + } + + result = chat_google._fix_gemini_schema(problematic_schema) + assert result is not None + + def test_gemini_schema_with_valid_string_type(self): + """Test that valid string type fields work correctly.""" + chat_google = ChatGoogle(model='gemini-2.0-flash-exp') + + valid_schema = { + 'type': 'object', + 'properties': { + 'nested_field': { + 'type': 'object', + 'properties': {} + } + } + } + + # Should work without issues + result = 
chat_google._fix_gemini_schema(valid_schema) + assert result is not None + assert isinstance(result, dict) + + def test_gemini_schema_with_empty_properties_object(self): + """Test handling of empty properties in object type.""" + chat_google = ChatGoogle(model='gemini-2.0-flash-exp') + + schema_with_empty_props = { + 'type': 'object', + 'properties': { + 'empty_object': { + 'type': 'object', + 'properties': {} # Empty properties should get placeholder + } + } + } + + result = chat_google._fix_gemini_schema(schema_with_empty_props) + + nested_props = result['properties']['empty_object']['properties'] + assert '_placeholder' in nested_props + assert nested_props['_placeholder']['type'] == 'string' + + def test_consistency_between_providers(self): + """ + Test that both Gemini and OpenAI handle schemas consistently. + The original issue was that Gemini would fail where OpenAI succeeded. + """ + from pydantic import BaseModel, Field + + # Create a test model that generates a schema with dict type + class TestModel(BaseModel): + field_with_dict_type: dict = Field(default_factory=dict) + + # OpenAI uses SchemaOptimizer directly + openai_schema = SchemaOptimizer.create_optimized_json_schema(TestModel) + assert openai_schema is not None + + # Gemini processes the schema through _fix_gemini_schema + chat_google = ChatGoogle(model='gemini-2.0-flash-exp') + gemini_result = chat_google._fix_gemini_schema(openai_schema) + assert gemini_result is not None + + # Both should handle the schema without errors + # This demonstrates that the fix makes Gemini consistent with OpenAI \ No newline at end of file From 42a176a3f2bef793334990f77f3932991d9ac44d Mon Sep 17 00:00:00 2001 From: Jai <89634744+jaiganeshs21@users.noreply.github.com> Date: Sun, 27 Jul 2025 11:05:19 +0530 Subject: [PATCH 02/13] fix pre commit code quality checks --- tests/ci/test_gemini_type_field_fix.py | 70 ++++++++------------------ 1 file changed, 22 insertions(+), 48 deletions(-) diff --git 
a/tests/ci/test_gemini_type_field_fix.py b/tests/ci/test_gemini_type_field_fix.py index 61f855281..bca1ef716 100644 --- a/tests/ci/test_gemini_type_field_fix.py +++ b/tests/ci/test_gemini_type_field_fix.py @@ -3,9 +3,7 @@ Test to reproduce and verify fix for GitHub issue #2470: "Python field with name 'type' handled differently between Gemini and OpenAI GPT" """ -import pytest from browser_use.llm.google.chat import ChatGoogle -from browser_use.llm.openai.chat import ChatOpenAI from browser_use.llm.schema import SchemaOptimizer @@ -18,13 +16,10 @@ class TestGeminiTypeFieldHandling: Reproduces the AttributeError: 'dict' object has no attribute 'upper' """ chat_google = ChatGoogle(model='gemini-2.0-flash-exp') - + # Schema with dict instead of string in type field - problematic_schema = { - 'type': {'malformed': 'dict_type'}, - 'properties': {} - } - + problematic_schema = {'type': {'malformed': 'dict_type'}, 'properties': {}} + result = chat_google._fix_gemini_schema(problematic_schema) assert result is not None assert isinstance(result, dict) @@ -35,18 +30,13 @@ class TestGeminiTypeFieldHandling: Test that nested dict 'type' fields are handled gracefully. 
""" chat_google = ChatGoogle(model='gemini-2.0-flash-exp') - + # Schema with nested dict type field problematic_schema = { 'type': 'object', - 'properties': { - 'nested_field': { - 'type': {'malformed': 'dict_instead_of_string'}, - 'properties': {} - } - } + 'properties': {'nested_field': {'type': {'malformed': 'dict_instead_of_string'}, 'properties': {}}}, } - + result = chat_google._fix_gemini_schema(problematic_schema) assert result is not None assert isinstance(result, dict) @@ -56,34 +46,18 @@ class TestGeminiTypeFieldHandling: def test_gemini_schema_with_none_type_field(self): """Test handling of None type field.""" chat_google = ChatGoogle(model='gemini-2.0-flash-exp') - - problematic_schema = { - 'type': 'object', - 'properties': { - 'nested_field': { - 'type': None, - 'properties': {} - } - } - } - + + problematic_schema = {'type': 'object', 'properties': {'nested_field': {'type': None, 'properties': {}}}} + result = chat_google._fix_gemini_schema(problematic_schema) assert result is not None def test_gemini_schema_with_valid_string_type(self): """Test that valid string type fields work correctly.""" chat_google = ChatGoogle(model='gemini-2.0-flash-exp') - - valid_schema = { - 'type': 'object', - 'properties': { - 'nested_field': { - 'type': 'object', - 'properties': {} - } - } - } - + + valid_schema = {'type': 'object', 'properties': {'nested_field': {'type': 'object', 'properties': {}}}} + # Should work without issues result = chat_google._fix_gemini_schema(valid_schema) assert result is not None @@ -92,19 +66,19 @@ class TestGeminiTypeFieldHandling: def test_gemini_schema_with_empty_properties_object(self): """Test handling of empty properties in object type.""" chat_google = ChatGoogle(model='gemini-2.0-flash-exp') - + schema_with_empty_props = { 'type': 'object', 'properties': { 'empty_object': { 'type': 'object', - 'properties': {} # Empty properties should get placeholder + 'properties': {}, # Empty properties should get placeholder } - } + }, } - + 
result = chat_google._fix_gemini_schema(schema_with_empty_props) - + nested_props = result['properties']['empty_object']['properties'] assert '_placeholder' in nested_props assert nested_props['_placeholder']['type'] == 'string' @@ -115,19 +89,19 @@ class TestGeminiTypeFieldHandling: The original issue was that Gemini would fail where OpenAI succeeded. """ from pydantic import BaseModel, Field - + # Create a test model that generates a schema with dict type class TestModel(BaseModel): field_with_dict_type: dict = Field(default_factory=dict) - + # OpenAI uses SchemaOptimizer directly openai_schema = SchemaOptimizer.create_optimized_json_schema(TestModel) assert openai_schema is not None - + # Gemini processes the schema through _fix_gemini_schema chat_google = ChatGoogle(model='gemini-2.0-flash-exp') gemini_result = chat_google._fix_gemini_schema(openai_schema) assert gemini_result is not None - + # Both should handle the schema without errors - # This demonstrates that the fix makes Gemini consistent with OpenAI \ No newline at end of file + # This demonstrates that the fix makes Gemini consistent with OpenAI From fd635c896c4692ca97b67590ee839c9941e45b73 Mon Sep 17 00:00:00 2001 From: mertunsall Date: Fri, 1 Aug 2025 19:47:30 +0100 Subject: [PATCH 03/13] fix step logging --- browser_use/agent/service.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 8969acf56..ba1361921 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -698,20 +698,20 @@ class Agent(Generic[Context, AgentStructuredOutput]): assert self.browser_session is not None, 'BrowserSession is not set up' - self.logger.debug(f'🌐 Step {self.state.n_steps + 1}: Getting browser state...') + self.logger.debug(f'🌐 Step {self.state.n_steps}: Getting browser state...') browser_state_summary = await self.browser_session.get_browser_state_with_recovery( 
cache_clickable_elements_hashes=True, include_screenshot=self.settings.use_vision ) current_page = await self.browser_session.get_current_page() # Check for new downloads after getting browser state (catches PDF auto-downloads and previous step downloads) - await self._check_and_update_downloads(f'Step {self.state.n_steps + 1}: after getting browser state') + await self._check_and_update_downloads(f'Step {self.state.n_steps}: after getting browser state') self._log_step_context(current_page, browser_state_summary) await self._raise_if_stopped_or_paused() # Update action models with page-specific actions - self.logger.debug(f'📝 Step {self.state.n_steps + 1}: Updating action models...') + self.logger.debug(f'📝 Step {self.state.n_steps}: Updating action models...') await self._update_action_models_for_page(current_page) # Get page-specific filtered actions @@ -722,7 +722,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): page_action_message = f'For this page, these additional actions are available:\n{page_filtered_actions}' self._message_manager._add_message_with_type(UserMessage(content=page_action_message), 'consistent') - self.logger.debug(f'💬 Step {self.state.n_steps + 1}: Adding state message to context...') + self.logger.debug(f'💬 Step {self.state.n_steps}: Adding state message to context...') self._message_manager.add_state_message( browser_state_summary=browser_state_summary, model_output=self.state.last_model_output, @@ -743,7 +743,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): """Execute LLM interaction with retry logic and handle callbacks""" input_messages = self._message_manager.get_messages() self.logger.debug( - f'🤖 Step {self.state.n_steps + 1}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...' + f'🤖 Step {self.state.n_steps}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...' 
) try: @@ -760,7 +760,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Check again for paused/stopped state after getting model output await self._raise_if_stopped_or_paused() - # Increment step counter at the start of each step + # Increment step counter at the end of each step self.state.n_steps += 1 # Handle callbacks and conversation saving @@ -895,7 +895,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): """Get model output with retry logic for empty actions""" model_output = await self.get_model_output(input_messages) self.logger.debug( - f'✅ Step {self.state.n_steps + 1}: Got LLM response with {len(model_output.action) if model_output.action else 0} actions' + f'✅ Step {self.state.n_steps}: Got LLM response with {len(model_output.action) if model_output.action else 0} actions' ) if ( @@ -1023,7 +1023,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): url_short = current_page.url[:50] + '...' if len(current_page.url) > 50 else current_page.url interactive_count = len(browser_state_summary.selector_map) if browser_state_summary else 0 self.logger.info( - f'📍 Step {self.state.n_steps + 1}: Evaluating page with {interactive_count} interactive elements on: {url_short}' + f'📍 Step {self.state.n_steps}: Evaluating page with {interactive_count} interactive elements on: {url_short}' ) def _log_next_action_summary(self, parsed: 'AgentOutput') -> None: From ed965ac6c20bee3375c68ee0b8b42e87193dbee2 Mon Sep 17 00:00:00 2001 From: mertunsall Date: Fri, 1 Aug 2025 20:18:57 +0100 Subject: [PATCH 04/13] increment step at the end --- browser_use/agent/service.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index ba1361921..5ee89c9fb 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -760,9 +760,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Check again for paused/stopped state after getting model output await 
self._raise_if_stopped_or_paused() - # Increment step counter at the end of each step - self.state.n_steps += 1 - # Handle callbacks and conversation saving await self._handle_post_llm_processing(browser_state_summary, input_messages) @@ -879,6 +876,9 @@ class Agent(Generic[Context, AgentStructuredOutput]): ) self.eventbus.dispatch(step_event) + # Increment step counter after step is fully completed + self.state.n_steps += 1 + async def _handle_final_step(self, step_info: AgentStepInfo | None = None) -> None: """Handle special processing for the last step""" if step_info and step_info.is_last_step(): From fcae1702a9323643a6e9a5d3bde0bbc3dc95224c Mon Sep 17 00:00:00 2001 From: mertunsall Date: Fri, 1 Aug 2025 20:56:54 +0100 Subject: [PATCH 05/13] fix tests --- tests/ci/test_sync_agent_events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/test_sync_agent_events.py b/tests/ci/test_sync_agent_events.py index fb6500048..0dc941157 100644 --- a/tests/ci/test_sync_agent_events.py +++ b/tests/ci/test_sync_agent_events.py @@ -88,7 +88,7 @@ class TestAgentEventLifecycle: assert isinstance(step_event, CreateAgentStepEvent) assert step_event.agent_task_id == task_event.id - assert step_event.step == 2 # Step is incremented before event is emitted + assert step_event.step == 1 # Step is incremented after event is emitted assert step_event.url == httpserver.url_for('/') assert isinstance(update_event, UpdateAgentTaskEvent) From 9bd00362238028836e91ff0de3053faded7d4ac9 Mon Sep 17 00:00:00 2001 From: mertunsall Date: Sat, 2 Aug 2025 15:48:49 +0100 Subject: [PATCH 06/13] separate agent history and state --- browser_use/agent/cloud_events.py | 4 +- browser_use/agent/gif.py | 28 +++-- browser_use/agent/message_manager/service.py | 12 +- browser_use/agent/prompts.py | 37 +----- browser_use/agent/service.py | 117 +++++++++++------- browser_use/agent/views.py | 37 ++++-- browser_use/browser/views.py | 23 +++- browser_use/cli.py | 12 +- 
browser_use/screenshots/__init__.py | 1 + browser_use/screenshots/service.py | 48 +++++++ docs/customize/agent-settings.mdx | 2 +- docs/customize/hooks.mdx | 12 +- .../custom_hooks_before_after_step.py | 10 +- examples/features/outsource_state.py | 2 - tests/ci/test_gif_filtering.py | 25 +++- .../ci/test_gif_generation_with_navigation.py | 5 +- tests/old/test_core_functionality.py | 2 +- 17 files changed, 232 insertions(+), 145 deletions(-) create mode 100644 browser_use/screenshots/__init__.py create mode 100644 browser_use/screenshots/service.py diff --git a/browser_use/agent/cloud_events.py b/browser_use/agent/cloud_events.py index adca25a5f..7c9060e67 100644 --- a/browser_use/agent/cloud_events.py +++ b/browser_use/agent/cloud_events.py @@ -37,7 +37,7 @@ class UpdateAgentTaskEvent(BaseEvent): if not hasattr(agent, '_task_start_time'): raise ValueError('Agent must have _task_start_time attribute') - done_output = agent.state.history.final_result() if agent.state.history else None + done_output = agent.history.final_result() if agent.history else None return cls( id=str(agent.task_id), user_id='', # To be filled by cloud handler @@ -47,7 +47,7 @@ class UpdateAgentTaskEvent(BaseEvent): stopped=agent.state.stopped if hasattr(agent.state, 'stopped') else False, paused=agent.state.paused if hasattr(agent.state, 'paused') else False, done_output=done_output, - finished_at=datetime.now(timezone.utc) if agent.state.history and agent.state.history.is_done() else None, + finished_at=datetime.now(timezone.utc) if agent.history and agent.history.is_done() else None, agent_state=agent.state.model_dump() if hasattr(agent.state, 'model_dump') else {}, user_feedback_type=None, user_comment=None, diff --git a/browser_use/agent/gif.py b/browser_use/agent/gif.py index 7e3730de2..a12f322a8 100644 --- a/browser_use/agent/gif.py +++ b/browser_use/agent/gif.py @@ -61,11 +61,18 @@ def create_history_gif( logger.warning('No history to create GIF from') return + # Get all screenshots 
from history (including None placeholders) + screenshots = history.screenshots(return_none_if_not_screenshot=True) + + if not screenshots: + logger.warning('No screenshots found in history') + return + # Find the first non-placeholder screenshot first_real_screenshot = None - for item in history.history: - if item.state.screenshot and item.state.screenshot != PLACEHOLDER_4PX_SCREENSHOT: - first_real_screenshot = item.state.screenshot + for screenshot in screenshots: + if screenshot and screenshot != PLACEHOLDER_4PX_SCREENSHOT: + first_real_screenshot = screenshot break if not first_real_screenshot: @@ -129,8 +136,9 @@ def create_history_gif( # Find the first non-placeholder screenshot for the task frame first_real_screenshot = None for item in history.history: - if item.state.screenshot and item.state.screenshot != PLACEHOLDER_4PX_SCREENSHOT: - first_real_screenshot = item.state.screenshot + screenshot_b64 = item.state.get_screenshot() + if screenshot_b64 and screenshot_b64 != PLACEHOLDER_4PX_SCREENSHOT: + first_real_screenshot = screenshot_b64 break if first_real_screenshot: @@ -146,19 +154,19 @@ def create_history_gif( else: logger.warning('No real screenshots found for task frame, skipping task frame') - # Process each history item - for i, item in enumerate(history.history, 1): - if not item.state.screenshot: + # Process each history item with its corresponding screenshot + for i, (item, screenshot) in enumerate(zip(history.history, screenshots), 1): + if not screenshot: continue # Skip placeholder screenshots from about:blank pages # These are 4x4 white PNGs encoded as a specific base64 string - if item.state.screenshot == PLACEHOLDER_4PX_SCREENSHOT: + if screenshot == PLACEHOLDER_4PX_SCREENSHOT: logger.debug(f'Skipping placeholder screenshot from about:blank page at step {i}') continue # Convert base64 screenshot to PIL Image - img_data = base64.b64decode(item.state.screenshot) + img_data = base64.b64decode(screenshot) image = Image.open(io.BytesIO(img_data)) 
if show_goals and item.model_output: diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index 1aa96ce86..a33f7bbc6 100644 --- a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -9,7 +9,6 @@ from browser_use.agent.message_manager.views import ( from browser_use.agent.prompts import AgentMessagePrompt from browser_use.agent.views import ( ActionResult, - AgentHistoryList, AgentOutput, AgentStepInfo, MessageManagerState, @@ -107,7 +106,6 @@ class MessageManager: message_context: str | None = None, sensitive_data: dict[str, str | dict[str, str]] | None = None, max_history_items: int | None = None, - images_per_step: int = 1, vision_detail_level: Literal['auto', 'low', 'high'] = 'auto', include_tool_call_examples: bool = False, ): @@ -118,7 +116,6 @@ class MessageManager: self.sensitive_data_description = '' self.use_thinking = use_thinking self.max_history_items = max_history_items - self.images_per_step = images_per_step self.vision_detail_level = vision_detail_level self.include_tool_call_examples = include_tool_call_examples @@ -260,7 +257,6 @@ class MessageManager: use_vision=True, page_filtered_actions: str | None = None, sensitive_data=None, - agent_history_list: AgentHistoryList | None = None, # Pass AgentHistoryList from agent available_file_paths: list[str] | None = None, # Always pass current available_file_paths ) -> None: """Add browser state as human message""" @@ -269,14 +265,8 @@ class MessageManager: if sensitive_data: self.sensitive_data_description = self._get_sensitive_data_description(browser_state_summary.url) - # Extract previous screenshots if we need more than 1 image and have agent history + # Use only the current screenshot screenshots = [] - if agent_history_list and self.images_per_step > 1: - # Get previous screenshots and filter out None values - raw_screenshots = agent_history_list.screenshots(n_last=self.images_per_step - 1, 
return_none_if_not_screenshot=False) - screenshots = [s for s in raw_screenshots if s is not None] - - # add current screenshot to the end if browser_state_summary.screenshot: screenshots.append(browser_state_summary.screenshot) diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py index fe55a7969..672e97e9d 100644 --- a/browser_use/agent/prompts.py +++ b/browser_use/agent/prompts.py @@ -108,36 +108,6 @@ class AgentMessagePrompt: self.vision_detail_level = vision_detail_level assert self.browser_state - @observe_debug(ignore_input=True, ignore_output=True, name='_deduplicate_screenshots') - def _deduplicate_screenshots(self, screenshots: list[str]) -> list[str]: - """ - Remove consecutive duplicate screenshots, keeping only the most recent of each. - - Args: - screenshots: List of base64-encoded screenshot strings in chronological order (oldest first) - - Returns: - List of screenshots with consecutive duplicates removed, maintaining chronological order - """ - if not screenshots: - return [] - - if len(screenshots) == 1: - return screenshots - - # Keep track of unique screenshots by comparing each with the next one - unique_screenshots = [] - - for i in range(len(screenshots)): - # Always keep the last screenshot - if i == len(screenshots) - 1: - unique_screenshots.append(screenshots[i]) - # Only keep screenshot if it's different from the next one - elif screenshots[i] != screenshots[i + 1]: - unique_screenshots.append(screenshots[i]) - - return unique_screenshots - @observe_debug(ignore_input=True, ignore_output=True, name='_get_browser_state_description') def _get_browser_state_description(self) -> str: elements_text = self.browser_state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes) @@ -277,12 +247,9 @@ Available tabs: # Start with text description content_parts: list[ContentPartTextParam | ContentPartImageParam] = [ContentPartTextParam(text=state_description)] - # Deduplicate screenshots, keeping only 
the most recent of each unique image - unique_screenshots = self._deduplicate_screenshots(self.screenshots) - # Add screenshots with labels - for i, screenshot in enumerate(unique_screenshots): - if i == len(unique_screenshots) - 1: + for i, screenshot in enumerate(self.screenshots): + if i == len(self.screenshots) - 1: label = 'Current screenshot:' else: # Use simple, accurate labeling since we don't have actual step timing info diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 5ee89c9fb..3633d464e 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -3,7 +3,6 @@ import gc import inspect import json import logging -import os import re import sys import tempfile @@ -171,7 +170,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): use_thinking: bool = True, flash_mode: bool = False, max_history_items: int = 40, - images_per_step: int = 1, page_extraction_llm: BaseChatModel | None = None, planner_llm: BaseChatModel | None = None, # Deprecated planner_interval: int = 1, # Deprecated @@ -260,7 +258,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): use_thinking=use_thinking, flash_mode=flash_mode, max_history_items=max_history_items, - images_per_step=images_per_step, page_extraction_llm=page_extraction_llm, planner_llm=None, # Always None now (deprecated) planner_interval=1, # Always 1 now (deprecated) @@ -281,8 +278,19 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Initialize state self.state = injected_agent_state or AgentState() - # Initialize file system + # Initialize history + self.history = AgentHistoryList(history=[], usage=None) + + # Initialize agent directory + import time + + timestamp = int(time.time()) + base_tmp = Path(tempfile.gettempdir()) + self.agent_directory = base_tmp / f'browser_use_agent_{self.id}_{timestamp}' + + # Initialize file system and screenshot service self._set_file_system(file_system_path) + self._set_screenshot_service() # Action setup 
self._setup_action_models() @@ -337,7 +345,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): message_context=self.settings.message_context, sensitive_data=sensitive_data, max_history_items=self.settings.max_history_items, - images_per_step=self.settings.images_per_step, vision_detail_level=self.settings.vision_detail_level, include_tool_call_examples=self.settings.include_tool_call_examples, ) @@ -564,10 +571,9 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.file_system = FileSystem(file_system_path) self.file_system_path = file_system_path else: - # create a temporary file system using agent ID - base_tmp = tempfile.gettempdir() # e.g., /tmp on Unix - self.file_system_path = os.path.join(base_tmp, f'browser_use_agent_{self.id}') - self.file_system = FileSystem(self.file_system_path) + # Use the agent directory for file system + self.file_system = FileSystem(self.agent_directory) + self.file_system_path = str(self.agent_directory) except Exception as e: logger.error(f'💾 Failed to initialize file system: {e}.') raise e @@ -577,6 +583,17 @@ class Agent(Generic[Context, AgentStructuredOutput]): logger.info(f'💾 File system path: {self.file_system_path}') + def _set_screenshot_service(self) -> None: + """Initialize screenshot service using agent directory""" + try: + from browser_use.screenshots.service import ScreenshotService + + self.screenshot_service = ScreenshotService(self.agent_directory) + logger.info(f'📸 Screenshot service initialized in: {self.agent_directory}/screenshots') + except Exception as e: + logger.error(f'📸 Failed to initialize screenshot service: {e}.') + raise e + def save_file_system_state(self) -> None: """Save current file system state to agent state""" if self.file_system: @@ -731,7 +748,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): use_vision=self.settings.use_vision, page_filtered_actions=page_filtered_actions if page_filtered_actions else None, sensitive_data=self.sensitive_data, - 
agent_history_list=self.state.history, # Pass AgentHistoryList for screenshots available_file_paths=self.available_file_paths, # Always pass current available_file_paths ) @@ -853,7 +869,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): ) # Use _make_history_item like main branch - self._make_history_item(self.state.last_model_output, browser_state_summary, self.state.last_result, metadata) + await self._make_history_item(self.state.last_model_output, browser_state_summary, self.state.last_result, metadata) # Log step completion summary self._log_step_completion_summary(self.step_start_time, self.state.last_result) @@ -949,7 +965,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.settings.save_conversation_path_encoding, ) - def _make_history_item( + async def _make_history_item( self, model_output: AgentOutput | None, browser_state_summary: BrowserStateSummary, @@ -963,12 +979,17 @@ class Agent(Generic[Context, AgentStructuredOutput]): else: interacted_elements = [None] + # Store screenshot and get path + screenshot_path = None + if browser_state_summary.screenshot: + screenshot_path = await self.screenshot_service.store_screenshot(browser_state_summary.screenshot, self.state.n_steps) + state_history = BrowserStateHistory( url=browser_state_summary.url, title=browser_state_summary.title, tabs=browser_state_summary.tabs, interacted_element=interacted_elements, - screenshot=browser_state_summary.screenshot, + screenshot_path=screenshot_path, ) history_item = AgentHistory( @@ -978,7 +999,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): metadata=metadata, ) - self.state.history.history.append(history_item) + self.history.add_item(history_item) def _remove_think_tags(self, text: str) -> str: THINK_TAGS = re.compile(r'.*?', re.DOTALL) @@ -1096,7 +1117,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Prepare action_history data correctly action_history_data = [] - for item in self.state.history.history: + for item in 
self.history.history: if item.model_output and item.model_output.action: # Convert each ActionModel in the step to its dictionary representation step_actions = [ @@ -1109,7 +1130,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Append None or [] if a step had no actions or no model output action_history_data.append(None) - final_res = self.state.history.final_result() + final_res = self.history.final_result() final_result_str = json.dumps(final_res) if final_res is not None else None self.telemetry.capture( @@ -1127,13 +1148,13 @@ class Agent(Generic[Context, AgentStructuredOutput]): cdp_url=urlparse(self.browser_session.cdp_url).hostname if self.browser_session and self.browser_session.cdp_url else None, - action_errors=self.state.history.errors(), + action_errors=self.history.errors(), action_history=action_history_data, - urls_visited=self.state.history.urls(), + urls_visited=self.history.urls(), steps=self.state.n_steps, total_input_tokens=token_summary.prompt_tokens, - total_duration_seconds=self.state.history.total_duration_seconds(), - success=self.state.history.is_successful(), + total_duration_seconds=self.history.total_duration_seconds(), + success=self.history.is_successful(), final_result_response=final_result_str, error_message=agent_run_error, ) @@ -1147,13 +1168,13 @@ class Agent(Generic[Context, AgentStructuredOutput]): """ await self.step(step_info) - if self.state.history.is_done(): + if self.history.is_done(): await self.log_completion() if self.register_done_callback: if inspect.iscoroutinefunction(self.register_done_callback): - await self.register_done_callback(self.state.history) + await self.register_done_callback(self.history) else: - self.register_done_callback(self.state.history) + self.register_done_callback(self.history) return True, True return False, False @@ -1266,22 +1287,22 @@ class Agent(Generic[Context, AgentStructuredOutput]): if on_step_end is not None: await on_step_end(self) - if self.state.history.is_done(): + if 
self.history.is_done(): self.logger.debug(f'🎯 Task completed after {step + 1} steps!') await self.log_completion() if self.register_done_callback: if inspect.iscoroutinefunction(self.register_done_callback): - await self.register_done_callback(self.state.history) + await self.register_done_callback(self.history) else: - self.register_done_callback(self.state.history) + self.register_done_callback(self.history) # Task completed break else: agent_run_error = 'Failed to complete task in maximum steps' - self.state.history.history.append( + self.history.add_item( AgentHistory( model_output=None, result=[ActionResult(error=agent_run_error, include_in_memory=True)], @@ -1290,7 +1311,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): title='', tabs=[], interacted_element=[], - screenshot=None, + screenshot_path=None, ), metadata=None, ) @@ -1299,23 +1320,23 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.info(f'❌ {agent_run_error}') self.logger.debug('📊 Collecting usage summary...') - self.state.history.usage = await self.token_cost_service.get_usage_summary() + self.history.usage = await self.token_cost_service.get_usage_summary() # set the model output schema and call it on the fly - if self.state.history._output_model_schema is None and self.output_model_schema is not None: - self.state.history._output_model_schema = self.output_model_schema + if self.history._output_model_schema is None and self.output_model_schema is not None: + self.history._output_model_schema = self.output_model_schema self.logger.debug('🏁 Agent.run() completed successfully') - return self.state.history + return self.history except KeyboardInterrupt: # Already handled by our signal handler, but catch any direct KeyboardInterrupt as well self.logger.info('Got KeyboardInterrupt during execution, returning current history') agent_run_error = 'KeyboardInterrupt' - self.state.history.usage = await self.token_cost_service.get_usage_summary() + self.history.usage = await 
self.token_cost_service.get_usage_summary() - return self.state.history + return self.history except Exception as e: self.logger.error(f'Agent run failed with exception: {e}', exc_info=True) @@ -1354,7 +1375,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Lazy import gif module to avoid heavy startup cost from browser_use.agent.gif import create_history_gif - create_history_gif(task=self.task, history=self.state.history, output_path=output_path) + create_history_gif(task=self.task, history=self.history, output_path=output_path) # Only emit output file event if GIF was actually created if Path(output_path).exists(): @@ -1479,7 +1500,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): async def log_completion(self) -> None: """Log the completion of the task""" - if self.state.history.is_successful(): + if self.history.is_successful(): self.logger.info('✅ Task completed successfully') else: self.logger.info('❌ Task completed without success') @@ -1613,7 +1634,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): """Save the history to a file""" if not file_path: file_path = 'AgentHistory.json' - self.state.history.save_to_file(file_path) + self.history.save_to_file(file_path) async def wait_until_resumed(self): await self._external_pause_event.wait() @@ -1751,14 +1772,14 @@ class Agent(Generic[Context, AgentStructuredOutput]): timestamp = datetime.now().isoformat() # Only declare variables that are used multiple times - structured_output = self.state.history.structured_output + structured_output = self.history.structured_output structured_output_json = json.dumps(structured_output.model_dump()) if structured_output else None - final_result = self.state.history.final_result() + final_result = self.history.final_result() git_info = get_git_info() - action_history = self.state.history.action_history() - action_errors = self.state.history.errors() - urls = self.state.history.urls() - usage = self.state.history.usage + action_history = 
self.history.action_history() + action_errors = self.history.errors() + urls = self.history.urls() + usage = self.history.usage return { 'trace': { @@ -1785,10 +1806,10 @@ class Agent(Generic[Context, AgentStructuredOutput]): 'final_result_response_truncated': ( final_result[:20000] if final_result and len(final_result) > 20000 else final_result ), - 'self_report_completed': 1 if self.state.history.is_done() else 0, - 'self_report_success': 1 if self.state.history.is_successful() else 0, - 'duration': self.state.history.total_duration_seconds(), - 'steps_taken': self.state.history.number_of_steps(), + 'self_report_completed': 1 if self.history.is_done() else 0, + 'self_report_success': 1 if self.history.is_successful() else 0, + 'duration': self.history.total_duration_seconds(), + 'steps_taken': self.history.number_of_steps(), 'usage': json.dumps(usage.model_dump()) if usage else None, }, 'trace_details': { @@ -1800,6 +1821,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): # AgentHistoryList methods 'structured_output': structured_output_json, 'final_result_response': final_result, - 'complete_history': _get_complete_history_without_screenshots(self.state.history.model_dump()), + 'complete_history': _get_complete_history_without_screenshots(self.history.model_dump()), }, } diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py index 044b23af0..0ee4eeb6d 100644 --- a/browser_use/agent/views.py +++ b/browser_use/agent/views.py @@ -56,7 +56,6 @@ class AgentSettings(BaseModel): use_thinking: bool = True flash_mode: bool = False # If enabled, disables evaluation_previous_goal and next_goal, and sets use_thinking = False max_history_items: int = 40 - images_per_step: int = 1 page_extraction_llm: BaseChatModel | None = None planner_llm: BaseChatModel | None = None @@ -76,7 +75,6 @@ class AgentState(BaseModel): n_steps: int = 1 consecutive_failures: int = 0 last_result: list[ActionResult] | None = None - history: AgentHistoryList = 
Field(default_factory=lambda: AgentHistoryList(history=[], usage=None)) last_plan: str | None = None last_model_output: AgentOutput | None = None paused: bool = False @@ -329,6 +327,10 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]): """Representation of the AgentHistoryList object""" return f'AgentHistoryList(all_results={self.action_results()}, all_model_outputs={self.model_actions()})' + def add_item(self, history_item: AgentHistory) -> None: + """Add a history item to the list""" + self.history.append(history_item) + def __repr__(self) -> str: """Representation of the AgentHistoryList object""" return self.__str__() @@ -443,20 +445,39 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]): """Get all unique URLs from history""" return [h.state.url if h.state.url is not None else None for h in self.history] - def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]: - """Get all screenshots from history""" + def screenshot_paths(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]: + """Get all screenshot paths from history""" if n_last == 0: return [] if n_last is None: if return_none_if_not_screenshot: - return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history] + return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self.history] else: - return [h.state.screenshot for h in self.history if h.state.screenshot is not None] + return [h.state.screenshot_path for h in self.history if h.state.screenshot_path is not None] else: if return_none_if_not_screenshot: - return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history[-n_last:]] + return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self.history[-n_last:]] else: - return [h.state.screenshot for h in self.history[-n_last:] if 
h.state.screenshot is not None] + return [h.state.screenshot_path for h in self.history[-n_last:] if h.state.screenshot_path is not None] + + def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]: + """Get all screenshots from history as base64 strings""" + if n_last == 0: + return [] + + history_items = self.history if n_last is None else self.history[-n_last:] + screenshots = [] + + for item in history_items: + screenshot_b64 = item.state.get_screenshot() + if screenshot_b64: + screenshots.append(screenshot_b64) + else: + if return_none_if_not_screenshot: + screenshots.append(None) + # If return_none_if_not_screenshot is False, we skip None values + + return screenshots def action_names(self) -> list[str]: """Get all action names from history""" diff --git a/browser_use/browser/views.py b/browser_use/browser/views.py index 03dffe6e3..20ef0b92b 100644 --- a/browser_use/browser/views.py +++ b/browser_use/browser/views.py @@ -76,12 +76,31 @@ class BrowserStateHistory: title: str tabs: list[TabInfo] interacted_element: list[DOMHistoryElement | None] | list[None] - screenshot: str | None = None + screenshot_path: str | None = None + + def get_screenshot(self) -> str | None: + """Load screenshot from disk and return as base64 string""" + if not self.screenshot_path: + return None + + import base64 + from pathlib import Path + + path_obj = Path(self.screenshot_path) + if not path_obj.exists(): + return None + + try: + with open(path_obj, 'rb') as f: + screenshot_data = f.read() + return base64.b64encode(screenshot_data).decode('utf-8') + except Exception: + return None def to_dict(self) -> dict[str, Any]: data = {} data['tabs'] = [tab.model_dump() for tab in self.tabs] - data['screenshot'] = self.screenshot + data['screenshot_path'] = self.screenshot_path data['interacted_element'] = [el.to_dict() if el else None for el in self.interacted_element] data['url'] = self.url data['title'] = self.title diff --git 
a/browser_use/cli.py b/browser_use/cli.py index abfc9883a..18a92d493 100644 --- a/browser_use/cli.py +++ b/browser_use/cli.py @@ -820,18 +820,18 @@ class BrowserUseApp(App): # Show token usage statistics if agent exists and has history if self.agent and hasattr(self.agent, 'state') and hasattr(self.agent.state, 'history'): # Get total tokens used - # total_tokens = self.agent.state.history.total_input_tokens() + # total_tokens = self.agent.history.total_input_tokens() # model_info.write(f'[white]Input tokens:[/] [green]{total_tokens:,}[/]') # Calculate tokens per step - num_steps = len(self.agent.state.history.history) + num_steps = len(self.agent.history.history) # if num_steps > 0: # avg_tokens_per_step = total_tokens / num_steps # model_info.write(f'[white]Avg tokens/step:[/] [green]{avg_tokens_per_step:,.1f}[/]') # Get the last step metadata to show the most recent LLM response time - if num_steps > 0 and self.agent.state.history.history[-1].metadata: - last_step = self.agent.state.history.history[-1] + if num_steps > 0 and self.agent.history.history[-1].metadata: + last_step = self.agent.history.history[-1] if last_step.metadata: step_duration = last_step.metadata.duration_seconds else: @@ -843,7 +843,7 @@ class BrowserUseApp(App): # model_info.write(f'[white]Avg tokens/sec:[/] [magenta]{tokens_per_second:.1f}[/]') # Show total duration - total_duration = self.agent.state.history.total_duration_seconds() + total_duration = self.agent.history.total_duration_seconds() if total_duration > 0: model_info.write(f'[white]Total Duration:[/] [magenta]{total_duration:.2f}s[/]') @@ -896,7 +896,7 @@ class BrowserUseApp(App): # Get all agent history items history_items = [] if hasattr(self.agent, 'state') and hasattr(self.agent.state, 'history'): - history_items = self.agent.state.history.history + history_items = self.agent.history.history if history_items: tasks_info.write('[bold yellow]STEPS:[/]') diff --git a/browser_use/screenshots/__init__.py 
b/browser_use/screenshots/__init__.py new file mode 100644 index 000000000..0e721412c --- /dev/null +++ b/browser_use/screenshots/__init__.py @@ -0,0 +1 @@ +# Screenshots package for browser-use diff --git a/browser_use/screenshots/service.py b/browser_use/screenshots/service.py new file mode 100644 index 000000000..b6929d83b --- /dev/null +++ b/browser_use/screenshots/service.py @@ -0,0 +1,48 @@ +""" +Screenshot storage service for browser-use agents. +""" + +import base64 +from pathlib import Path + +import anyio + + +class ScreenshotService: + """Simple screenshot storage service that saves screenshots to disk""" + + def __init__(self, agent_directory: str | Path): + """Initialize with agent directory path""" + self.agent_directory = Path(agent_directory) if isinstance(agent_directory, str) else agent_directory + + # Create screenshots subdirectory + self.screenshots_dir = self.agent_directory / 'screenshots' + self.screenshots_dir.mkdir(parents=True, exist_ok=True) + + async def store_screenshot(self, screenshot_b64: str, step_number: int) -> str: + """Store screenshot to disk and return the full path as string""" + screenshot_filename = f'step_{step_number}.png' + screenshot_path = self.screenshots_dir / screenshot_filename + + # Decode base64 and save to disk + screenshot_data = base64.b64decode(screenshot_b64) + + async with await anyio.open_file(screenshot_path, 'wb') as f: + await f.write(screenshot_data) + + return str(screenshot_path) + + async def get_screenshot(self, screenshot_path: str) -> str | None: + """Load screenshot from disk path and return as base64""" + if not screenshot_path: + return None + + path = Path(screenshot_path) + if not path.exists(): + return None + + # Load from disk and encode to base64 + async with await anyio.open_file(path, 'rb') as f: + screenshot_data = await f.read() + + return base64.b64encode(screenshot_data).decode('utf-8') diff --git a/docs/customize/agent-settings.mdx b/docs/customize/agent-settings.mdx index 
77f7b8dc1..4d7987df1 100644 --- a/docs/customize/agent-settings.mdx +++ b/docs/customize/agent-settings.mdx @@ -133,7 +133,7 @@ history = await agent.run() # Access (some) useful information history.urls() # List of visited URLs -history.screenshots() # List of screenshot paths +history.screenshot_paths() # List of screenshot paths history.action_names() # Names of executed actions history.extracted_content() # Content extracted during execution history.errors() # Any errors that occurred diff --git a/docs/customize/hooks.mdx b/docs/customize/hooks.mdx index d27673a58..b29f506a7 100644 --- a/docs/customize/hooks.mdx +++ b/docs/customize/hooks.mdx @@ -41,7 +41,7 @@ async def my_step_hook(agent: Agent): # https://playwright.dev/python/docs/api/class-page current_url = page.url - visit_log = agent.state.history.urls() + visit_log = agent.history.urls() previous_url = visit_log[-2] if len(visit_log) >= 2 else None print(f"Agent was last on URL: {previous_url} and is now on {current_url}") @@ -91,11 +91,11 @@ When working with agent hooks, you have access to the entire `Agent` instance. H - `agent.settings` contains all the configuration options passed to the `Agent(...)` at init time - `agent.llm` gives direct access to the main LLM object (e.g. `ChatOpenAI`) - `agent.state` gives access to lots of internal state, including agent thoughts, outputs, actions, etc. - - `agent.state.history.model_thoughts()`: Reasoning from Browser Use's model. - - `agent.state.history.model_outputs()`: Raw outputs from the Browsre Use's model. - - `agent.state.history.model_actions()`: Actions taken by the agent - - `agent.state.history.extracted_content()`: Content extracted from web pages - - `agent.state.history.urls()`: URLs visited by the agent + - `agent.history.model_thoughts()`: Reasoning from Browser Use's model. + - `agent.history.model_outputs()`: Raw outputs from the Browsre Use's model. 
+ - `agent.history.model_actions()`: Actions taken by the agent + - `agent.history.extracted_content()`: Content extracted from web pages + - `agent.history.urls()`: URLs visited by the agent - `agent.browser_session` gives direct access to the `BrowserSession()` and playwright objects - `agent.browser_session.get_current_page()`: Get the current playwright `Page` object the agent is focused on - `agent.browser_session.browser_context`: Get the current playwright `BrowserContext` object diff --git a/examples/custom-functions/custom_hooks_before_after_step.py b/examples/custom-functions/custom_hooks_before_after_step.py index bd40acbbf..9053248ff 100644 --- a/examples/custom-functions/custom_hooks_before_after_step.py +++ b/examples/custom-functions/custom_hooks_before_after_step.py @@ -154,7 +154,7 @@ async def record_activity(agent_obj): print('--> History:') # Assert agent has state to satisfy type checker assert hasattr(agent_obj, 'state'), 'Agent must have state attribute' - history = agent_obj.state.history + history = agent_obj.history model_thoughts = obj_to_json(obj=history.model_thoughts(), check_circular=False) @@ -164,7 +164,7 @@ async def record_activity(agent_obj): # prettyprinter.cpprint(model_thoughts_last_elem) # print("--- MODEL OUTPUT ACTION ---") - model_outputs = agent_obj.state.history.model_outputs() + model_outputs = agent_obj.history.model_outputs() model_outputs_json = obj_to_json(obj=model_outputs, check_circular=False) if len(model_outputs_json) > 0: @@ -172,7 +172,7 @@ async def record_activity(agent_obj): # prettyprinter.cpprint(model_outputs_json_last_elem) # print("--- MODEL INTERACTED ELEM ---") - model_actions = agent_obj.state.history.model_actions() + model_actions = agent_obj.history.model_actions() model_actions_json = obj_to_json(obj=model_actions, check_circular=False) if len(model_actions_json) > 0: @@ -180,14 +180,14 @@ async def record_activity(agent_obj): # prettyprinter.cpprint(model_actions_json_last_elem) # print("--- 
EXTRACTED CONTENT ---") - extracted_content = agent_obj.state.history.extracted_content() + extracted_content = agent_obj.history.extracted_content() extracted_content_json = obj_to_json(obj=extracted_content, check_circular=False) if len(extracted_content_json) > 0: extracted_content_json_last_elem = extracted_content_json[-1] # prettyprinter.cpprint(extracted_content_json_last_elem) # print("--- URLS ---") - urls = agent_obj.state.history.urls() + urls = agent_obj.history.urls() # prettyprinter.cpprint(urls) urls_json = obj_to_json(obj=urls, check_circular=False) diff --git a/examples/features/outsource_state.py b/examples/features/outsource_state.py index a030c8b91..d99dbc632 100644 --- a/examples/features/outsource_state.py +++ b/examples/features/outsource_state.py @@ -47,8 +47,6 @@ async def main(): if done and valid: break - agent_state.history.history = [] - # Save state to file async with await anyio.open_file('agent_state.json', 'w') as f: serialized = agent_state.model_dump_json(exclude={'history'}) diff --git a/tests/ci/test_gif_filtering.py b/tests/ci/test_gif_filtering.py index 8c8450354..ecb6ab6dd 100644 --- a/tests/ci/test_gif_filtering.py +++ b/tests/ci/test_gif_filtering.py @@ -9,7 +9,7 @@ from PIL import Image from browser_use import AgentHistoryList from browser_use.agent.gif import create_history_gif from browser_use.agent.views import ActionResult, AgentHistory, AgentOutput -from browser_use.browser.views import PLACEHOLDER_4PX_SCREENSHOT, BrowserStateHistory, TabInfo +from browser_use.browser.views import BrowserStateHistory, TabInfo @pytest.fixture @@ -49,9 +49,22 @@ def create_test_screenshot(width: int = 800, height: int = 600, color: tuple = ( async def test_gif_filters_out_placeholder_screenshots(test_dir): """Test that 4px placeholder screenshots from about:blank pages are filtered out of GIFs.""" + # Set up screenshot service for testing (still needed to create test files) + from browser_use.screenshots.service import ScreenshotService 
+ + screenshot_service = ScreenshotService(test_dir) + + # Helper function to store test screenshots + async def store_test_screenshot(screenshot_b64: str, step: int) -> str: + return await screenshot_service.store_screenshot(screenshot_b64, step) + # Create a history with mixed screenshots: real and placeholder history_items = [] + # Store test screenshots + real_screenshot_1_path = await store_test_screenshot(create_test_screenshot(800, 600, (100, 150, 200)), 2) + real_screenshot_2_path = await store_test_screenshot(create_test_screenshot(800, 600, (200, 100, 50)), 4) + # First item: about:blank placeholder (should be filtered) history_items.append( AgentHistory( @@ -63,7 +76,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir): ), result=[ActionResult()], state=BrowserStateHistory( - screenshot=PLACEHOLDER_4PX_SCREENSHOT, + screenshot_path=None, # Placeholder doesn't have a file path url='about:blank', title='New Tab', tabs=[TabInfo(page_id=1, url='about:blank', title='New Tab')], @@ -83,7 +96,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir): ), result=[ActionResult()], state=BrowserStateHistory( - screenshot=create_test_screenshot(800, 600, (100, 150, 200)), + screenshot_path=real_screenshot_1_path, url='https://example.com', title='Example', tabs=[TabInfo(page_id=1, url='https://example.com', title='Example')], @@ -103,7 +116,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir): ), result=[ActionResult()], state=BrowserStateHistory( - screenshot=PLACEHOLDER_4PX_SCREENSHOT, + screenshot_path=None, # Placeholder doesn't have a file path url='about:blank', title='New Tab', tabs=[TabInfo(page_id=2, url='about:blank', title='New Tab')], @@ -123,7 +136,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir): ), result=[ActionResult()], state=BrowserStateHistory( - screenshot=create_test_screenshot(800, 600, (200, 100, 50)), + screenshot_path=real_screenshot_2_path, url='https://example.com/page2', 
title='Page 2', tabs=[TabInfo(page_id=1, url='https://example.com/page2', title='Page 2')], @@ -190,7 +203,7 @@ async def test_gif_handles_all_placeholders(test_dir): ), result=[ActionResult()], state=BrowserStateHistory( - screenshot=PLACEHOLDER_4PX_SCREENSHOT, + screenshot_path=None, # Placeholder doesn't have a file path url='about:blank', title='New Tab', tabs=[TabInfo(page_id=1, url='about:blank', title='New Tab')], diff --git a/tests/ci/test_gif_generation_with_navigation.py b/tests/ci/test_gif_generation_with_navigation.py index fae6bc32e..31ff6dde0 100644 --- a/tests/ci/test_gif_generation_with_navigation.py +++ b/tests/ci/test_gif_generation_with_navigation.py @@ -85,9 +85,10 @@ async def test_gif_generation_with_real_navigation(httpserver, tmp_path): # Verify history contains real screenshots (not placeholders) has_real_screenshot = False for item in history.history: + screenshot_b64 = item.state.get_screenshot() if ( - item.state.screenshot - and item.state.screenshot + screenshot_b64 + and screenshot_b64 != 'iVBORw0KGgoAAAANSUhEUgAAAAQAAAAECAIAAAAmkwkpAAAAFElEQVR4nGP8//8/AwwwMSAB3BwAlm4DBfIlvvkAAAAASUVORK5CYII=' ): has_real_screenshot = True diff --git a/tests/old/test_core_functionality.py b/tests/old/test_core_functionality.py index 73cd82d42..8da0a134c 100644 --- a/tests/old/test_core_functionality.py +++ b/tests/old/test_core_functionality.py @@ -279,6 +279,6 @@ class TestCoreFunctionality: assert final_scroll_position > initial_scroll_position, 'Page did not scroll down' # Verify the action was executed - history = agent.state.history + history = agent.history action_names = history.action_names() assert 'scroll_down' in action_names From 97a75309e64b1ec8f62fb6721e73bc7b51b102f6 Mon Sep 17 00:00:00 2001 From: mertunsall Date: Sat, 2 Aug 2025 16:19:46 +0100 Subject: [PATCH 07/13] fix docs --- docs/customize/hooks.mdx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/customize/hooks.mdx b/docs/customize/hooks.mdx index 
b29f506a7..92c0b007b 100644 --- a/docs/customize/hooks.mdx +++ b/docs/customize/hooks.mdx @@ -91,8 +91,9 @@ When working with agent hooks, you have access to the entire `Agent` instance. H - `agent.settings` contains all the configuration options passed to the `Agent(...)` at init time - `agent.llm` gives direct access to the main LLM object (e.g. `ChatOpenAI`) - `agent.state` gives access to lots of internal state, including agent thoughts, outputs, actions, etc. +- `agent.history` gives access to historical data from the agent's execution: - `agent.history.model_thoughts()`: Reasoning from Browser Use's model. - - `agent.history.model_outputs()`: Raw outputs from the Browsre Use's model. + - `agent.history.model_outputs()`: Raw outputs from the Browser Use's model. - `agent.history.model_actions()`: Actions taken by the agent - `agent.history.extracted_content()`: Content extracted from web pages - `agent.history.urls()`: URLs visited by the agent From 90fa344b599d55c2984bd926161acfecbb681d8c Mon Sep 17 00:00:00 2001 From: Mert Unsal Date: Sat, 2 Aug 2025 16:43:09 +0100 Subject: [PATCH 08/13] Bump to 0.5.9 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d23fb5b81..c8aecbf34 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "browser-use" description = "Make websites accessible for AI agents" authors = [{ name = "Gregor Zunic" }] -version = "0.5.7" +version = "0.5.9" readme = "README.md" requires-python = ">=3.11,<4.0" classifiers = [ From 9f874a805c3baa3a8a99c408f962ebbcbdde7371 Mon Sep 17 00:00:00 2001 From: mertunsall Date: Sun, 3 Aug 2025 14:17:35 +0100 Subject: [PATCH 09/13] cache the user message too, helps anthropic models --- browser_use/agent/prompts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py index 672e97e9d..36802cea6 100644 --- a/browser_use/agent/prompts.py +++ 
b/browser_use/agent/prompts.py @@ -269,6 +269,6 @@ Available tabs: ) ) - return UserMessage(content=content_parts) + return UserMessage(content=content_parts, cache=True) - return UserMessage(content=state_description) + return UserMessage(content=state_description, cache=True) From 17fa1a2d7697071cb736cceb48a479f31874e46c Mon Sep 17 00:00:00 2001 From: mertunsall Date: Sun, 3 Aug 2025 14:32:57 +0100 Subject: [PATCH 10/13] Add an else clause to always sleep unless HTTP 200 is received: --- browser_use/browser/session.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index a5acb5a76..9dbb1bc60 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -1022,8 +1022,12 @@ class BrowserSession(BaseModel): try: response = await client.get(f'{self.cdp_url}json/version', timeout=1.0) if response.status_code == 200: - # self.logger.debug(f'✅ Chrome CDP port {debug_port} is ready') break + else: + # FIX: Always sleep if status != 200 + if i == 0: + self.logger.debug(f'⏳ Waiting for Chrome CDP port {debug_port} to become available...') + await asyncio.sleep(0.5) except (httpx.ConnectError, httpx.TimeoutException): if i == 0: self.logger.debug(f'⏳ Waiting for Chrome CDP port {debug_port} to become available...') From 48165d21e67559a773a6e37e403c00c9348835af Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 3 Aug 2025 14:00:19 +0000 Subject: [PATCH 11/13] Remove message_context parameter from agent and message manager Co-authored-by: mailmertunsal --- browser_use/agent/message_manager/service.py | 2 -- browser_use/agent/service.py | 6 +----- browser_use/agent/views.py | 1 - docs/customize/agent-settings.mdx | 13 ------------- examples/models/deepseek-chat.py | 2 +- 5 files changed, 2 insertions(+), 22 deletions(-) diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index a33f7bbc6..8126b1c4f 100644 --- 
a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -103,7 +103,6 @@ class MessageManager: state: MessageManagerState = MessageManagerState(), use_thinking: bool = True, include_attributes: list[str] | None = None, - message_context: str | None = None, sensitive_data: dict[str, str | dict[str, str]] | None = None, max_history_items: int | None = None, vision_detail_level: Literal['auto', 'low', 'high'] = 'auto', @@ -123,7 +122,6 @@ class MessageManager: # Store settings as direct attributes instead of in a settings object self.include_attributes = include_attributes or [] - self.message_context = message_context self.sensitive_data = sensitive_data self.last_input_messages = [] # Only initialize messages if state is empty diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 3633d464e..e67cb5fbd 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -162,7 +162,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): override_system_message: str | None = None, extend_system_message: str | None = None, validate_output: bool = False, - message_context: str | None = None, generate_gif: bool | str = False, available_file_paths: list[str] | None = None, include_attributes: list[str] = DEFAULT_INCLUDE_ATTRIBUTES, @@ -251,7 +250,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): override_system_message=override_system_message, extend_system_message=extend_system_message, validate_output=validate_output, - message_context=message_context, generate_gif=generate_gif, include_attributes=include_attributes, max_actions_per_step=max_actions_per_step, @@ -342,7 +340,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): use_thinking=self.settings.use_thinking, # Settings that were previously in MessageManagerSettings include_attributes=self.settings.include_attributes, - message_context=self.settings.message_context, sensitive_data=sensitive_data, 
max_history_items=self.settings.max_history_items, vision_detail_level=self.settings.vision_detail_level, @@ -602,8 +599,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): logger.error('💾 File system is not set up. Cannot save state.') raise ValueError('File system is not set up. Cannot save state.') - def _set_message_context(self) -> str | None: - return self.settings.message_context + def _set_browser_use_version_and_source(self, source_override: str | None = None) -> None: """Get the version from pyproject.toml and determine the source of the browser-use package""" diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py index 0ee4eeb6d..e0a82621f 100644 --- a/browser_use/agent/views.py +++ b/browser_use/agent/views.py @@ -36,7 +36,6 @@ class AgentSettings(BaseModel): max_failures: int = 3 retry_delay: int = 10 validate_output: bool = False - message_context: str | None = None generate_gif: bool | str = False override_system_message: str | None = None extend_system_message: str | None = None diff --git a/docs/customize/agent-settings.mdx b/docs/customize/agent-settings.mdx index 4d7987df1..db5aa02da 100644 --- a/docs/customize/agent-settings.mdx +++ b/docs/customize/agent-settings.mdx @@ -173,19 +173,7 @@ agent = Agent( ) ``` -## Run with message context -You can configure the agent and provide a separate message to help the LLM understand the task better. - -```python -from browser_use.llm import ChatOpenAI - -agent = Agent( - task="your task", - message_context="Additional information about the task", - llm = ChatOpenAI(model='gpt-4o') -) -``` ## Run with planner model @@ -226,7 +214,6 @@ Using a separate planner model can help: ### Optional Parameters -- `message_context`: Additional information about the task to help the LLM understand the task better. - `initial_actions`: List of initial actions to run before the main task. - `max_actions_per_step`: Maximum number of actions to run in a step. Defaults to `10`. 
- `max_failures`: Maximum number of failures before giving up. Defaults to `3`. diff --git a/examples/models/deepseek-chat.py b/examples/models/deepseek-chat.py index 0d3b86c22..cf05ceace 100644 --- a/examples/models/deepseek-chat.py +++ b/examples/models/deepseek-chat.py @@ -28,7 +28,7 @@ async def main(): task='What should we pay attention to in the recent new rules on tariffs in China-US trade?', llm=llm, use_vision=False, - message_context=extend_system_message, + extend_system_message=extend_system_message, ) await agent.run() From ba4f30327342c91864c54bf9aac40d983b17419d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 3 Aug 2025 14:02:57 +0000 Subject: [PATCH 12/13] Remove planner model documentation section Co-authored-by: mailmertunsal --- docs/customize/agent-settings.mdx | 35 ------------------------------- 1 file changed, 35 deletions(-) diff --git a/docs/customize/agent-settings.mdx b/docs/customize/agent-settings.mdx index db5aa02da..754e01b4f 100644 --- a/docs/customize/agent-settings.mdx +++ b/docs/customize/agent-settings.mdx @@ -175,42 +175,7 @@ agent = Agent( -## Run with planner model -You can configure the agent to use a separate planner model for high-level task planning: - -```python -from browser_use.llm import ChatOpenAI - -# Initialize models -llm = ChatOpenAI(model='gpt-4o') -planner_llm = ChatOpenAI(model='o3-mini') - -agent = Agent( - task="your task", - llm=llm, - planner_llm=planner_llm, # Separate model for planning - use_vision_for_planner=False, # Disable vision for planner - planner_interval=4 # Plan every 4 steps -) -``` - -### Planner Parameters - -- `planner_llm`: A chat model instance used for high-level task planning. Can be a smaller/cheaper model than the main LLM. -- `use_vision_for_planner`: Enable/disable vision capabilities for the planner model. Defaults to `True`. -- `planner_interval`: Number of steps between planning phases. Defaults to `1`. 
- -Using a separate planner model can help: - -- Reduce costs by using a smaller model for high-level planning -- Improve task decomposition and strategic thinking -- Better handle complex, multi-step tasks - - - The planner model is optional. If not specified, the agent will not use the - planner model. - ### Optional Parameters From c39093ab6122c6e32ed5f124b983221ae84e916e Mon Sep 17 00:00:00 2001 From: mertunsall Date: Sun, 3 Aug 2025 15:04:23 +0100 Subject: [PATCH 13/13] format --- browser_use/agent/service.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index e67cb5fbd..6778e5f9f 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -599,8 +599,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): logger.error('💾 File system is not set up. Cannot save state.') raise ValueError('File system is not set up. Cannot save state.') - - def _set_browser_use_version_and_source(self, source_override: str | None = None) -> None: """Get the version from pyproject.toml and determine the source of the browser-use package""" # Use the helper function for version detection