From 461bce7b229768ce5bcaa57e46811ce977f5699d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:02:28 -0700 Subject: [PATCH 01/45] Request screenshot --- browser_use/agent/message_manager/service.py | 20 +++++++++++++++++--- browser_use/tools/service.py | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index d64108e12..2d922a125 100644 --- a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -305,11 +305,25 @@ class MessageManager: self.sensitive_data = effective_sensitive_data self.sensitive_data_description = self._get_sensitive_data_description(browser_state_summary.url) - # Use only the current screenshot + # Use only the current screenshot, but check if action results request screenshot inclusion screenshots = [] - if browser_state_summary.screenshot: + include_screenshot_requested = False + + # Check if any action results request screenshot inclusion + if result: + for action_result in result: + if action_result.metadata and action_result.metadata.get('include_screenshot'): + include_screenshot_requested = True + logger.debug('Screenshot inclusion requested by action result') + break + + # Include screenshot if either use_vision is True, or if explicitly requested by an action + if (use_vision or include_screenshot_requested) and browser_state_summary.screenshot: screenshots.append(browser_state_summary.screenshot) + # Override use_vision if screenshot was explicitly requested + effective_use_vision = use_vision or include_screenshot_requested + # Create single state message with all content assert browser_state_summary state_message = AgentMessagePrompt( @@ -327,7 +341,7 @@ class MessageManager: vision_detail_level=self.vision_detail_level, include_recent_events=self.include_recent_events, 
sample_images=self.sample_images, - ).get_user_message(use_vision) + ).get_user_message(effective_use_vision) # Set the state message with caching enabled self._set_message_with_type(state_message, 'state') diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 868aacf5b..d41d7823b 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -844,6 +844,21 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en long_term_memory=f"Tried scrolling to text '{text}' but it was not found", ) + @self.registry.action( + 'Request to include a screenshot in your next browser state. Use this when you need visual confirmation or when the page contains complex visual information that is hard to understand from the DOM alone.' + ) + async def take_screenshot(): + """Request that a screenshot be included in the next observation""" + memory = 'Requested screenshot for next observation' + msg = f'📸 {memory}' + logger.info(msg) + + # Return flag in metadata to signal that screenshot should be included + return ActionResult( + extracted_content=memory, + metadata={'include_screenshot': True}, + ) + # Dropdown Actions @self.registry.action( From 889efd3ee29b977e73728f128803a069940777b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:59:05 -0700 Subject: [PATCH 02/45] System prompt --- browser_use/agent/system_prompt.md | 5 +++-- browser_use/agent/system_prompt_flash.md | 8 ++++++-- browser_use/agent/system_prompt_no_thinking.md | 5 +++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md index 2c96badef..3944bcf8d 100644 --- a/browser_use/agent/system_prompt.md +++ b/browser_use/agent/system_prompt.md @@ -20,7 +20,7 @@ At every step, your input will consist of: 1. : A chronological event stream including your previous actions and their results. 
2. : Current , summary of , , and . 3. : Current URL, open tabs, interactive elements indexed for actions, and visible page content. -4. : Screenshot of the browser with bounding boxes around interactive elements. +4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before 5. This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step. @@ -66,8 +66,9 @@ Note that: -You will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. +If you used take_screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. If an interactive index inside your browser_state does not have text information, then the interactive index is written at the top center of it's element in the screenshot. +Use take_screenshot if you are unsure or simply want more information. diff --git a/browser_use/agent/system_prompt_flash.md b/browser_use/agent/system_prompt_flash.md index c8d3feaa3..32795764f 100644 --- a/browser_use/agent/system_prompt_flash.md +++ b/browser_use/agent/system_prompt_flash.md @@ -20,7 +20,7 @@ At every step, your input will consist of: 1. : A chronological event stream including your previous actions and their results. 2. : Current , summary of , , and . 3. : Current URL, open tabs, interactive elements indexed for actions, and visible page content. -4. : Screenshot of the browser with bounding boxes around interactive elements. +4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before 5. This will be displayed only if your previous action was extract_structured_data or read_file. 
This data is only shown in the current step. @@ -64,10 +64,14 @@ Note that: -You will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. +If you used take_screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. If an interactive index inside your browser_state does not have text information, then the interactive index is written at the top center of it's element in the screenshot. +Use take_screenshot if you are unsure or simply want more information. + + + Strictly follow these rules while using the browser and navigating the web: - Only interact with elements that have a numeric [index] assigned. diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md index f51bcd171..fdfaf57c3 100644 --- a/browser_use/agent/system_prompt_no_thinking.md +++ b/browser_use/agent/system_prompt_no_thinking.md @@ -20,7 +20,7 @@ At every step, your input will consist of: 1. : A chronological event stream including your previous actions and their results. 2. : Current , summary of , , and . 3. : Current URL, open tabs, interactive elements indexed for actions, and visible page content. -4. : Screenshot of the browser with bounding boxes around interactive elements. +4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before 5. This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step. @@ -66,8 +66,9 @@ Note that: -You will be provided with a screenshot of the current page with bounding boxes around interactive elements. 
This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. +If you used take_screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. If an interactive index inside your browser_state does not have text information, then the interactive index is written at the top center of it's element in the screenshot. +Use take_screenshot if you are unsure or simply want more information. From e9e19f785de6fbd887b46c7ef9e378ffd8220b7c Mon Sep 17 00:00:00 2001 From: Prakhar Jain Date: Sat, 4 Oct 2025 20:46:41 +0530 Subject: [PATCH 03/45] added option to interactive elements --- browser_use/dom/serializer/serializer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/dom/serializer/serializer.py b/browser_use/dom/serializer/serializer.py index 1a515379d..e57cee5fc 100644 --- a/browser_use/dom/serializer/serializer.py +++ b/browser_use/dom/serializer/serializer.py @@ -677,7 +677,7 @@ class DOMTreeSerializer: # 5. 
Keep if has role suggesting interactivity if node.original_node.attributes: role = node.original_node.attributes.get('role') - if role in ['button', 'link', 'checkbox', 'radio', 'tab', 'menuitem']: + if role in ['button', 'link', 'checkbox', 'radio', 'tab', 'menuitem', 'option']: return False # Default: exclude this child From d3abbcb2c741645ba07bf3bbb2402ce0a45a2b2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sat, 4 Oct 2025 12:56:40 -0700 Subject: [PATCH 04/45] Include use_vision auto --- browser_use/agent/message_manager/service.py | 24 ++++++++++++++----- browser_use/agent/service.py | 8 ++++--- browser_use/agent/system_prompt.md | 2 +- browser_use/agent/system_prompt_flash.md | 2 +- .../agent/system_prompt_no_thinking.md | 2 +- browser_use/agent/views.py | 4 ++-- browser_use/telemetry/views.py | 4 ++-- docs/customize/agent/all-parameters.mdx | 2 +- docs/customize/tools/available.mdx | 3 +++ 9 files changed, 34 insertions(+), 17 deletions(-) diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index 2d922a125..521e10c64 100644 --- a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import Literal +from typing import Literal, Union from browser_use.agent.message_manager.views import ( HistoryItem, @@ -285,7 +285,7 @@ class MessageManager: model_output: AgentOutput | None = None, result: list[ActionResult] | None = None, step_info: AgentStepInfo | None = None, - use_vision=True, + use_vision: Union[bool, Literal['auto']] = 'auto', page_filtered_actions: str | None = None, sensitive_data=None, available_file_paths: list[str] | None = None, # Always pass current available_file_paths @@ -317,12 +317,24 @@ class MessageManager: logger.debug('Screenshot inclusion requested by action result') break - # Include screenshot if 
either use_vision is True, or if explicitly requested by an action - if (use_vision or include_screenshot_requested) and browser_state_summary.screenshot: + # Handle different use_vision modes: + # - "auto": Only include screenshot if explicitly requested by action (e.g., take_screenshot) + # - True: Always include screenshot + # - False: Never include screenshot + include_screenshot = False + if use_vision is True: + # Always include screenshot when use_vision=True + include_screenshot = True + elif use_vision == 'auto': + # Only include screenshot if explicitly requested by action when use_vision="auto" + include_screenshot = include_screenshot_requested + # else: use_vision is False, never include screenshot (include_screenshot stays False) + + if include_screenshot and browser_state_summary.screenshot: screenshots.append(browser_state_summary.screenshot) - # Override use_vision if screenshot was explicitly requested - effective_use_vision = use_vision or include_screenshot_requested + # Use vision in the user message if screenshots are included + effective_use_vision = len(screenshots) > 0 # Create single state message with all content assert browser_state_summary diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index c9d314a4f..21c65edbc 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -9,7 +9,7 @@ import time from collections.abc import Awaitable, Callable from datetime import datetime from pathlib import Path -from typing import Any, Generic, Literal, TypeVar +from typing import Any, Generic, Literal, TypeVar, Union from urllib.parse import urlparse from dotenv import load_dotenv @@ -154,7 +154,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): register_should_stop_callback: Callable[[], Awaitable[bool]] | None = None, # Agent settings output_model_schema: type[AgentStructuredOutput] | None = None, - use_vision: bool = True, + use_vision: Union[bool, Literal['auto']] = 'auto', 
save_conversation_path: str | Path | None = None, save_conversation_path_encoding: str | None = 'utf-8', max_failures: int = 3, @@ -255,7 +255,9 @@ class Agent(Generic[Context, AgentStructuredOutput]): elif controller is not None: self.tools = controller else: - self.tools = Tools(display_files_in_done_text=display_files_in_done_text) + # Exclude take_screenshot tool when use_vision=False + exclude_actions = ['take_screenshot'] if use_vision is False else [] + self.tools = Tools(exclude_actions=exclude_actions, display_files_in_done_text=display_files_in_done_text) # Structured output self.output_model_schema = output_model_schema diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md index 3944bcf8d..b48a68cb6 100644 --- a/browser_use/agent/system_prompt.md +++ b/browser_use/agent/system_prompt.md @@ -20,7 +20,7 @@ At every step, your input will consist of: 1. : A chronological event stream including your previous actions and their results. 2. : Current , summary of , , and . 3. : Current URL, open tabs, interactive elements indexed for actions, and visible page content. -4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before +4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before, this will contain a screenshot. 5. This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step. diff --git a/browser_use/agent/system_prompt_flash.md b/browser_use/agent/system_prompt_flash.md index 32795764f..9f14aad29 100644 --- a/browser_use/agent/system_prompt_flash.md +++ b/browser_use/agent/system_prompt_flash.md @@ -20,7 +20,7 @@ At every step, your input will consist of: 1. : A chronological event stream including your previous actions and their results. 2. : Current , summary of , , and . 3. 
: Current URL, open tabs, interactive elements indexed for actions, and visible page content. -4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before +4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before, this will contain a screenshot. 5. This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step. diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md index fdfaf57c3..c905e5bb6 100644 --- a/browser_use/agent/system_prompt_no_thinking.md +++ b/browser_use/agent/system_prompt_no_thinking.md @@ -20,7 +20,7 @@ At every step, your input will consist of: 1. : A chronological event stream including your previous actions and their results. 2. : Current , summary of , , and . 3. : Current URL, open tabs, interactive elements indexed for actions, and visible page content. -4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before +4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before, this will contain a screenshot. 5. This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step. 
diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py index daa6ee78d..c433935b3 100644 --- a/browser_use/agent/views.py +++ b/browser_use/agent/views.py @@ -5,7 +5,7 @@ import logging import traceback from dataclasses import dataclass from pathlib import Path -from typing import Any, Generic, Literal +from typing import Any, Generic, Literal, Union from openai import RateLimitError from pydantic import BaseModel, ConfigDict, Field, ValidationError, create_model, model_validator @@ -33,7 +33,7 @@ logger = logging.getLogger(__name__) class AgentSettings(BaseModel): """Configuration options for the Agent""" - use_vision: bool = True + use_vision: Union[bool, Literal['auto']] = 'auto' vision_detail_level: Literal['auto', 'low', 'high'] = 'auto' save_conversation_path: str | Path | None = None save_conversation_path_encoding: str | None = 'utf-8' diff --git a/browser_use/telemetry/views.py b/browser_use/telemetry/views.py index a5fd8ee83..48b3a8e6f 100644 --- a/browser_use/telemetry/views.py +++ b/browser_use/telemetry/views.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from collections.abc import Sequence from dataclasses import asdict, dataclass -from typing import Any +from typing import Any, Literal, Union from browser_use.config import is_running_in_docker @@ -29,7 +29,7 @@ class AgentTelemetryEvent(BaseTelemetryEvent): model_provider: str max_steps: int max_actions_per_step: int - use_vision: bool + use_vision: Union[bool, Literal['auto']] version: str source: str cdp_url: str | None diff --git a/docs/customize/agent/all-parameters.mdx b/docs/customize/agent/all-parameters.mdx index 05f6e217e..0f31dbb6e 100644 --- a/docs/customize/agent/all-parameters.mdx +++ b/docs/customize/agent/all-parameters.mdx @@ -13,7 +13,7 @@ mode: "wide" - `output_model_schema`: Pydantic model class for structured output validation. 
[Example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py) ### Vision & Processing -- `use_vision` (default: `True`): Enable/disable vision capabilities for processing screenshots +- `use_vision` (default: `"auto"`): Vision mode - `"auto"` includes take_screenshot tool but only uses vision when requested, `True` always includes screenshots, `False` never includes screenshots and excludes take_screenshot tool - `vision_detail_level` (default: `'auto'`): Screenshot detail level - `'low'`, `'high'`, or `'auto'` - `page_extraction_llm`: Separate LLM model for page content extraction. You can choose a small & fast model because it only needs to extract text from the page (default: same as `llm`) diff --git a/docs/customize/tools/available.mdx b/docs/customize/tools/available.mdx index 1bd6f4dfe..c045f3dca 100644 --- a/docs/customize/tools/available.mdx +++ b/docs/customize/tools/available.mdx @@ -32,6 +32,9 @@ mode: "wide" ### Content Extraction - **`extract_structured_data`** - Extract data from webpages using LLM +### Visual Analysis +- **`take_screenshot`** - Request a screenshot in your next browser state for visual confirmation + ### Form Controls - **`get_dropdown_options`** - Get dropdown option values - **`select_dropdown_option`** - Select dropdown options From d01447499884d222c037204f90b0bfe2e1566f95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sat, 4 Oct 2025 12:59:11 -0700 Subject: [PATCH 05/45] Linter --- browser_use/agent/message_manager/service.py | 4 ++-- browser_use/agent/service.py | 4 ++-- browser_use/agent/views.py | 4 ++-- browser_use/telemetry/views.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index 521e10c64..b8abcfc3f 100644 --- a/browser_use/agent/message_manager/service.py +++ 
b/browser_use/agent/message_manager/service.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import Literal, Union +from typing import Literal from browser_use.agent.message_manager.views import ( HistoryItem, @@ -285,7 +285,7 @@ class MessageManager: model_output: AgentOutput | None = None, result: list[ActionResult] | None = None, step_info: AgentStepInfo | None = None, - use_vision: Union[bool, Literal['auto']] = 'auto', + use_vision: bool | Literal['auto'] = 'auto', page_filtered_actions: str | None = None, sensitive_data=None, available_file_paths: list[str] | None = None, # Always pass current available_file_paths diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 21c65edbc..6b0a2fa0d 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -9,7 +9,7 @@ import time from collections.abc import Awaitable, Callable from datetime import datetime from pathlib import Path -from typing import Any, Generic, Literal, TypeVar, Union +from typing import Any, Generic, Literal, TypeVar from urllib.parse import urlparse from dotenv import load_dotenv @@ -154,7 +154,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): register_should_stop_callback: Callable[[], Awaitable[bool]] | None = None, # Agent settings output_model_schema: type[AgentStructuredOutput] | None = None, - use_vision: Union[bool, Literal['auto']] = 'auto', + use_vision: bool | Literal['auto'] = 'auto', save_conversation_path: str | Path | None = None, save_conversation_path_encoding: str | None = 'utf-8', max_failures: int = 3, diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py index c433935b3..a8ae7f3c4 100644 --- a/browser_use/agent/views.py +++ b/browser_use/agent/views.py @@ -5,7 +5,7 @@ import logging import traceback from dataclasses import dataclass from pathlib import Path -from typing import Any, Generic, Literal, Union +from typing import Any, Generic, Literal from openai import 
RateLimitError from pydantic import BaseModel, ConfigDict, Field, ValidationError, create_model, model_validator @@ -33,7 +33,7 @@ logger = logging.getLogger(__name__) class AgentSettings(BaseModel): """Configuration options for the Agent""" - use_vision: Union[bool, Literal['auto']] = 'auto' + use_vision: bool | Literal['auto'] = 'auto' vision_detail_level: Literal['auto', 'low', 'high'] = 'auto' save_conversation_path: str | Path | None = None save_conversation_path_encoding: str | None = 'utf-8' diff --git a/browser_use/telemetry/views.py b/browser_use/telemetry/views.py index 48b3a8e6f..486842f8e 100644 --- a/browser_use/telemetry/views.py +++ b/browser_use/telemetry/views.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from collections.abc import Sequence from dataclasses import asdict, dataclass -from typing import Any, Literal, Union +from typing import Any, Literal from browser_use.config import is_running_in_docker @@ -29,7 +29,7 @@ class AgentTelemetryEvent(BaseTelemetryEvent): model_provider: str max_steps: int max_actions_per_step: int - use_vision: Union[bool, Literal['auto']] + use_vision: bool | Literal['auto'] version: str source: str cdp_url: str | None From 3bbdcb1e976dfcd0582970c6e766674612129813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sat, 4 Oct 2025 13:05:30 -0700 Subject: [PATCH 06/45] bump-anthropic-version-for-linter --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e369a33d4..42e24718d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "authlib>=1.6.0", "google-genai>=1.29.0,<2.0.0", "openai>=1.99.2,<2.0.0", - "anthropic>=0.58.2,<1.0.0", + "anthropic>=0.68.1,<1.0.0", "groq>=0.30.0", "ollama>=0.5.1", "google-api-python-client>=2.174.0", From 50fb58284a472fcefeabd958c26af53081312ed4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= 
<67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 00:00:28 -0700 Subject: [PATCH 07/45] short-systemprompt --- browser_use/agent/system_prompt_flash.md | 168 ----------------------- 1 file changed, 168 deletions(-) diff --git a/browser_use/agent/system_prompt_flash.md b/browser_use/agent/system_prompt_flash.md index 9f14aad29..b1ee5da06 100644 --- a/browser_use/agent/system_prompt_flash.md +++ b/browser_use/agent/system_prompt_flash.md @@ -1,174 +1,7 @@ You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in . - -You excel at following tasks: -1. Navigating complex websites and extracting precise information -2. Automating form submissions and interactive web actions -3. Gathering and saving information -4. Using your filesystem effectively to decide what to keep in your context -5. Operate effectively in an agent loop -6. Efficiently performing diverse web tasks - - -- Default working language: **English** -- Always respond in the same language as the user request - - - -At every step, your input will consist of: -1. : A chronological event stream including your previous actions and their results. -2. : Current , summary of , , and . -3. : Current URL, open tabs, interactive elements indexed for actions, and visible page content. -4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before, this will contain a screenshot. -5. This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step. - - - -Agent history will be given as a list of step information as follows: - -: -Memory: Your memory / thinking of this step -Action Results: Your actions and their results - - -and system messages wrapped in tag. - - - -USER REQUEST: This is your ultimate objective and always remains visible. -- This has the highest priority. 
Make the user happy. -- If the user request is very specific - then carefully follow each step and dont skip or hallucinate steps. -- If the task is open ended you can plan yourself how to get it done. - - - -1. Browser State will be given as: - -Current URL: URL of the page you are currently viewing. -Open Tabs: Open tabs with their ids. Interactive Elements: All interactive elements will be provided in format as [index]text where -- index: Numeric identifier for interaction -- type: HTML element type (button, input, etc.) -- text: Element description - -Examples: -[33]
User form
-\t*[35] - -Note that: -- Only elements with numeric indexes in [] are interactive -- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index) -- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list. -- Pure text elements without [] are not interactive. -
- - -If you used take_screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. -If an interactive index inside your browser_state does not have text information, then the interactive index is written at the top center of it's element in the screenshot. -Use take_screenshot if you are unsure or simply want more information. - - - - - - -Strictly follow these rules while using the browser and navigating the web: -- Only interact with elements that have a numeric [index] assigned. -- Only use indexes that are explicitly provided. -- If research is needed, open a **new tab** instead of reusing the current one. -- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list. -- By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page. -- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages). -- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack). -- If expected elements are missing, try refreshing, scrolling, or navigating back. -- If the page is not fully loaded, use the wait action. -- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible. -- Call extract_structured_data only if the information you are looking for is not visible in your otherwise always just use the needed text from the . -- Calling the extract_structured_data tool is expensive! 
DO NOT query the same page with the same extract_structured_data query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool. -- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. -- If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step. -- If the includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient. -- The is the ultimate goal. If the user specifies explicit steps, they have always the highest priority. -- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion. -- Don't login into a page if you don't have to. Don't login if you don't have the credentials. -- There are 2 types of tasks always first think which type of request you are dealing with: -1. Very specific step by step instructions: -- Follow them as very precise and don't skip steps. Try to complete everything as requested. -2. Open ended tasks. Plan yourself, be creative in achieving them. -- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search. -- If you reach a PDF viewer, the file is automatically downloaded and you can see its path in . You can either read the file or scroll in the page to see more. - - - -- You have access to a persistent file system which you can use to track progress, store results, and manage long tasks. 
-- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Use `replace_file_str` tool to update markers in `todo.md` as first action whenever you complete an item. This file should guide your step-by-step execution when you have a long running task. -- If you are writing a `csv` file, make sure to use double quotes if cell elements contain commas. -- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary. -- If exists, includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access. -- If the task is really long, initialize a `results.md` file to accumulate your results. -- DO NOT use the file system if the task is less than 10 steps! - - - -You must call the `done` action in one of two cases: -- When you have fully completed the USER REQUEST. -- When you reach the final allowed step (`max_steps`), even if the task is incomplete. -- If it is ABSOLUTELY IMPOSSIBLE to continue. - -The `done` action is your opportunity to terminate and share your findings with the user. -- Set `success` to `true` only if the full USER REQUEST has been completed with no missing components. -- If any part of the request is missing, incomplete, or uncertain, set `success` to `false`. -- You can use the `text` field of the `done` action to communicate your findings and `files_to_display` to send file attachments to the user, e.g. `["results.md"]`. -- Put ALL the relevant information you found so far in the `text` field when you call `done` action. -- Combine `text` and `files_to_display` to provide a coherent reply to the user and fulfill the USER REQUEST. -- You are ONLY ALLOWED to call `done` as a single action. Don't call it together with other actions. 
-- If the user asks for specified format, such as "return JSON with following structure", "return a list of format...", MAKE sure to use the right format in your answer. -- If the user asks for a structured output, your `done` action's schema will be modified. Take this schema into account when solving the task! - - - -- You are allowed to use a maximum of {max_actions} actions per step. - -If you are allowed multiple actions, you can specify multiple actions in the list to be executed sequentially (one after another). -- If the page changes after an action, the sequence is interrupted and you get the new state. You can see this in your agent history when this happens. - - - -You can output multiple actions in one step. Try to be efficient where it makes sense. Do not predict actions which do not make sense for the current page. - -**Recommended Action Combinations:** -- `input_text` + `click_element_by_index` → Fill form field and submit/search in one step -- `input_text` + `input_text` → Fill multiple form fields -- `click_element_by_index` + `click_element_by_index` → Navigate through multi-step flows (when the page does not navigate between clicks) -- `scroll` with num_pages 10 + `extract_structured_data` → Scroll to the bottom of the page to load more content before extracting structured data -- File operations + browser actions - -Do not try multiple different paths in one step. Always have one clear goal per step. -Its important that you see in the next step if your action was successful, so do not chain actions which change the browser state multiple times, e.g. -- do not use click_element_by_index and then go_to_url, because you would not see if the click was successful or not. -- or do not use switch_tab and switch_tab together, because you would not see the state in between. -- do not use input_text and then scroll, because you would not see if the input text was successful or not. - - - -Be clear and concise in your decision-making. 
Exhibit the following reasoning patterns to successfully achieve the : -- Reason about to track progress and context toward . -- Analyze the most recent "Next Goal" and "Action Result" in and clearly state what you previously tried to achieve. -- Analyze all relevant items in , , , , and the screenshot to understand your state. -- Explicitly judge success/failure/uncertainty of the last action. Never assume an action succeeded just because it appears to be executed in your last step in . For example, you might have "Action 1/1: Input '2025-05-05' into element 3." in your history even though inputting text failed. Always verify using (screenshot) as the primary ground truth. If a screenshot is unavailable, fall back to . If the expected change is missing, mark the last action as failed (or uncertain) and plan a recovery. -- If todo.md is empty and the task is multi-step, generate a stepwise plan in todo.md using file tools. -- Analyze `todo.md` to guide and track your progress. -- If any todo.md items are finished, mark them as complete in the file. -- Analyze whether you are stuck, e.g. when you repeat the same actions multiple times without any progress. Then consider alternative approaches e.g. scrolling for more context or send_keys to interact with keys directly or different pages. -- Analyze the where one-time information are displayed due to your previous action. Reason about whether you want to keep this information in memory and plan writing them into a file if applicable using the file tools. -- If you see information relevant to , plan saving the information into a file. -- Before writing data into a file, analyze the and check if the file already has some content to avoid overwriting. -- Decide what concise, actionable context should be stored in memory to inform future reasoning. -- When ready to finish, state you are preparing to call done and communicate completion/results to the user. 
-- Before done, use read_file to verify file contents intended for user output. -- Always reason about the . Make sure to carefully analyze the specific steps and information required. E.g. specific filters, specific form fields, specific information to search. Make sure to always compare the current trajactory with the user request and think carefully if thats how the user requested it. - You must respond with a valid JSON in this exact format: @@ -177,5 +10,4 @@ You must respond with a valid JSON in this exact format: "action":[{{"go_to_url": {{ "url": "url_value"}}}}] }} -Action list should NEVER be empty. From 9a655e56c51d6abb991dc6e749ba791940466698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 11:09:15 -0700 Subject: [PATCH 08/45] Shorter tool description --- .gitignore | 2 + browser_use/tools/service.py | 123 ++++++++++------------------------- 2 files changed, 37 insertions(+), 88 deletions(-) diff --git a/.gitignore b/.gitignore index 3f66345ea..b2c71bb34 100644 --- a/.gitignore +++ b/.gitignore @@ -63,3 +63,5 @@ screenshot.png all_github_issues_progress.md all_github_issues.md + +todo-input-token.md diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index d41d7823b..5815b618d 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -115,7 +115,7 @@ class Tools(Generic[Context]): # Basic Navigation Actions @self.registry.action( - 'Search a query with search engine which defaults to DuckDuckGo. Dont specify search_engine unless user asks for different search engine. Available search engines: duckduckgo, google, bing.', + 'Search query (defaults DuckDuckGo). 
Options: duckduckgo, google, bing.', param_model=SearchAction, ) async def search(params: SearchAction, browser_session: BrowserSession): @@ -158,7 +158,7 @@ class Tools(Generic[Context]): return ActionResult(error=f'Failed to search {params.search_engine} for "{params.query}": {str(e)}') @self.registry.action( - 'Navigate to URL, optionally set new_tab=True to open in new tab, otherwise default is False.', + 'Navigate to URL. Set new_tab=True to open in new tab.', param_model=GoToUrlAction, ) async def go_to_url(params: GoToUrlAction, browser_session: BrowserSession): @@ -218,9 +218,7 @@ class Tools(Generic[Context]): error_msg = f'Failed to go back: {str(e)}' return ActionResult(error=error_msg) - @self.registry.action( - 'Wait for x seconds (default 3) (max 30 seconds). This can be used to wait until the page is fully loaded.' - ) + @self.registry.action('Wait x seconds (default 3, max 30). ') async def wait(seconds: int = 3): # Cap wait time at maximum 30 seconds # Reduce the wait time by 3 seconds to account for the llm call which takes at least 3 seconds @@ -236,7 +234,7 @@ class Tools(Generic[Context]): # Element Interaction Actions @self.registry.action( - 'Click an element by index. Only indices from your browser_state are allowed. Never use an index that is not inside your current browser_state. Optionally set ctrl=True to open any resulting navigation in a new tab.', + 'Click element by index from browser_state. Set ctrl=True to open in new tab.', param_model=ClickElementAction, ) async def click(params: ClickElementAction, browser_session: BrowserSession): @@ -290,7 +288,7 @@ class Tools(Generic[Context]): return ActionResult(error=error_msg) @self.registry.action( - 'Input text into an input interactive element. 
Only input text into indices that are inside your current browser_state and are valid input fields.', + 'Input text into element by index from browser_state.', param_model=InputTextAction, ) async def input_text( @@ -352,7 +350,7 @@ class Tools(Generic[Context]): return ActionResult(error=error_msg) @self.registry.action( - 'Upload file to interactive element with file path. Only upload files to indices that are inside your current browser_state and are valid file upload fields.', + 'Upload file to element by index from browser_state.', param_model=UploadFileAction, ) async def upload_file( @@ -503,7 +501,7 @@ class Tools(Generic[Context]): # Tab Management Actions - @self.registry.action('Switch to tab with tab_id.', param_model=SwitchTabAction) + @self.registry.action('Switch to tab by tab_id.', param_model=SwitchTabAction) async def switch_tab(params: SwitchTabAction, browser_session: BrowserSession): # Simple switch tab logic try: @@ -525,7 +523,7 @@ class Tools(Generic[Context]): memory = f'Attempted to switch to tab #{params.tab_id}' return ActionResult(extracted_content=memory, long_term_memory=memory) - @self.registry.action('Close an existing tab', param_model=CloseTabAction) + @self.registry.action('Close tab by tab_id.', param_model=CloseTabAction) async def close_tab(params: CloseTabAction, browser_session: BrowserSession): # Simple close tab logic try: @@ -557,20 +555,11 @@ class Tools(Generic[Context]): # This action is temporarily disabled as it needs refactoring to use events @self.registry.action( - """This tool sends the markdown of the current page with the query to an LLM to extract structured, semantic data (e.g. product description, price, all information about XYZ) from the markdown of the current webpage based on a query. 
-Only use when: -- You are sure that you are on the right page for the query -- You know exactly the information you need to extract from the page -- You did not previously call this tool on the same page -You can not use this tool to: -- Get interactive elements like buttons, links, dropdowns, menus, etc. -- If you previously asked extract_structured_data on the same page with the same query, you should not call it again. - -Set extract_links=True only if your query requires extracting links/URLs from the page. -Use start_from_char to start extraction from a specific character position (use if extraction was previously truncated and you want more content). - -If this tool does not return the desired outcome, do not call it again, use scroll_to_text or scroll to find the desired information. -""", + """Extract semantic data from page markdown via LLM query (e.g. product info, prices). +Use when: on right page, know what to extract, haven't used on same page before. +Can't get: interactive elements (buttons, links, dropdowns). +Set extract_links=True for URLs. Use start_from_char if truncated. +If fails, use scroll_to_text or scroll instead.""", ) async def extract_structured_data( query: str, @@ -690,11 +679,9 @@ You will be given a query and the markdown of a webpage that has been filtered t raise RuntimeError(str(e)) @self.registry.action( - """Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 10.0 for ten pages, etc.). -Default behavior is to scroll by one page. This is enough for most cases. -Optionally, if there are multiple scroll containers, use frame_element_index parameter with an element inside the container you want to scroll in. For that you must use indices that exist in your browser_state (works well for dropdowns and custom UI components). 
-If you need to get to the bottom of the page, use a high number of pages at once like 10 to get to the bottom of the page. -Note: For multiple pages (>=1.0), scrolls are performed one page at a time to ensure reliability. Page height is detected from viewport, fallback is 1000px per page.""", + """Scroll page by num_pages (down=True for down, False for up). Default 1 page, use 0.5 for half, 10 for bottom. +For specific containers, use frame_element_index from browser_state (works with dropdowns, custom UI). +Multiple pages (>=1.0) scroll sequentially. Page height from viewport or 1000px fallback.""", param_model=ScrollAction, ) async def scroll(params: ScrollAction, browser_session: BrowserSession): @@ -803,7 +790,7 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en return ActionResult(error=error_msg) @self.registry.action( - 'Send strings of special keys to use e.g. Escape, Backspace, Insert, PageDown, Delete, Enter, or Shortcuts such as `Control+o`, `Control+Shift+T`', + 'Send special keys (Escape, Enter, PageDown) or shortcuts (Control+o, Control+Shift+T).', param_model=SendKeysAction, ) async def send_keys(params: SendKeysAction, browser_session: BrowserSession): @@ -822,7 +809,7 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en return ActionResult(error=error_msg) @self.registry.action( - description='Scroll to a text in the current page. This helps you to be efficient. Prefer this tool over scrolling step by step if you know what to scroll to.', + description='Scroll to text on page. 
Prefer over step-by-step scrolling when target known.', ) async def scroll_to_text(text: str, browser_session: BrowserSession): # type: ignore # Dispatch scroll to text event @@ -844,9 +831,7 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en long_term_memory=f"Tried scrolling to text '{text}' but it was not found", ) - @self.registry.action( - 'Request to include a screenshot in your next browser state. Use this when you need visual confirmation or when the page contains complex visual information that is hard to understand from the DOM alone.' - ) + @self.registry.action('Request screenshot in next browser state. Use for visual confirmation or complex visual content.') async def take_screenshot(): """Request that a screenshot be included in the next observation""" memory = 'Requested screenshot for next observation' @@ -862,7 +847,7 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en # Dropdown Actions @self.registry.action( - 'Get list of values for a dropdown input field. Only works on dropdown-style form elements (, ARIA select). Only for dropdown elements.', param_model=GetDropdownOptionsAction, ) async def get_dropdown_options(params: GetDropdownOptionsAction, browser_session: BrowserSession): @@ -888,7 +873,7 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en ) @self.registry.action( - 'Select dropdown option by exact text from any dropdown type (native , ARIA, custom). Searches element and children.', param_model=SelectDropdownOptionAction, ) async def select_dropdown_option(params: SelectDropdownOptionAction, browser_session: BrowserSession): @@ -931,9 +916,7 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en return ActionResult(error=error_msg) # File System Actions - @self.registry.action( - 'Write or append content to file_name in file system. Allowed extensions are .md, .txt, .json, .csv, .pdf. 
For .pdf files, write the content in markdown format and it will automatically be converted to a properly formatted PDF document.' - ) + @self.registry.action('Write/append to file (.md, .txt, .json, .csv, .pdf). PDF: write markdown, auto-converts to PDF.') async def write_file( file_name: str, content: str, @@ -954,14 +937,14 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en return ActionResult(extracted_content=result, long_term_memory=result) @self.registry.action( - 'Replace old_str with new_str in file_name. old_str must exactly match the string to replace in original text. Recommended tool to mark completed items in todo.md or change specific contents in a file.' + 'Replace old_str with new_str in file. old_str must match exactly. For todo.md updates or specific edits.' ) async def replace_file_str(file_name: str, old_str: str, new_str: str, file_system: FileSystem): result = await file_system.replace_file_str(file_name, old_str, new_str) logger.info(f'💾 {result}') return ActionResult(extracted_content=result, long_term_memory=result) - @self.registry.action('Read file_name from file system') + @self.registry.action('Read file from file system.') async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem): if available_file_paths and file_name in available_file_paths: result = await file_system.read_file(file_name, external_file=True) @@ -991,58 +974,22 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en ) @self.registry.action( - """This JavaScript code gets executed with Runtime.evaluate and 'returnByValue': True, 'awaitPromise': True + """Execute JavaScript with Runtime.evaluate (returnByValue:true, awaitPromise:true). -SYNTAX RULES - FAILURE TO FOLLOW CAUSES "Uncaught at line 0" ERRORS: -- ALWAYS wrap your code in IIFE: (function(){ ... })() or (async function(){ ... 
})() for async code -- ALWAYS add try-catch blocks to prevent execution errors -- ALWAYS use proper semicolons and valid JavaScript syntax -- NEVER write multiline code without proper IIFE wrapping -- ALWAYS validate elements exist before accessing them +SYNTAX RULES - ALWAYS wrap in IIFE or get "Uncaught at line 0": +- IIFE: (function(){ ... })() or async: (async function(){ ... })() +- Add try-catch, proper semicolons, validate elements exist -EXAMPLES: -Use this tool when other tools do not work on the first try as expected or when a more general tool is needed, e.g. for filling a form all at once, hovering, dragging, extracting only links, extracting content from the page, press and hold, hovering, clicking on coordinates, zooming, use this if the user provides custom selectors which you can otherwise not interact with .... -You can also use it to explore the website. -- Write code to solve problems you could not solve with other tools. -- Don't write comments in here, no human reads that. -- Write only valid js code. -- use this to e.g. extract + filter links, convert the page to json into the format you need etc... +Use when other tools fail or need custom logic (forms, hover, drag, extract links, custom selectors, shadow DOM, React/Vue/Angular, etc.). - -- limit the output otherwise your context will explode -- think if you deal with special elements like iframes / shadow roots etc -- Adopt your strategy for React Native Web, React, Angular, Vue, MUI pages etc. -- e.g. with synthetic events, keyboard simulation, shadow DOM, etc. - -PROPER SYNTAX EXAMPLES: +Examples: CORRECT: (function(){ try { const el = document.querySelector('#id'); return el ? 
el.value : 'not found'; } catch(e) { return 'Error: ' + e.message; } })() CORRECT: (async function(){ try { await new Promise(r => setTimeout(r, 100)); return 'done'; } catch(e) { return 'Error: ' + e.message; } })() +WRONG: document.querySelector('#id').value (no IIFE) -WRONG: const el = document.querySelector('#id'); el ? el.value : ''; -WRONG: document.querySelector('#id').value -WRONG: Multiline code without IIFE wrapping +Shadow DOM: (function(){ try { const hosts = document.querySelectorAll('*'); for (let host of hosts) { if (host.shadowRoot) { const el = host.shadowRoot.querySelector('#target'); if (el) return el.textContent; } } return 'Not found'; } catch(e) { return 'Error: ' + e.message; } })() -SHADOW DOM ACCESS EXAMPLE: -(function(){ - try { - const hosts = document.querySelectorAll('*'); - for (let host of hosts) { - if (host.shadowRoot) { - const el = host.shadowRoot.querySelector('#target'); - if (el) return el.textContent; - } - } - return 'Not found'; - } catch(e) { - return 'Error: ' + e.message; - } -})() - -## Return values: -- Async functions (with await, promises, timeouts) are automatically handled -- Returns strings, numbers, booleans, and serialized objects/arrays -- Use JSON.stringify() for complex objects: JSON.stringify(Array.from(document.querySelectorAll('a')).map(el => el.textContent.trim())) -""", +Returns strings, numbers, booleans, objects/arrays. Use JSON.stringify() for complex objects.""", ) async def execute_js(code: str, browser_session: BrowserSession): # Execute JavaScript with proper error handling and promise support @@ -1207,7 +1154,7 @@ SHADOW DOM ACCESS EXAMPLE: self.display_files_in_done_text = display_files_in_done_text @self.registry.action( - 'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached', + 'Complete task with return text. Set success=True if finished, False if not (e.g. 
max steps reached).', param_model=StructuredOutputAction[output_model], ) async def done(params: StructuredOutputAction): @@ -1229,7 +1176,7 @@ SHADOW DOM ACCESS EXAMPLE: else: @self.registry.action( - 'Complete task - provide a summary of results for the user. Set success=True if task completed successfully, false otherwise. Text should be your response to the user summarizing results. Include files you would like to display to the user in files_to_display.', + 'Complete task with summary. Set success=True if completed successfully. Include files in files_to_display.', param_model=DoneAction, ) async def done(params: DoneAction, file_system: FileSystem): From 22acf58424acd985a9932d9a2a1176823fa5bb00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 11:22:54 -0700 Subject: [PATCH 09/45] Shorter tools --- browser_use/tools/service.py | 65 +++++++++++++----------------------- browser_use/tools/views.py | 52 ++++++++++++----------------- 2 files changed, 44 insertions(+), 73 deletions(-) diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 5815b618d..4c3b883f0 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -115,7 +115,7 @@ class Tools(Generic[Context]): # Basic Navigation Actions @self.registry.action( - 'Search query (defaults DuckDuckGo). Options: duckduckgo, google, bing.', + 'Search query.', param_model=SearchAction, ) async def search(params: SearchAction, browser_session: BrowserSession): @@ -158,7 +158,7 @@ class Tools(Generic[Context]): return ActionResult(error=f'Failed to search {params.search_engine} for "{params.query}": {str(e)}') @self.registry.action( - 'Navigate to URL. 
Set new_tab=True to open in new tab.', + 'Navigate to URL.', param_model=GoToUrlAction, ) async def go_to_url(params: GoToUrlAction, browser_session: BrowserSession): @@ -218,7 +218,7 @@ class Tools(Generic[Context]): error_msg = f'Failed to go back: {str(e)}' return ActionResult(error=error_msg) - @self.registry.action('Wait x seconds (default 3, max 30). ') + @self.registry.action('Wait for page load.') async def wait(seconds: int = 3): # Cap wait time at maximum 30 seconds # Reduce the wait time by 3 seconds to account for the llm call which takes at least 3 seconds @@ -234,7 +234,7 @@ class Tools(Generic[Context]): # Element Interaction Actions @self.registry.action( - 'Click element by index from browser_state. Set ctrl=True to open in new tab.', + 'Click element.', param_model=ClickElementAction, ) async def click(params: ClickElementAction, browser_session: BrowserSession): @@ -288,7 +288,7 @@ class Tools(Generic[Context]): return ActionResult(error=error_msg) @self.registry.action( - 'Input text into element by index from browser_state.', + 'Input text.', param_model=InputTextAction, ) async def input_text( @@ -350,7 +350,7 @@ class Tools(Generic[Context]): return ActionResult(error=error_msg) @self.registry.action( - 'Upload file to element by index from browser_state.', + 'Upload file.', param_model=UploadFileAction, ) async def upload_file( @@ -501,7 +501,7 @@ class Tools(Generic[Context]): # Tab Management Actions - @self.registry.action('Switch to tab by tab_id.', param_model=SwitchTabAction) + @self.registry.action('Switch tab.', param_model=SwitchTabAction) async def switch_tab(params: SwitchTabAction, browser_session: BrowserSession): # Simple switch tab logic try: @@ -523,7 +523,7 @@ class Tools(Generic[Context]): memory = f'Attempted to switch to tab #{params.tab_id}' return ActionResult(extracted_content=memory, long_term_memory=memory) - @self.registry.action('Close tab by tab_id.', param_model=CloseTabAction) + @self.registry.action('Close 
tab.', param_model=CloseTabAction) async def close_tab(params: CloseTabAction, browser_session: BrowserSession): # Simple close tab logic try: @@ -555,11 +555,7 @@ class Tools(Generic[Context]): # This action is temporarily disabled as it needs refactoring to use events @self.registry.action( - """Extract semantic data from page markdown via LLM query (e.g. product info, prices). -Use when: on right page, know what to extract, haven't used on same page before. -Can't get: interactive elements (buttons, links, dropdowns). -Set extract_links=True for URLs. Use start_from_char if truncated. -If fails, use scroll_to_text or scroll instead.""", + """Extract page data via LLM. Use when on right page, know what to extract. Can't get interactive elements.""", ) async def extract_structured_data( query: str, @@ -679,9 +675,7 @@ You will be given a query and the markdown of a webpage that has been filtered t raise RuntimeError(str(e)) @self.registry.action( - """Scroll page by num_pages (down=True for down, False for up). Default 1 page, use 0.5 for half, 10 for bottom. -For specific containers, use frame_element_index from browser_state (works with dropdowns, custom UI). -Multiple pages (>=1.0) scroll sequentially. Page height from viewport or 1000px fallback.""", + 'Scroll page. Multiple pages scroll sequentially.', param_model=ScrollAction, ) async def scroll(params: ScrollAction, browser_session: BrowserSession): @@ -790,7 +784,7 @@ Multiple pages (>=1.0) scroll sequentially. Page height from viewport or 1000px return ActionResult(error=error_msg) @self.registry.action( - 'Send special keys (Escape, Enter, PageDown) or shortcuts (Control+o, Control+Shift+T).', + 'Send keys.', param_model=SendKeysAction, ) async def send_keys(params: SendKeysAction, browser_session: BrowserSession): @@ -809,7 +803,7 @@ Multiple pages (>=1.0) scroll sequentially. 
Page height from viewport or 1000px return ActionResult(error=error_msg) @self.registry.action( - description='Scroll to text on page. Prefer over step-by-step scrolling when target known.', + description='Scroll to text.', ) async def scroll_to_text(text: str, browser_session: BrowserSession): # type: ignore # Dispatch scroll to text event @@ -831,7 +825,7 @@ Multiple pages (>=1.0) scroll sequentially. Page height from viewport or 1000px long_term_memory=f"Tried scrolling to text '{text}' but it was not found", ) - @self.registry.action('Request screenshot in next browser state. Use for visual confirmation or complex visual content.') + @self.registry.action('Request screenshot.') async def take_screenshot(): """Request that a screenshot be included in the next observation""" memory = 'Requested screenshot for next observation' @@ -847,7 +841,7 @@ Multiple pages (>=1.0) scroll sequentially. Page height from viewport or 1000px # Dropdown Actions @self.registry.action( - 'Get dropdown values (, ARIA, custom). Searches element and children.', + 'Select dropdown option.', param_model=SelectDropdownOptionAction, ) async def select_dropdown_option(params: SelectDropdownOptionAction, browser_session: BrowserSession): @@ -916,7 +910,7 @@ Multiple pages (>=1.0) scroll sequentially. Page height from viewport or 1000px return ActionResult(error=error_msg) # File System Actions - @self.registry.action('Write/append to file (.md, .txt, .json, .csv, .pdf). PDF: write markdown, auto-converts to PDF.') + @self.registry.action('Write/append file.') async def write_file( file_name: str, content: str, @@ -936,15 +930,13 @@ Multiple pages (>=1.0) scroll sequentially. Page height from viewport or 1000px logger.info(f'💾 {result}') return ActionResult(extracted_content=result, long_term_memory=result) - @self.registry.action( - 'Replace old_str with new_str in file. old_str must match exactly. For todo.md updates or specific edits.' 
- ) + @self.registry.action('Replace in file.') async def replace_file_str(file_name: str, old_str: str, new_str: str, file_system: FileSystem): result = await file_system.replace_file_str(file_name, old_str, new_str) logger.info(f'💾 {result}') return ActionResult(extracted_content=result, long_term_memory=result) - @self.registry.action('Read file from file system.') + @self.registry.action('Read file.') async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem): if available_file_paths and file_name in available_file_paths: result = await file_system.read_file(file_name, external_file=True) @@ -974,22 +966,11 @@ Multiple pages (>=1.0) scroll sequentially. Page height from viewport or 1000px ) @self.registry.action( - """Execute JavaScript with Runtime.evaluate (returnByValue:true, awaitPromise:true). + """Execute JS. MUST wrap in IIFE: (function(){...})() or async: (async function(){...})() +Use when other tools fail or need custom logic. -SYNTAX RULES - ALWAYS wrap in IIFE or get "Uncaught at line 0": -- IIFE: (function(){ ... })() or async: (async function(){ ... })() -- Add try-catch, proper semicolons, validate elements exist - -Use when other tools fail or need custom logic (forms, hover, drag, extract links, custom selectors, shadow DOM, React/Vue/Angular, etc.). - -Examples: CORRECT: (function(){ try { const el = document.querySelector('#id'); return el ? 
el.value : 'not found'; } catch(e) { return 'Error: ' + e.message; } })() -CORRECT: (async function(){ try { await new Promise(r => setTimeout(r, 100)); return 'done'; } catch(e) { return 'Error: ' + e.message; } })() -WRONG: document.querySelector('#id').value (no IIFE) - -Shadow DOM: (function(){ try { const hosts = document.querySelectorAll('*'); for (let host of hosts) { if (host.shadowRoot) { const el = host.shadowRoot.querySelector('#target'); if (el) return el.textContent; } } return 'Not found'; } catch(e) { return 'Error: ' + e.message; } })() - -Returns strings, numbers, booleans, objects/arrays. Use JSON.stringify() for complex objects.""", +WRONG: document.querySelector('#id').value""", ) async def execute_js(code: str, browser_session: BrowserSession): # Execute JavaScript with proper error handling and promise support @@ -1154,7 +1135,7 @@ Returns strings, numbers, booleans, objects/arrays. Use JSON.stringify() for com self.display_files_in_done_text = display_files_in_done_text @self.registry.action( - 'Complete task with return text. Set success=True if finished, False if not (e.g. max steps reached).', + 'Complete task with structured output.', param_model=StructuredOutputAction[output_model], ) async def done(params: StructuredOutputAction): @@ -1176,7 +1157,7 @@ Returns strings, numbers, booleans, objects/arrays. Use JSON.stringify() for com else: @self.registry.action( - 'Complete task with summary. Set success=True if completed successfully. 
Include files in files_to_display.', + 'Complete task.', param_model=DoneAction, ) async def done(params: DoneAction, file_system: FileSystem): diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py index c93ecc02a..a019890e6 100644 --- a/browser_use/tools/views.py +++ b/browser_use/tools/views.py @@ -6,7 +6,7 @@ from pydantic import BaseModel, ConfigDict, Field # Action Input Models class SearchAction(BaseModel): query: str - search_engine: str = 'duckduckgo' # Options: 'duckduckgo', 'google', 'bing' + search_engine: str = Field(default='duckduckgo', description='duckduckgo, google, bing') # Backward compatibility alias @@ -15,65 +15,59 @@ SearchAction = SearchAction class GoToUrlAction(BaseModel): url: str - new_tab: bool = False # True to open in new tab, False to navigate in current tab + new_tab: bool = Field(default=False) class ClickElementAction(BaseModel): - index: int = Field(ge=1, description='index of the element to click') + index: int = Field(ge=1, description='from browser_state') ctrl: bool | None = Field( default=None, - description='Set to True to open the navigation in a new background tab (Ctrl+Click behavior). 
Optional.', + description='True=New background tab (Ctrl+Click)', ) # expect_download: bool = Field(default=False, description='set True if expecting a download, False otherwise') # moved to downloads_watchdog.py # click_count: int = 1 # TODO class InputTextAction(BaseModel): - index: int = Field(ge=0, description='index of the element to input text into, 0 is the page') + index: int = Field(ge=1, description='from browser_state') text: str - clear_existing: bool = Field(default=True, description='set True to clear existing text, False to append to existing text') + clear_existing: bool = Field(default=True, description='True to clear, False to append') class DoneAction(BaseModel): - text: str - success: bool - files_to_display: list[str] | None = [] + text: str = Field(description='summary for user') + success: bool = Field(description='True if completed') + files_to_display: list[str] | None = Field(default=[], description='files to display') T = TypeVar('T', bound=BaseModel) class StructuredOutputAction(BaseModel, Generic[T]): - success: bool = True + success: bool = Field(default=True, description='True if finished, False if not') data: T class SwitchTabAction(BaseModel): - tab_id: str = Field( - min_length=4, - max_length=4, - description="tab_id to switch to which is displayed as 'Tab ' in the browser_state.", - ) # last 4 chars of TargetID + tab_id: str = Field(min_length=4, max_length=4, description="from browser_state ('Tab ')") class CloseTabAction(BaseModel): - tab_id: str = Field( - min_length=4, max_length=4, description="tab_id to close which is displayed as 'Tab ' in the browser_state." - ) # last 4 chars of TargetID + tab_id: str = Field(min_length=4, max_length=4, description="from browser_state ('Tab ')") class ScrollAction(BaseModel): - down: bool # True to scroll down, False to scroll up - num_pages: float = 1.0 # Number of pages to scroll (0.5 = half page, 1.0 = one page, etc.) 
- frame_element_index: int | None = None # Optional element index to find scroll container for + down: bool = Field(description='True=down, False=up') + num_pages: float = Field(default=1.0, description='pages to scroll (0.5=half, 1=page, 10=bottom)') + frame_element_index: int | None = Field(default=None, description='index for specific container') class SendKeysAction(BaseModel): - keys: str + keys: str = Field(description='keys (Escape, Enter, PageDown) or shortcuts (Control+o)') class UploadFileAction(BaseModel): - index: int + index: int = Field(description='from browser_state') path: str @@ -82,19 +76,15 @@ class ExtractPageContentAction(BaseModel): class NoParamsAction(BaseModel): - """ - Accepts absolutely anything in the incoming data - and discards it, so the final parsed model is empty. - """ + """Accepts any input, discards it, returns empty model.""" model_config = ConfigDict(extra='ignore') - # No fields defined - all inputs are ignored automatically class GetDropdownOptionsAction(BaseModel): - index: int = Field(ge=1, description='index of the dropdown element to get the option values for') + index: int = Field(ge=1, description='dropdown from browser_state') class SelectDropdownOptionAction(BaseModel): - index: int = Field(ge=1, description='index of the dropdown element to select an option for') - text: str = Field(description='the text or exact value of the option to select') + index: int = Field(ge=1, description='dropdown from browser_state') + text: str = Field(description='exact text/value to select') From 7b995b7fc123e50ea4461740202c35ff8bc6b3ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 11:29:19 -0700 Subject: [PATCH 10/45] Refactor action field descriptions in AgentOutput models - Removed redundant description from action field in AgentOutput and its subclasses. - Updated action extraction documentation in Tools to clarify usage and limitations. 
- Enhanced search_engine field description in SearchAction for better clarity on default behavior. --- browser_use/agent/views.py | 5 ++--- browser_use/tools/service.py | 6 +++--- browser_use/tools/views.py | 4 +++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py index a8ae7f3c4..c816c41f2 100644 --- a/browser_use/agent/views.py +++ b/browser_use/agent/views.py @@ -155,7 +155,6 @@ class AgentOutput(BaseModel): next_goal: str | None = None action: list[ActionModel] = Field( ..., - description='List of actions to execute', json_schema_extra={'min_items': 1}, # Ensure at least one action is provided ) @@ -208,7 +207,7 @@ class AgentOutput(BaseModel): __base__=AgentOutputNoThinking, action=( list[custom_actions], # type: ignore - Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}), + Field(..., json_schema_extra={'min_items': 1}), ), __module__=AgentOutputNoThinking.__module__, ) @@ -237,7 +236,7 @@ class AgentOutput(BaseModel): __base__=AgentOutputFlashMode, action=( list[custom_actions], # type: ignore - Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}), + Field(..., json_schema_extra={'min_items': 1}), ), __module__=AgentOutputFlashMode.__module__, ) diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 4c3b883f0..8e24773e9 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -555,7 +555,7 @@ class Tools(Generic[Context]): # This action is temporarily disabled as it needs refactoring to use events @self.registry.action( - """Extract page data via LLM. Use when on right page, know what to extract. Can't get interactive elements.""", + """Extract page data via LLM. Use when on right page, know what to extract. Can't get interactive elements. 
Don't call again on same page with same query.""", ) async def extract_structured_data( query: str, @@ -967,8 +967,8 @@ You will be given a query and the markdown of a webpage that has been filtered t @self.registry.action( """Execute JS. MUST wrap in IIFE: (function(){...})() or async: (async function(){...})() -Use when other tools fail or need custom logic. - +Use when other tools fail. Limit output. For complex objects use JSON.stringify(). +Don't use comments. CORRECT: (function(){ try { const el = document.querySelector('#id'); return el ? el.value : 'not found'; } catch(e) { return 'Error: ' + e.message; } })() WRONG: document.querySelector('#id').value""", ) diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py index a019890e6..44fab1347 100644 --- a/browser_use/tools/views.py +++ b/browser_use/tools/views.py @@ -6,7 +6,9 @@ from pydantic import BaseModel, ConfigDict, Field # Action Input Models class SearchAction(BaseModel): query: str - search_engine: str = Field(default='duckduckgo', description='duckduckgo, google, bing') + search_engine: str = Field( + default='duckduckgo', description='duckduckgo, google, bing (use duckduckgo by default because less captchas)' + ) # Backward compatibility alias From 55d10605fd0c858cd091bf9d519a3853632a6be2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 11:59:12 -0700 Subject: [PATCH 11/45] Shorter tools --- browser_use/tools/service.py | 44 ++++++++++++++++++------------------ browser_use/tools/views.py | 28 +++++++++++------------ 2 files changed, 35 insertions(+), 37 deletions(-) diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 8e24773e9..3e7b526a6 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -131,10 +131,10 @@ class Tools(Generic[Context]): 'bing': f'https://www.bing.com/search?q={encoded_query}', } - if params.search_engine.lower() not in 
search_engines: - return ActionResult(error=f'Unsupported search engine: {params.search_engine}. Options: duckduckgo, google, bing') + if params.engine.lower() not in search_engines: + return ActionResult(error=f'Unsupported search engine: {params.engine}. Options: duckduckgo, google, bing') - search_url = search_engines[params.search_engine.lower()] + search_url = search_engines[params.engine.lower()] # Simple tab logic: use current tab by default use_new_tab = False @@ -149,13 +149,13 @@ class Tools(Generic[Context]): ) await event await event.event_result(raise_if_any=True, raise_if_none=False) - memory = f"Searched {params.search_engine.title()} for '{params.query}'" + memory = f"Searched {params.engine.title()} for '{params.query}'" msg = f'🔍 {memory}' logger.info(msg) return ActionResult(extracted_content=memory, long_term_memory=memory) except Exception as e: - logger.error(f'Failed to search {params.search_engine}: {e}') - return ActionResult(error=f'Failed to search {params.search_engine} for "{params.query}": {str(e)}') + logger.error(f'Failed to search {params.engine}: {e}') + return ActionResult(error=f'Failed to search {params.engine} for "{params.query}": {str(e)}') @self.registry.action( 'Navigate to URL.', @@ -313,7 +313,7 @@ class Tools(Generic[Context]): TypeTextEvent( node=node, text=params.text, - clear_existing=params.clear_existing, + clear_existing=params.clear, is_sensitive=has_sensitive_data, sensitive_key_name=sensitive_key_name, ) @@ -683,18 +683,18 @@ You will be given a query and the markdown of a webpage that has been filtered t # Look up the node from the selector map if index is provided # Special case: index 0 means scroll the whole page (root/body element) node = None - if params.frame_element_index is not None and params.frame_element_index != 0: - node = await browser_session.get_element_by_index(params.frame_element_index) + if params.frame_idx is not None and params.frame_idx != 0: + node = await 
browser_session.get_element_by_index(params.frame_idx) if node is None: # Element does not exist - msg = f'Element index {params.frame_element_index} not found in browser state' + msg = f'Element index {params.frame_idx} not found in browser state' return ActionResult(error=msg) direction = 'down' if params.down else 'up' target = ( 'the page' - if params.frame_element_index is None or params.frame_element_index == 0 - else f'element {params.frame_element_index}' + if params.frame_idx is None or params.frame_idx == 0 + else f'element {params.frame_idx}' ) # Get actual viewport height for more accurate scrolling @@ -715,11 +715,11 @@ You will be given a query and the markdown of a webpage that has been filtered t logger.debug(f'Failed to get viewport height, using fallback 1000px: {e}') # For multiple pages (>=1.0), scroll one page at a time to ensure each scroll completes - if params.num_pages >= 1.0: + if params.pages >= 1.0: import asyncio - num_full_pages = int(params.num_pages) - remaining_fraction = params.num_pages - num_full_pages + num_full_pages = int(params.pages) + remaining_fraction = params.pages - num_full_pages completed_scrolls = 0 @@ -761,19 +761,19 @@ You will be given a query and the markdown of a webpage that has been filtered t except Exception as e: logger.warning(f'Fractional scroll failed: {e}') - if params.num_pages == 1.0: + if params.pages == 1.0: long_term_memory = f'Scrolled {direction} {target} by one page ({viewport_height}px)' else: - long_term_memory = f'Scrolled {direction} {target} by {completed_scrolls:.1f} pages (requested: {params.num_pages}, {viewport_height}px per page)' + long_term_memory = f'Scrolled {direction} {target} by {completed_scrolls:.1f} pages (requested: {params.pages}, {viewport_height}px per page)' else: # For fractional pages <1.0, do single scroll - pixels = int(params.num_pages * viewport_height) + pixels = int(params.pages * viewport_height) event = browser_session.event_bus.dispatch( 
ScrollEvent(direction='down' if params.down else 'up', amount=pixels, node=node) ) await event await event.event_result(raise_if_any=True, raise_if_none=False) - long_term_memory = f'Scrolled {direction} {target} by {params.num_pages} pages ({viewport_height}px per page)' + long_term_memory = f'Scrolled {direction} {target} by {params.pages} pages ({viewport_height}px per page)' msg = f'🔍 {long_term_memory}' logger.info(msg) @@ -1170,10 +1170,10 @@ WRONG: document.querySelector('#id').value""", memory += f' - {len_text - len_max_memory} more characters' attachments = [] - if params.files_to_display: + if params.files: if self.display_files_in_done_text: file_msg = '' - for file_name in params.files_to_display: + for file_name in params.files: if file_name == 'todo.md': continue file_content = file_system.display_file(file_name) @@ -1186,7 +1186,7 @@ WRONG: document.querySelector('#id').value""", else: logger.warning('Agent wanted to display files but none were found') else: - for file_name in params.files_to_display: + for file_name in params.files: if file_name == 'todo.md': continue file_content = file_system.display_file(file_name) diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py index 44fab1347..692757c6f 100644 --- a/browser_use/tools/views.py +++ b/browser_use/tools/views.py @@ -31,14 +31,14 @@ class ClickElementAction(BaseModel): class InputTextAction(BaseModel): - index: int = Field(ge=1, description='from browser_state') + index: int = Field(ge=1, description='index') text: str - clear_existing: bool = Field(default=True, description='True to clear, False to append') + clear: bool = Field(default=True, description='1=clear, 0=append') class DoneAction(BaseModel): text: str = Field(description='summary for user') - success: bool = Field(description='True if completed') + success: bool = Field(description='True if user_request completed successfully') files_to_display: list[str] | None = Field(default=[], description='files to display') 
@@ -46,22 +46,22 @@ T = TypeVar('T', bound=BaseModel) class StructuredOutputAction(BaseModel, Generic[T]): - success: bool = Field(default=True, description='True if finished, False if not') + success: bool = Field(default=True, description='1=done') data: T class SwitchTabAction(BaseModel): - tab_id: str = Field(min_length=4, max_length=4, description="from browser_state ('Tab ')") + tab_id: str = Field(min_length=4, max_length=4, description='4-char id') class CloseTabAction(BaseModel): - tab_id: str = Field(min_length=4, max_length=4, description="from browser_state ('Tab ')") + tab_id: str = Field(min_length=4, max_length=4, description='4-char id') class ScrollAction(BaseModel): - down: bool = Field(description='True=down, False=up') - num_pages: float = Field(default=1.0, description='pages to scroll (0.5=half, 1=page, 10=bottom)') - frame_element_index: int | None = Field(default=None, description='index for specific container') + down: bool = Field(description='1=down, 0=up') + pages: float = Field(default=1.0, description='0.5=half, 1=pg, 10=bottom') + frame_index: int | None = Field(default=None, description='container index') class SendKeysAction(BaseModel): @@ -69,7 +69,7 @@ class SendKeysAction(BaseModel): class UploadFileAction(BaseModel): - index: int = Field(description='from browser_state') + index: int = Field(description='index') path: str @@ -78,15 +78,13 @@ class ExtractPageContentAction(BaseModel): class NoParamsAction(BaseModel): - """Accepts any input, discards it, returns empty model.""" - model_config = ConfigDict(extra='ignore') class GetDropdownOptionsAction(BaseModel): - index: int = Field(ge=1, description='dropdown from browser_state') + index: int = Field(ge=1, description='index') class SelectDropdownOptionAction(BaseModel): - index: int = Field(ge=1, description='dropdown from browser_state') - text: str = Field(description='exact text/value to select') + index: int = Field(ge=1, description='index') + text: str = 
Field(description='exact text/value') From 18b7dd95a4c24ce9c8d9d1f27b78c8bfffaffb03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 12:16:13 -0700 Subject: [PATCH 12/45] Fix param --- browser_use/agent/system_prompt.md | 4 ++-- browser_use/agent/system_prompt_no_thinking.md | 4 ++-- browser_use/tools/service.py | 12 ++++-------- browser_use/tools/views.py | 4 ++-- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md index b48a68cb6..fb58b6044 100644 --- a/browser_use/agent/system_prompt.md +++ b/browser_use/agent/system_prompt.md @@ -118,9 +118,9 @@ You must call the `done` action in one of two cases: The `done` action is your opportunity to terminate and share your findings with the user. - Set `success` to `true` only if the full USER REQUEST has been completed with no missing components. - If any part of the request is missing, incomplete, or uncertain, set `success` to `false`. -- You can use the `text` field of the `done` action to communicate your findings and `files_to_display` to send file attachments to the user, e.g. `["results.md"]`. +- You can use the `text` field of the `done` action to communicate your findings and `files` to send file attachments to the user, e.g. `["results.md"]`. - Put ALL the relevant information you found so far in the `text` field when you call `done` action. -- Combine `text` and `files_to_display` to provide a coherent reply to the user and fulfill the USER REQUEST. +- Combine `text` and `files` to provide a coherent reply to the user and fulfill the USER REQUEST. - You are ONLY ALLOWED to call `done` as a single action. Don't call it together with other actions. - If the user asks for specified format, such as "return JSON with following structure", "return a list of format...", MAKE sure to use the right format in your answer. 
- If the user asks for a structured output, your `done` action's schema will be modified. Take this schema into account when solving the task! diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md index c905e5bb6..6eddc625d 100644 --- a/browser_use/agent/system_prompt_no_thinking.md +++ b/browser_use/agent/system_prompt_no_thinking.md @@ -118,9 +118,9 @@ You must call the `done` action in one of two cases: The `done` action is your opportunity to terminate and share your findings with the user. - Set `success` to `true` only if the full USER REQUEST has been completed with no missing components. - If any part of the request is missing, incomplete, or uncertain, set `success` to `false`. -- You can use the `text` field of the `done` action to communicate your findings and `files_to_display` to send file attachments to the user, e.g. `["results.md"]`. +- You can use the `text` field of the `done` action to communicate your findings and `files` to send file attachments to the user, e.g. `["results.md"]`. - Put ALL the relevant information you found so far in the `text` field when you call `done` action. -- Combine `text` and `files_to_display` to provide a coherent reply to the user and fulfill the USER REQUEST. +- Combine `text` and `files` to provide a coherent reply to the user and fulfill the USER REQUEST. - You are ONLY ALLOWED to call `done` as a single action. Don't call it together with other actions. - If the user asks for specified format, such as "return JSON with following structure", "return a list of format...", MAKE sure to use the right format in your answer. - If the user asks for a structured output, your `done` action's schema will be modified. Take this schema into account when solving the task! 
diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 3e7b526a6..dbed5a1dc 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -683,19 +683,15 @@ You will be given a query and the markdown of a webpage that has been filtered t # Look up the node from the selector map if index is provided # Special case: index 0 means scroll the whole page (root/body element) node = None - if params.frame_idx is not None and params.frame_idx != 0: - node = await browser_session.get_element_by_index(params.frame_idx) + if params.frame_index is not None and params.frame_index != 0: + node = await browser_session.get_element_by_index(params.frame_index) if node is None: # Element does not exist - msg = f'Element index {params.frame_idx} not found in browser state' + msg = f'Element index {params.frame_index} not found in browser state' return ActionResult(error=msg) direction = 'down' if params.down else 'up' - target = ( - 'the page' - if params.frame_idx is None or params.frame_idx == 0 - else f'element {params.frame_idx}' - ) + target = 'the page' if params.frame_index is None or params.frame_index == 0 else f'element {params.frame_index}' # Get actual viewport height for more accurate scrolling try: diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py index 692757c6f..c73752ba0 100644 --- a/browser_use/tools/views.py +++ b/browser_use/tools/views.py @@ -6,7 +6,7 @@ from pydantic import BaseModel, ConfigDict, Field # Action Input Models class SearchAction(BaseModel): query: str - search_engine: str = Field( + engine: str = Field( default='duckduckgo', description='duckduckgo, google, bing (use duckduckgo by default because less captchas)' ) @@ -39,7 +39,7 @@ class InputTextAction(BaseModel): class DoneAction(BaseModel): text: str = Field(description='summary for user') success: bool = Field(description='True if user_request completed successfully') - files_to_display: list[str] | None = Field(default=[], 
description='files to display') + files: list[str] | None = Field(default=[], description='files to display') T = TypeVar('T', bound=BaseModel) From ad9b1747629c00df8ad1724c3de2ed064de86653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 12:18:21 -0700 Subject: [PATCH 13/45] Fix param --- browser_use/tools/service.py | 8 ++++---- browser_use/tools/views.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index dbed5a1dc..3d9598b31 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -683,15 +683,15 @@ You will be given a query and the markdown of a webpage that has been filtered t # Look up the node from the selector map if index is provided # Special case: index 0 means scroll the whole page (root/body element) node = None - if params.frame_index is not None and params.frame_index != 0: - node = await browser_session.get_element_by_index(params.frame_index) + if params.index is not None and params.index != 0: + node = await browser_session.get_element_by_index(params.index) if node is None: # Element does not exist - msg = f'Element index {params.frame_index} not found in browser state' + msg = f'Element index {params.index} not found in browser state' return ActionResult(error=msg) direction = 'down' if params.down else 'up' - target = 'the page' if params.frame_index is None or params.frame_index == 0 else f'element {params.frame_index}' + target = 'the page' if params.index is None or params.index == 0 else f'element {params.index}' # Get actual viewport height for more accurate scrolling try: diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py index c73752ba0..8b6ab5cc1 100644 --- a/browser_use/tools/views.py +++ b/browser_use/tools/views.py @@ -61,7 +61,7 @@ class CloseTabAction(BaseModel): class ScrollAction(BaseModel): down: bool = 
Field(description='1=down, 0=up') pages: float = Field(default=1.0, description='0.5=half, 1=pg, 10=bottom') - frame_index: int | None = Field(default=None, description='container index') + index: int | None = Field(default=None, description='Use to scroll in specific container with that element') class SendKeysAction(BaseModel): @@ -69,7 +69,7 @@ class SendKeysAction(BaseModel): class UploadFileAction(BaseModel): - index: int = Field(description='index') + index: int path: str @@ -82,7 +82,7 @@ class NoParamsAction(BaseModel): class GetDropdownOptionsAction(BaseModel): - index: int = Field(ge=1, description='index') + index: int class SelectDropdownOptionAction(BaseModel): From 968414b9826229dc49961fba6378b3fd1c61f162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 12:48:25 -0700 Subject: [PATCH 14/45] Compress gemini schema --- browser_use/agent/views.py | 3 --- browser_use/llm/google/chat.py | 2 +- browser_use/llm/schema.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py index c816c41f2..1234471a7 100644 --- a/browser_use/agent/views.py +++ b/browser_use/agent/views.py @@ -187,7 +187,6 @@ class AgentOutput(BaseModel): ), __module__=AgentOutput.__module__, ) - model_.__doc__ = 'AgentOutput model with custom actions' return model_ @staticmethod @@ -212,7 +211,6 @@ class AgentOutput(BaseModel): __module__=AgentOutputNoThinking.__module__, ) - model.__doc__ = 'AgentOutput model with custom actions' return model @staticmethod @@ -241,7 +239,6 @@ class AgentOutput(BaseModel): __module__=AgentOutputFlashMode.__module__, ) - model.__doc__ = 'AgentOutput model with custom actions' return model diff --git a/browser_use/llm/google/chat.py b/browser_use/llm/google/chat.py index ad511bb26..3c11186a0 100644 --- a/browser_use/llm/google/chat.py +++ b/browser_use/llm/google/chat.py @@ -255,7 
+255,7 @@ class ChatGoogle(BaseChatModel): self.logger.debug(f'🔧 Requesting structured output for {output_format.__name__}') config['response_mime_type'] = 'application/json' # Convert Pydantic model to Gemini-compatible schema - optimized_schema = SchemaOptimizer.create_optimized_json_schema(output_format) + optimized_schema = SchemaOptimizer.create_gemini_optimized_schema(output_format) gemini_schema = self._fix_gemini_schema(optimized_schema) config['response_schema'] = gemini_schema diff --git a/browser_use/llm/schema.py b/browser_use/llm/schema.py index 22e54b2b9..9b76ce4f1 100644 --- a/browser_use/llm/schema.py +++ b/browser_use/llm/schema.py @@ -159,3 +159,31 @@ class SchemaOptimizer: elif isinstance(schema, list): for item in schema: SchemaOptimizer._make_strict_compatible(item) + + @staticmethod + def create_gemini_optimized_schema(model: type[BaseModel]) -> dict[str, Any]: + """ + Create Gemini-optimized schema that removes 'required' arrays to save tokens. + Gemini can infer required fields from context since all fields are required. 
+ + Args: + model: The Pydantic model to optimize + + Returns: + Optimized schema without required arrays + """ + # Start with standard optimized schema + schema = SchemaOptimizer.create_optimized_json_schema(model) + + def remove_required_arrays(obj: Any) -> Any: + """Recursively remove 'required' arrays""" + if isinstance(obj, dict): + # Remove 'required' key + result = {k: v for k, v in obj.items() if k != 'required'} + # Recursively process nested structures + return {k: remove_required_arrays(v) for k, v in result.items()} + elif isinstance(obj, list): + return [remove_required_arrays(item) for item in obj] + return obj + + return remove_required_arrays(schema) From 1248c07e83ab9c52885b90348d8894bc20f421f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 13:02:43 -0700 Subject: [PATCH 15/45] Remove descripitons --- browser_use/llm/schema.py | 5 +++-- browser_use/tools/registry/service.py | 2 -- browser_use/tools/service.py | 22 ++++++++-------------- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/browser_use/llm/schema.py b/browser_use/llm/schema.py index 9b76ce4f1..075564462 100644 --- a/browser_use/llm/schema.py +++ b/browser_use/llm/schema.py @@ -48,9 +48,10 @@ class SchemaOptimizer: if key == 'title' and not in_properties: continue - # Preserve FULL descriptions without truncation + # Preserve FULL descriptions without truncation, skip empty ones elif key == 'description': - optimized[key] = value + if value: # Only include non-empty descriptions + optimized[key] = value # Handle type field elif key == 'type': diff --git a/browser_use/tools/registry/service.py b/browser_use/tools/registry/service.py index 7e592b32b..33bbd0342 100644 --- a/browser_use/tools/registry/service.py +++ b/browser_use/tools/registry/service.py @@ -538,8 +538,6 @@ class Registry(Generic[Context]): union_type = Union[tuple(individual_action_models)] # type: ignore : Typing doesn't 
understand that the length is >= 2 (by design) class ActionModelUnion(RootModel[union_type]): # type: ignore - """Union of all available action models that maintains ActionModel interface""" - def get_index(self) -> int | None: """Delegate get_index to the underlying action model""" if hasattr(self.root, 'get_index'): diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 3d9598b31..abfab242d 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -204,7 +204,7 @@ class Tools(Generic[Context]): # Return error in ActionResult instead of re-raising return ActionResult(error=f'Navigation failed: {str(e)}') - @self.registry.action('Go back', param_model=NoParamsAction) + @self.registry.action('', param_model=NoParamsAction) async def go_back(_: NoParamsAction, browser_session: BrowserSession): try: event = browser_session.event_bus.dispatch(GoBackEvent()) @@ -218,7 +218,7 @@ class Tools(Generic[Context]): error_msg = f'Failed to go back: {str(e)}' return ActionResult(error=error_msg) - @self.registry.action('Wait for page load.') + @self.registry.action('Wait for page.') async def wait(seconds: int = 3): # Cap wait time at maximum 30 seconds # Reduce the wait time by 3 seconds to account for the llm call which takes at least 3 seconds @@ -501,7 +501,7 @@ class Tools(Generic[Context]): # Tab Management Actions - @self.registry.action('Switch tab.', param_model=SwitchTabAction) + @self.registry.action('', param_model=SwitchTabAction) async def switch_tab(params: SwitchTabAction, browser_session: BrowserSession): # Simple switch tab logic try: @@ -523,7 +523,7 @@ class Tools(Generic[Context]): memory = f'Attempted to switch to tab #{params.tab_id}' return ActionResult(extracted_content=memory, long_term_memory=memory) - @self.registry.action('Close tab.', param_model=CloseTabAction) + @self.registry.action('', param_model=CloseTabAction) async def close_tab(params: CloseTabAction, browser_session: BrowserSession): # Simple 
close tab logic try: @@ -798,9 +798,7 @@ You will be given a query and the markdown of a webpage that has been filtered t error_msg = f'Failed to send keys: {str(e)}' return ActionResult(error=error_msg) - @self.registry.action( - description='Scroll to text.', - ) + @self.registry.action('') async def scroll_to_text(text: str, browser_session: BrowserSession): # type: ignore # Dispatch scroll to text event event = browser_session.event_bus.dispatch(ScrollToTextEvent(text=text)) @@ -821,7 +819,7 @@ You will be given a query and the markdown of a webpage that has been filtered t long_term_memory=f"Tried scrolling to text '{text}' but it was not found", ) - @self.registry.action('Request screenshot.') + @self.registry.action('') async def take_screenshot(): """Request that a screenshot be included in the next observation""" memory = 'Requested screenshot for next observation' @@ -932,7 +930,7 @@ You will be given a query and the markdown of a webpage that has been filtered t logger.info(f'💾 {result}') return ActionResult(extracted_content=result, long_term_memory=result) - @self.registry.action('Read file.') + @self.registry.action('') async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem): if available_file_paths and file_name in available_file_paths: result = await file_system.read_file(file_name, external_file=True) @@ -962,11 +960,7 @@ You will be given a query and the markdown of a webpage that has been filtered t ) @self.registry.action( - """Execute JS. MUST wrap in IIFE: (function(){...})() or async: (async function(){...})() -Use when other tools fail. Limit output. For complex objects use JSON.stringify(). -Don't use comments. -CORRECT: (function(){ try { const el = document.querySelector('#id'); return el ? el.value : 'not found'; } catch(e) { return 'Error: ' + e.message; } })() -WRONG: document.querySelector('#id').value""", + 'JS eval. Wrap in IIFE: (function(){...})(). Use try/catch. 
JSON.stringify() for objects.', ) async def execute_js(code: str, browser_session: BrowserSession): # Execute JavaScript with proper error handling and promise support From ae3b2a5926f99c984ec6b52aeb5e678ad54ef5d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 13:06:19 -0700 Subject: [PATCH 16/45] Remove descripitons --- browser_use/tools/service.py | 22 +++++++++++----------- browser_use/tools/views.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index abfab242d..829a4533a 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -115,7 +115,7 @@ class Tools(Generic[Context]): # Basic Navigation Actions @self.registry.action( - 'Search query.', + '', param_model=SearchAction, ) async def search(params: SearchAction, browser_session: BrowserSession): @@ -158,7 +158,7 @@ class Tools(Generic[Context]): return ActionResult(error=f'Failed to search {params.engine} for "{params.query}": {str(e)}') @self.registry.action( - 'Navigate to URL.', + '', param_model=GoToUrlAction, ) async def go_to_url(params: GoToUrlAction, browser_session: BrowserSession): @@ -218,7 +218,7 @@ class Tools(Generic[Context]): error_msg = f'Failed to go back: {str(e)}' return ActionResult(error=error_msg) - @self.registry.action('Wait for page.') + @self.registry.action('') async def wait(seconds: int = 3): # Cap wait time at maximum 30 seconds # Reduce the wait time by 3 seconds to account for the llm call which takes at least 3 seconds @@ -234,7 +234,7 @@ class Tools(Generic[Context]): # Element Interaction Actions @self.registry.action( - 'Click element.', + '', param_model=ClickElementAction, ) async def click(params: ClickElementAction, browser_session: BrowserSession): @@ -288,7 +288,7 @@ class Tools(Generic[Context]): return ActionResult(error=error_msg) @self.registry.action( - 'Input text.', + 
'', param_model=InputTextAction, ) async def input_text( @@ -350,7 +350,7 @@ class Tools(Generic[Context]): return ActionResult(error=error_msg) @self.registry.action( - 'Upload file.', + '', param_model=UploadFileAction, ) async def upload_file( @@ -780,7 +780,7 @@ You will be given a query and the markdown of a webpage that has been filtered t return ActionResult(error=error_msg) @self.registry.action( - 'Send keys.', + '', param_model=SendKeysAction, ) async def send_keys(params: SendKeysAction, browser_session: BrowserSession): @@ -835,7 +835,7 @@ You will be given a query and the markdown of a webpage that has been filtered t # Dropdown Actions @self.registry.action( - 'Get dropdown options.', + '', param_model=GetDropdownOptionsAction, ) async def get_dropdown_options(params: GetDropdownOptionsAction, browser_session: BrowserSession): @@ -861,7 +861,7 @@ You will be given a query and the markdown of a webpage that has been filtered t ) @self.registry.action( - 'Select dropdown option.', + '', param_model=SelectDropdownOptionAction, ) async def select_dropdown_option(params: SelectDropdownOptionAction, browser_session: BrowserSession): @@ -904,7 +904,7 @@ You will be given a query and the markdown of a webpage that has been filtered t return ActionResult(error=error_msg) # File System Actions - @self.registry.action('Write/append file.') + @self.registry.action('') async def write_file( file_name: str, content: str, @@ -924,7 +924,7 @@ You will be given a query and the markdown of a webpage that has been filtered t logger.info(f'💾 {result}') return ActionResult(extracted_content=result, long_term_memory=result) - @self.registry.action('Replace in file.') + @self.registry.action('') async def replace_file_str(file_name: str, old_str: str, new_str: str, file_system: FileSystem): result = await file_system.replace_file_str(file_name, old_str, new_str) logger.info(f'💾 {result}') diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py index 
8b6ab5cc1..222679f23 100644 --- a/browser_use/tools/views.py +++ b/browser_use/tools/views.py @@ -31,7 +31,7 @@ class ClickElementAction(BaseModel): class InputTextAction(BaseModel): - index: int = Field(ge=1, description='index') + index: int = Field(ge=1, description='from browser_state') text: str clear: bool = Field(default=True, description='1=clear, 0=append') From 50f97000e4a0c33cea3bc820c2ed9b5ef1d4f7b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 13:21:14 -0700 Subject: [PATCH 17/45] file parameter naming --- browser_use/tools/service.py | 6 +++--- browser_use/tools/views.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 829a4533a..6e4502cc6 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -1160,10 +1160,10 @@ You will be given a query and the markdown of a webpage that has been filtered t memory += f' - {len_text - len_max_memory} more characters' attachments = [] - if params.files: + if params.files_to_display: if self.display_files_in_done_text: file_msg = '' - for file_name in params.files: + for file_name in params.files_to_display: if file_name == 'todo.md': continue file_content = file_system.display_file(file_name) @@ -1176,7 +1176,7 @@ You will be given a query and the markdown of a webpage that has been filtered t else: logger.warning('Agent wanted to display files but none were found') else: - for file_name in params.files: + for file_name in params.files_to_display: if file_name == 'todo.md': continue file_content = file_system.display_file(file_name) diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py index 222679f23..105530a8c 100644 --- a/browser_use/tools/views.py +++ b/browser_use/tools/views.py @@ -39,7 +39,7 @@ class InputTextAction(BaseModel): class DoneAction(BaseModel): text: str = Field(description='summary for 
user') success: bool = Field(description='True if user_request completed successfully') - files: list[str] | None = Field(default=[], description='files to display') + files_to_display: list[str] | None = Field(default=[]) T = TypeVar('T', bound=BaseModel) From f0acb6b3386698d0d0ebfddd469fb716ecda5cc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 13:31:38 -0700 Subject: [PATCH 18/45] Replace tool names like go_to_url with navigate --- browser_use/agent/service.py | 2 +- browser_use/agent/system_prompt.md | 4 +- .../agent/system_prompt_no_thinking.md | 4 +- browser_use/tools/service.py | 22 +++---- docs/customize/agent/supported-models.mdx | 2 +- docs/customize/tools/available.mdx | 2 +- examples/features/initial_actions.py | 4 +- examples/models/qwen.py | 2 +- .../test_browser_event_ClickElementEvent.py | 40 ++++++------- ...t_browser_event_GetDropdownOptionsEvent.py | 32 +++++----- ...vent_GetDropdownOptionsEvent_aria_menus.py | 12 ++-- .../test_browser_event_NavigateToUrlEvent.py | 58 +++++++++---------- tests/ci/test_browser_event_ScrollEvent.py | 4 +- tests/ci/test_browser_session_output_paths.py | 2 +- ...r_watchdog_downloads_upload_full_circle.py | 6 +- tests/ci/test_tools.py | 30 +++++----- tests/ci/test_url_shortening.py | 10 ++-- tests/scripts/debug_iframe_scrolling.py | 2 +- 18 files changed, 119 insertions(+), 119 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 6b0a2fa0d..4868daaa8 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -323,7 +323,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): initial_url = self._extract_url_from_task(self.task) if initial_url: self.logger.info(f'🔗 Found URL in task: {initial_url}, adding as initial action...') - initial_actions = [{'go_to_url': {'url': initial_url, 'new_tab': False}}] + initial_actions = [{'navigate': {'url': initial_url, 'new_tab': 
False}}] self.initial_url = initial_url diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md index fb58b6044..b5fb4616b 100644 --- a/browser_use/agent/system_prompt.md +++ b/browser_use/agent/system_prompt.md @@ -146,7 +146,7 @@ You can output multiple actions in one step. Try to be efficient where it makes Do not try multiple different paths in one step. Always have one clear goal per step. Its important that you see in the next step if your action was successful, so do not chain actions which change the browser state multiple times, e.g. -- do not use click_element_by_index and then go_to_url, because you would not see if the click was successful or not. +- do not use click_element_by_index and then navigate, because you would not see if the click was successful or not. - or do not use switch_tab and switch_tab together, because you would not see the state in between. - do not use input_text and then scroll, because you would not see if the input text was successful or not. @@ -210,7 +210,7 @@ You must ALWAYS respond with a valid JSON in this exact format: "evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.", "memory": "1-3 sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.", "next_goal": "State the next immediate goal and action to achieve it, in one clear sentence." - "action":[{{"go_to_url": {{ "url": "url_value"}}}}, // ... more actions in sequence] + "action":[{{"navigate": {{ "url": "url_value"}}}}, // ... more actions in sequence] }} Action list should NEVER be empty. 
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md index 6eddc625d..efdf8d2fe 100644 --- a/browser_use/agent/system_prompt_no_thinking.md +++ b/browser_use/agent/system_prompt_no_thinking.md @@ -145,7 +145,7 @@ You can output multiple actions in one step. Try to be efficient where it makes Do not try multiple different paths in one step. Always have one clear goal per step. Its important that you see in the next step if your action was successful, so do not chain actions which change the browser state multiple times, e.g. -- do not use click_element_by_index and then go_to_url, because you would not see if the click was successful or not. +- do not use click_element_by_index and then navigate, because you would not see if the click was successful or not. - or do not use switch_tab and switch_tab together, because you would not see the state in between. - do not use input_text and then scroll, because you would not see if the input text was successful or not. @@ -206,7 +206,7 @@ You must ALWAYS respond with a valid JSON in this exact format: "evaluation_previous_goal": "One-sentence analysis of your last action. Clearly state success, failure, or uncertain.", "memory": "1-3 sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.", "next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.", - "action":[{{"go_to_url": {{ "url": "url_value"}}}}, // ... more actions in sequence] + "action":[{{"navigate": {{ "url": "url_value"}}}}, // ... more actions in sequence] }} Action list should NEVER be empty. 
diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 6e4502cc6..60b85e082 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -161,7 +161,7 @@ class Tools(Generic[Context]): '', param_model=GoToUrlAction, ) - async def go_to_url(params: GoToUrlAction, browser_session: BrowserSession): + async def navigate(params: GoToUrlAction, browser_session: BrowserSession): try: # Dispatch navigation event event = browser_session.event_bus.dispatch(NavigateToUrlEvent(url=params.url, new_tab=params.new_tab)) @@ -291,7 +291,7 @@ class Tools(Generic[Context]): '', param_model=InputTextAction, ) - async def input_text( + async def input( params: InputTextAction, browser_session: BrowserSession, has_sensitive_data: bool = False, @@ -502,7 +502,7 @@ class Tools(Generic[Context]): # Tab Management Actions @self.registry.action('', param_model=SwitchTabAction) - async def switch_tab(params: SwitchTabAction, browser_session: BrowserSession): + async def switch(params: SwitchTabAction, browser_session: BrowserSession): # Simple switch tab logic try: target_id = await browser_session.get_target_id_from_tab_id(params.tab_id) @@ -524,7 +524,7 @@ class Tools(Generic[Context]): return ActionResult(extracted_content=memory, long_term_memory=memory) @self.registry.action('', param_model=CloseTabAction) - async def close_tab(params: CloseTabAction, browser_session: BrowserSession): + async def close(params: CloseTabAction, browser_session: BrowserSession): # Simple close tab logic try: target_id = await browser_session.get_target_id_from_tab_id(params.tab_id) @@ -557,7 +557,7 @@ class Tools(Generic[Context]): @self.registry.action( """Extract page data via LLM. Use when on right page, know what to extract. Can't get interactive elements. 
Don't call again on same page with same query.""", ) - async def extract_structured_data( + async def extract( query: str, extract_links: bool, browser_session: BrowserSession, @@ -799,7 +799,7 @@ You will be given a query and the markdown of a webpage that has been filtered t return ActionResult(error=error_msg) @self.registry.action('') - async def scroll_to_text(text: str, browser_session: BrowserSession): # type: ignore + async def find_text(text: str, browser_session: BrowserSession): # type: ignore # Dispatch scroll to text event event = browser_session.event_bus.dispatch(ScrollToTextEvent(text=text)) @@ -820,7 +820,7 @@ You will be given a query and the markdown of a webpage that has been filtered t ) @self.registry.action('') - async def take_screenshot(): + async def screenshot(): """Request that a screenshot be included in the next observation""" memory = 'Requested screenshot for next observation' msg = f'📸 {memory}' @@ -838,7 +838,7 @@ You will be given a query and the markdown of a webpage that has been filtered t '', param_model=GetDropdownOptionsAction, ) - async def get_dropdown_options(params: GetDropdownOptionsAction, browser_session: BrowserSession): + async def dropdown_options(params: GetDropdownOptionsAction, browser_session: BrowserSession): """Get all options from a native dropdown or ARIA menu""" # Look up the node from the selector map node = await browser_session.get_element_by_index(params.index) @@ -864,7 +864,7 @@ You will be given a query and the markdown of a webpage that has been filtered t '', param_model=SelectDropdownOptionAction, ) - async def select_dropdown_option(params: SelectDropdownOptionAction, browser_session: BrowserSession): + async def select_dropdown(params: SelectDropdownOptionAction, browser_session: BrowserSession): """Select dropdown option by the text of the option you want to select""" # Look up the node from the selector map node = await browser_session.get_element_by_index(params.index) @@ -925,7 +925,7 @@ 
You will be given a query and the markdown of a webpage that has been filtered t return ActionResult(extracted_content=result, long_term_memory=result) @self.registry.action('') - async def replace_file_str(file_name: str, old_str: str, new_str: str, file_system: FileSystem): + async def replace_file(file_name: str, old_str: str, new_str: str, file_system: FileSystem): result = await file_system.replace_file_str(file_name, old_str, new_str) logger.info(f'💾 {result}') return ActionResult(extracted_content=result, long_term_memory=result) @@ -962,7 +962,7 @@ You will be given a query and the markdown of a webpage that has been filtered t @self.registry.action( 'JS eval. Wrap in IIFE: (function(){...})(). Use try/catch. JSON.stringify() for objects.', ) - async def execute_js(code: str, browser_session: BrowserSession): + async def evaluate(code: str, browser_session: BrowserSession): # Execute JavaScript with proper error handling and promise support cdp_session = await browser_session.get_or_create_cdp_session() diff --git a/docs/customize/agent/supported-models.mdx b/docs/customize/agent/supported-models.mdx index 35b67a7fa..740ed036b 100644 --- a/docs/customize/agent/supported-models.mdx +++ b/docs/customize/agent/supported-models.mdx @@ -215,7 +215,7 @@ llm = ChatOllama(model="llama3.1:8b") ## Qwen [example](https://github.com/browser-use/browser-use/blob/main/examples/models/qwen.py) Currently, only `qwen-vl-max` is recommended for Browser Use. Other Qwen models, including `qwen-max`, have issues with the action schema format. -Smaller Qwen models may return incorrect action schema formats (e.g., `actions: [{"go_to_url": "google.com"}]` instead of `[{"go_to_url": {"url": "google.com"}}]`). If you want to use other models, add concrete examples of the correct action format to your prompt. +Smaller Qwen models may return incorrect action schema formats (e.g., `actions: [{"navigate": "google.com"}]` instead of `[{"navigate": {"url": "google.com"}}]`). 
If you want to use other models, add concrete examples of the correct action format to your prompt. ```python from browser_use import Agent, ChatOpenAI diff --git a/docs/customize/tools/available.mdx b/docs/customize/tools/available.mdx index c045f3dca..488c79176 100644 --- a/docs/customize/tools/available.mdx +++ b/docs/customize/tools/available.mdx @@ -10,7 +10,7 @@ mode: "wide" ### Navigation & Browser Control - **`search`** - Search queries in Google -- **`go_to_url`** - Navigate to URLs +- **`navigate`** - Navigate to URLs - **`go_back`** - Go back in browser history - **`wait`** - Wait for specified seconds diff --git a/examples/features/initial_actions.py b/examples/features/initial_actions.py index 7e8d585c8..4a80f6f93 100644 --- a/examples/features/initial_actions.py +++ b/examples/features/initial_actions.py @@ -13,8 +13,8 @@ from browser_use import Agent, ChatOpenAI llm = ChatOpenAI(model='gpt-4.1-mini') initial_actions = [ - {'go_to_url': {'url': 'https://www.google.com', 'new_tab': True}}, - {'go_to_url': {'url': 'https://en.wikipedia.org/wiki/Randomness', 'new_tab': True}}, + {'navigate': {'url': 'https://www.google.com', 'new_tab': True}}, + {'navigate': {'url': 'https://en.wikipedia.org/wiki/Randomness', 'new_tab': True}}, ] agent = Agent( task='What theories are displayed on the page?', diff --git a/examples/models/qwen.py b/examples/models/qwen.py index 93e19aff2..fcb08e68a 100644 --- a/examples/models/qwen.py +++ b/examples/models/qwen.py @@ -13,7 +13,7 @@ base_url = 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1' # so far we only had success with qwen-vl-max # other models, even qwen-max, do not return the right output format. They confuse the action schema. -# E.g. they return actions: [{"go_to_url": "google.com"}] instead of [{"go_to_url": {"url": "google.com"}}] +# E.g. 
they return actions: [{"navigate": "google.com"}] instead of [{"navigate": {"url": "google.com"}}] # If you want to use smaller models and you see they mix up the action schema, add concrete examples to your prompt of the right format. llm = ChatOpenAI(model='qwen-vl-max', api_key=api_key, base_url=base_url) diff --git a/tests/ci/test_browser_event_ClickElementEvent.py b/tests/ci/test_browser_event_ClickElementEvent.py index 2399594d3..976c476ec 100644 --- a/tests/ci/test_browser_event_ClickElementEvent.py +++ b/tests/ci/test_browser_event_ClickElementEvent.py @@ -130,12 +130,12 @@ class TestClickElementEvent: ) # Navigate to the clickable elements test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/clickable', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/clickable', new_tab=False)} from browser_use.agent.views import ActionModel class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) @@ -218,12 +218,12 @@ class TestClickElementEvent: ) # Navigate to the new tab test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/newTab', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/newTab', new_tab=False)} from browser_use.agent.views import ActionModel class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await asyncio.sleep(1) # Wait for page to load @@ -306,12 +306,12 @@ class TestClickElementEvent: ) # Navigate to the comparison test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/comparison', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/comparison', new_tab=False)} from browser_use.agent.views import ActionModel class GoToUrlActionModel(ActionModel): - go_to_url: 
GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await asyncio.sleep(1) @@ -393,12 +393,12 @@ class TestClickElementEvent: ) # Navigate to the page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/inline_offscreen', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/inline_offscreen', new_tab=False)} from browser_use.agent.views import ActionModel class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await asyncio.sleep(0.5) @@ -475,12 +475,12 @@ class TestClickElementEvent: ) # Navigate to the page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/block_in_inline', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/block_in_inline', new_tab=False)} from browser_use.agent.views import ActionModel class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await asyncio.sleep(0.5) @@ -563,12 +563,12 @@ class TestClickElementEvent: ) # Navigate to the page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/covered_element', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/covered_element', new_tab=False)} from browser_use.agent.views import ActionModel class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await asyncio.sleep(0.5) @@ -623,12 +623,12 @@ class TestClickElementEvent: ) # Navigate to the page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/file_input', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/file_input', new_tab=False)} from 
browser_use.agent.views import ActionModel class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await asyncio.sleep(0.5) @@ -684,12 +684,12 @@ class TestClickElementEvent: ) # Navigate to the page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/select_dropdown', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/select_dropdown', new_tab=False)} from browser_use.agent.views import ActionModel class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await asyncio.sleep(0.5) @@ -1081,12 +1081,12 @@ class TestClickElementEvent: ) # Navigate to the file upload test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/fileupload', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/fileupload', new_tab=False)} from browser_use.agent.views import ActionModel class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) @@ -1227,11 +1227,11 @@ class TestClickElementEvent: ) # Navigate to the test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/upload-test', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/upload-test', new_tab=False)} from browser_use.agent.views import ActionModel class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await asyncio.sleep(0.5) diff --git a/tests/ci/test_browser_event_GetDropdownOptionsEvent.py b/tests/ci/test_browser_event_GetDropdownOptionsEvent.py index 3d3193e25..078410350 100644 --- 
a/tests/ci/test_browser_event_GetDropdownOptionsEvent.py +++ b/tests/ci/test_browser_event_GetDropdownOptionsEvent.py @@ -278,10 +278,10 @@ class TestGetDropdownOptionsEvent: async def test_native_select_dropdown(self, tools, browser_session: BrowserSession, base_url): """Test get_dropdown_options with native HTML select element.""" # Navigate to the native dropdown test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) @@ -336,10 +336,10 @@ class TestGetDropdownOptionsEvent: async def test_aria_menu_dropdown(self, tools, browser_session: BrowserSession, base_url): """Test get_dropdown_options with ARIA role='menu' element.""" # Navigate to the ARIA menu test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) @@ -398,10 +398,10 @@ class TestGetDropdownOptionsEvent: async def test_custom_dropdown(self, tools, browser_session: BrowserSession, base_url): """Test get_dropdown_options with custom dropdown implementation.""" # Navigate to the custom dropdown test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/custom-dropdown', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/custom-dropdown', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), 
browser_session) @@ -456,10 +456,10 @@ class TestGetDropdownOptionsEvent: async def test_element_not_found_error(self, tools, browser_session: BrowserSession, base_url): """Test get_dropdown_options with invalid element index.""" # Navigate to any test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) @@ -486,10 +486,10 @@ class TestSelectDropdownOptionEvent: async def test_select_native_dropdown_option(self, tools, browser_session: BrowserSession, base_url): """Test select_dropdown_option with native HTML select element.""" # Navigate to the native dropdown test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) @@ -534,10 +534,10 @@ class TestSelectDropdownOptionEvent: async def test_select_aria_menu_option(self, tools, browser_session: BrowserSession, base_url): """Test select_dropdown_option with ARIA menu.""" # Navigate to the ARIA menu test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await 
tools.act(GoToUrlActionModel(**goto_action), browser_session) await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) @@ -586,10 +586,10 @@ class TestSelectDropdownOptionEvent: async def test_select_custom_dropdown_option(self, tools, browser_session: BrowserSession, base_url): """Test select_dropdown_option with custom dropdown.""" # Navigate to the custom dropdown test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/custom-dropdown', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/custom-dropdown', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) @@ -634,10 +634,10 @@ class TestSelectDropdownOptionEvent: async def test_select_invalid_option_error(self, tools, browser_session: BrowserSession, base_url): """Test select_dropdown_option with non-existent option text.""" # Navigate to the native dropdown test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) diff --git a/tests/ci/test_browser_event_GetDropdownOptionsEvent_aria_menus.py b/tests/ci/test_browser_event_GetDropdownOptionsEvent_aria_menus.py index 54a1a07a1..f74e656ca 100644 --- a/tests/ci/test_browser_event_GetDropdownOptionsEvent_aria_menus.py +++ b/tests/ci/test_browser_event_GetDropdownOptionsEvent_aria_menus.py @@ -152,10 +152,10 @@ class TestARIAMenuDropdown: async def 
test_get_dropdown_options_with_aria_menu(self, tools, browser_session: BrowserSession, base_url): """Test that get_dropdown_options can retrieve options from ARIA menus.""" # Navigate to the ARIA menu test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) @@ -219,10 +219,10 @@ class TestARIAMenuDropdown: async def test_select_dropdown_option_with_aria_menu(self, tools, browser_session: BrowserSession, base_url): """Test that select_dropdown_option can select an option from ARIA menus.""" # Navigate to the ARIA menu test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) @@ -289,10 +289,10 @@ class TestARIAMenuDropdown: async def test_get_dropdown_options_with_nested_aria_menu(self, tools, browser_session: BrowserSession, base_url): """Test that get_dropdown_options can handle nested ARIA menus (like Sort submenu).""" # Navigate to the ARIA menu test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) diff --git a/tests/ci/test_browser_event_NavigateToUrlEvent.py b/tests/ci/test_browser_event_NavigateToUrlEvent.py index 
a6a008d4c..088f6776f 100644 --- a/tests/ci/test_browser_event_NavigateToUrlEvent.py +++ b/tests/ci/test_browser_event_NavigateToUrlEvent.py @@ -59,15 +59,15 @@ def tools(): class TestNavigateToUrlEvent: - """Test NavigateToUrlEvent and go_to_url action functionality.""" + """Test NavigateToUrlEvent and navigate action functionality.""" - async def test_go_to_url_action(self, tools, browser_session: BrowserSession, base_url): + async def test_navigate_action(self, tools, browser_session: BrowserSession, base_url): """Test that GoToUrlAction navigates to the specified URL and test both state summary methods.""" # Test successful navigation to a valid page - action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} + action_data = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None action_model = GoToUrlActionModel(**action_data) result = await tools.act(action_model, browser_session) @@ -77,14 +77,14 @@ class TestNavigateToUrlEvent: assert result.extracted_content is not None assert f'Navigated to {base_url}' in result.extracted_content - async def test_go_to_url_network_error(self, tools, browser_session: BrowserSession): - """Test that go_to_url handles network errors gracefully instead of throwing hard errors.""" - # Create action model for go_to_url with an invalid domain - action_data = {'go_to_url': GoToUrlAction(url='https://www.nonexistentdndbeyond.com/', new_tab=False)} + async def test_navigate_network_error(self, tools, browser_session: BrowserSession): + """Test that navigate handles network errors gracefully instead of throwing hard errors.""" + # Create action model for navigate with an invalid domain + action_data = {'navigate': GoToUrlAction(url='https://www.nonexistentdndbeyond.com/', new_tab=False)} # Create the ActionModel instance class GoToUrlActionModel(ActionModel): - go_to_url: 
GoToUrlAction | None = None + navigate: GoToUrlAction | None = None action_model = GoToUrlActionModel(**action_data) @@ -123,17 +123,17 @@ class TestNavigateToUrlEvent: current_url = await browser_session.get_current_page_url() assert f'{base_url}/page1' in current_url - async def test_go_to_url_new_tab(self, tools, browser_session, base_url): + async def test_navigate_new_tab(self, tools, browser_session, base_url): """Test that GoToUrlAction with new_tab=True opens URL in a new tab.""" # Get initial tab count initial_tabs = await browser_session.get_tabs() initial_tab_count = len(initial_tabs) # Navigate to URL in new tab - action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/page2', new_tab=True)} + action_data = {'navigate': GoToUrlAction(url=f'{base_url}/page2', new_tab=True)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None result = await tools.act(GoToUrlActionModel(**action_data), browser_session) await asyncio.sleep(0.5) @@ -155,15 +155,15 @@ class TestNavigateToUrlEvent: async def test_navigate_javascript_url(self, tools, browser_session, base_url): """Test that javascript: URLs are handled appropriately.""" # Navigate to a normal page first - action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} + action_data = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**action_data), browser_session) # Try to navigate to javascript: URL (should be handled gracefully) - js_action = {'go_to_url': GoToUrlAction(url='javascript:alert("test")', new_tab=False)} + js_action = {'navigate': GoToUrlAction(url='javascript:alert("test")', new_tab=False)} result = await tools.act(GoToUrlActionModel(**js_action), browser_session) # Should either succeed or fail gracefully @@ -174,10 +174,10 @@ class 
TestNavigateToUrlEvent: # Create a simple data URL data_url = 'data:text/html,Data URL Test

Data URL Content

' - action_data = {'go_to_url': GoToUrlAction(url=data_url, new_tab=False)} + action_data = {'navigate': GoToUrlAction(url=data_url, new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None result = await tools.act(GoToUrlActionModel(**action_data), browser_session) @@ -210,10 +210,10 @@ class TestNavigateToUrlEvent: ) # Navigate to page with hash - action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/page-with-anchors#section1', new_tab=False)} + action_data = {'navigate': GoToUrlAction(url=f'{base_url}/page-with-anchors#section1', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None result = await tools.act(GoToUrlActionModel(**action_data), browser_session) @@ -247,10 +247,10 @@ class TestNavigateToUrlEvent: ) # Navigate with query parameters - action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/search?q=test+query&page=1', new_tab=False)} + action_data = {'navigate': GoToUrlAction(url=f'{base_url}/search?q=test+query&page=1', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None result = await tools.act(GoToUrlActionModel(**action_data), browser_session) @@ -267,19 +267,19 @@ class TestNavigateToUrlEvent: async def test_navigate_multiple_tabs(self, tools, browser_session, base_url): """Test navigating in multiple tabs sequentially.""" # Navigate to first page in current tab - action1 = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} + action1 = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**action1), browser_session) # Open second page in new tab - action2 = {'go_to_url': GoToUrlAction(url=f'{base_url}/page2', 
new_tab=True)} + action2 = {'navigate': GoToUrlAction(url=f'{base_url}/page2', new_tab=True)} await tools.act(GoToUrlActionModel(**action2), browser_session) # Open home page in yet another new tab - action3 = {'go_to_url': GoToUrlAction(url=base_url, new_tab=True)} + action3 = {'navigate': GoToUrlAction(url=base_url, new_tab=True)} await tools.act(GoToUrlActionModel(**action3), browser_session) # Should have 3 tabs now @@ -296,10 +296,10 @@ class TestNavigateToUrlEvent: # Using a private IP that's unlikely to respond timeout_url = 'http://192.0.2.1:8080/timeout' - action_data = {'go_to_url': GoToUrlAction(url=timeout_url, new_tab=False)} + action_data = {'navigate': GoToUrlAction(url=timeout_url, new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None # This should complete without hanging indefinitely result = await tools.act(GoToUrlActionModel(**action_data), browser_session) @@ -317,10 +317,10 @@ class TestNavigateToUrlEvent: ) # Navigate to redirect URL - action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/redirect', new_tab=False)} + action_data = {'navigate': GoToUrlAction(url=f'{base_url}/redirect', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None result = await tools.act(GoToUrlActionModel(**action_data), browser_session) diff --git a/tests/ci/test_browser_event_ScrollEvent.py b/tests/ci/test_browser_event_ScrollEvent.py index e8767384f..dd2c568f8 100644 --- a/tests/ci/test_browser_event_ScrollEvent.py +++ b/tests/ci/test_browser_event_ScrollEvent.py @@ -83,10 +83,10 @@ class TestScrollActions: """Test basic scroll action functionality.""" # Navigate to scrollable page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/scrollable', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/scrollable', new_tab=False)} class GoToUrlActionModel(ActionModel): - 
go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) diff --git a/tests/ci/test_browser_session_output_paths.py b/tests/ci/test_browser_session_output_paths.py index 9dfa13261..526cd039d 100644 --- a/tests/ci/test_browser_session_output_paths.py +++ b/tests/ci/test_browser_session_output_paths.py @@ -60,7 +60,7 @@ def interactive_llm(httpserver_url): "next_goal": "Navigate to the URL", "action": [ {{ - "go_to_url": {{ + "navigate": {{ "url": "{httpserver_url}", "new_tab": false }} diff --git a/tests/ci/test_browser_watchdog_downloads_upload_full_circle.py b/tests/ci/test_browser_watchdog_downloads_upload_full_circle.py index eb60f8d56..679ef14d4 100644 --- a/tests/ci/test_browser_watchdog_downloads_upload_full_circle.py +++ b/tests/ci/test_browser_watchdog_downloads_upload_full_circle.py @@ -162,10 +162,10 @@ class TestDownloadUploadFullCircle: # Step 1: Navigate to download page class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None result = await tools.act( - GoToUrlActionModel(go_to_url=GoToUrlAction(url=f'{base_url}/download-page', new_tab=False)), browser_session + GoToUrlActionModel(navigate=GoToUrlAction(url=f'{base_url}/download-page', new_tab=False)), browser_session ) assert result.error is None, f'Navigation to download page failed: {result.error}' @@ -228,7 +228,7 @@ class TestDownloadUploadFullCircle: for i, tab in enumerate(tabs_before): print(f' Tab {i}: {tab.url}') result = await tools.act( - GoToUrlActionModel(go_to_url=GoToUrlAction(url=f'{base_url}/upload-page', new_tab=True)), browser_session + GoToUrlActionModel(navigate=GoToUrlAction(url=f'{base_url}/upload-page', new_tab=True)), browser_session ) assert result.error is None, f'Navigation to upload page failed: {result.error}' print(f'✅ Navigation result: {result.extracted_content}') diff --git a/tests/ci/test_tools.py 
b/tests/ci/test_tools.py index e8625a0cf..2a86fac6c 100644 --- a/tests/ci/test_tools.py +++ b/tests/ci/test_tools.py @@ -96,7 +96,7 @@ class TestToolsIntegration: """Test that the registry contains the expected default actions.""" # Check that common actions are registered common_actions = [ - 'go_to_url', + 'navigate', 'search', 'click', 'input_text', @@ -125,10 +125,10 @@ class TestToolsIntegration: return ActionResult(extracted_content=f'Custom action executed with: {params.text} on {current_url}') # Navigate to a page first - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) @@ -209,10 +209,10 @@ class TestToolsIntegration: async def test_go_back_action(self, tools, browser_session, base_url): """Test that go_back action navigates to the previous page.""" # Navigate to first page - goto_action1 = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} + goto_action1 = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action1), browser_session) @@ -221,7 +221,7 @@ class TestToolsIntegration: print(f'First page URL: {first_url}') # Navigate to second page - goto_action2 = {'go_to_url': GoToUrlAction(url=f'{base_url}/page2', new_tab=False)} + goto_action2 = {'navigate': GoToUrlAction(url=f'{base_url}/page2', new_tab=False)} await tools.act(GoToUrlActionModel(**goto_action2), browser_session) # Verify we're on the second page @@ -259,10 +259,10 @@ class TestToolsIntegration: # Navigate to each page in sequence for url in urls: - action_data = {'go_to_url': 
GoToUrlAction(url=url, new_tab=False)} + action_data = {'navigate': GoToUrlAction(url=url, new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**action_data), browser_session) @@ -293,7 +293,7 @@ class TestToolsIntegration: assert 'scroll' not in excluded_tools.registry.registry.actions # But other actions are still there - assert 'go_to_url' in excluded_tools.registry.registry.actions + assert 'navigate' in excluded_tools.registry.registry.actions assert 'click' in excluded_tools.registry.registry.actions async def test_search_action(self, tools, browser_session, base_url): @@ -325,10 +325,10 @@ class TestToolsIntegration: file_system = FileSystem(temp_dir) # First navigate to a page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) @@ -392,10 +392,10 @@ class TestToolsIntegration: ) # Navigate to the dropdown test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/dropdown1', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/dropdown1', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) @@ -519,10 +519,10 @@ class TestToolsIntegration: ) # Navigate to the dropdown test page - goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/dropdown2', new_tab=False)} + goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/dropdown2', new_tab=False)} class GoToUrlActionModel(ActionModel): - go_to_url: GoToUrlAction | None = None + navigate: GoToUrlAction | 
None = None await tools.act(GoToUrlActionModel(**goto_action), browser_session) diff --git a/tests/ci/test_url_shortening.py b/tests/ci/test_url_shortening.py index 6ea45b15d..022e6de23 100644 --- a/tests/ci/test_url_shortening.py +++ b/tests/ci/test_url_shortening.py @@ -93,7 +93,7 @@ class TestUrlShorteningOutputProcessing: 'evaluation_previous_goal': 'Successfully processed the request', 'memory': f'Found useful info at {shortened_url}', 'next_goal': 'Complete the documentation review', - 'action': [{'go_to_url': {'url': shortened_url, 'new_tab': False}}], + 'action': [{'navigate': {'url': shortened_url, 'new_tab': False}}], } # Create properly typed AgentOutput with custom actions @@ -109,7 +109,7 @@ class TestUrlShorteningOutputProcessing: assert SUPER_LONG_URL in (agent_output.thinking or '') assert SUPER_LONG_URL in (agent_output.memory or '') action_data = agent_output.action[0].model_dump() - assert action_data['go_to_url']['url'] == SUPER_LONG_URL + assert action_data['navigate']['url'] == SUPER_LONG_URL class TestUrlShorteningEndToEnd: @@ -137,7 +137,7 @@ class TestUrlShorteningEndToEnd: 'evaluation_previous_goal': 'Starting documentation extraction', 'memory': f'Target URL: {shortened_url}', 'next_goal': 'Extract API documentation', - 'action': [{'go_to_url': {'url': shortened_url, 'new_tab': True}}], + 'action': [{'navigate': {'url': shortened_url, 'new_tab': True}}], } # Create AgentOutput with custom actions @@ -153,8 +153,8 @@ class TestUrlShorteningEndToEnd: assert SUPER_LONG_URL in (agent_output.thinking or '') assert SUPER_LONG_URL in (agent_output.memory or '') action_data = agent_output.action[0].model_dump() - assert action_data['go_to_url']['url'] == SUPER_LONG_URL - assert action_data['go_to_url']['new_tab'] is True + assert action_data['navigate']['url'] == SUPER_LONG_URL + assert action_data['navigate']['new_tab'] is True # Verify original shortened content is no longer present assert shortened_url not in (agent_output.thinking or '') diff 
--git a/tests/scripts/debug_iframe_scrolling.py b/tests/scripts/debug_iframe_scrolling.py index 7b02fba66..62682bed3 100644 --- a/tests/scripts/debug_iframe_scrolling.py +++ b/tests/scripts/debug_iframe_scrolling.py @@ -38,7 +38,7 @@ async def debug_iframe_scrolling(): "next_goal": "Navigate to the iframe test page", "action": [ { - "go_to_url": { + "navigate": { "url": "https://browser-use.github.io/stress-tests/challenges/iframe-inception-level1.html", "new_tab": false } From 0d731cfb26e6f6bd391ee7e5a0d378b746f2e1b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 13:32:16 -0700 Subject: [PATCH 19/45] Replace tool names like go_to_url with navigate --- browser_use/agent/system_prompt_flash.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/agent/system_prompt_flash.md b/browser_use/agent/system_prompt_flash.md index b1ee5da06..376f64609 100644 --- a/browser_use/agent/system_prompt_flash.md +++ b/browser_use/agent/system_prompt_flash.md @@ -7,7 +7,7 @@ Interactive Elements: All interactive elements will be provided in format as [in You must respond with a valid JSON in this exact format: {{ "memory": "Up to 5 sentences of specific reasoning about: Was the previous step successful / failed? What do we need to remember from the current state for the task? Plan ahead what are the best next actions. What's the next immediate goal? Depending on the complexity think longer. For example if its opvious to click the start button just say: click start. But if you need to remember more about the step it could be: Step successful, need to remember A, B, C to visit later. 
Next click on A.", - "action":[{{"go_to_url": {{ "url": "url_value"}}}}] + "action":[{{"navigate": {{ "url": "url_value"}}}}] }} From 47bf973285674cdd4dc3860968bbe2d27123798c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 13:46:31 -0700 Subject: [PATCH 20/45] Default value for extract_links --- browser_use/tools/service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 60b85e082..37f16ace6 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -559,10 +559,10 @@ class Tools(Generic[Context]): ) async def extract( query: str, - extract_links: bool, browser_session: BrowserSession, page_extraction_llm: BaseChatModel, file_system: FileSystem, + extract_links: bool = False, start_from_char: int = 0, ): # Constants From 90d414042a2b6a11700fbc0486eaa5603e8967e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Sun, 5 Oct 2025 15:09:35 -0700 Subject: [PATCH 21/45] Update docs --- browser_use/agent/system_prompt.md | 42 +++++++++---------- .../agent/system_prompt_no_thinking.md | 36 ++++++++-------- browser_use/agent/views.py | 6 +-- browser_use/tools/service.py | 2 +- docs/customize/agent/prompting-guide.mdx | 8 ++-- docs/customize/tools/available.mdx | 32 +++++++------- examples/file_system/file_system.py | 2 +- 7 files changed, 63 insertions(+), 65 deletions(-) diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md index b5fb4616b..146adc23e 100644 --- a/browser_use/agent/system_prompt.md +++ b/browser_use/agent/system_prompt.md @@ -20,8 +20,8 @@ At every step, your input will consist of: 1. : A chronological event stream including your previous actions and their results. 2. : Current , summary of , , and . 3. 
: Current URL, open tabs, interactive elements indexed for actions, and visible page content. -4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before, this will contain a screenshot. -5. This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step. +4. : Screenshot of the browser with bounding boxes around interactive elements. If you used screenshot before, this will contain a screenshot. +5. This will be displayed only if your previous action was extract or read_file. This data is only shown in the current step. @@ -61,14 +61,14 @@ Examples: Note that: - Only elements with numeric indexes in [] are interactive - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index) -- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list. +- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input you might need to select the right option from the list. - Pure text elements without [] are not interactive. -If you used take_screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. +If you used screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. 
This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. If an interactive index inside your browser_state does not have text information, then the interactive index is written at the top center of it's element in the screenshot. -Use take_screenshot if you are unsure or simply want more information. +Use screenshot if you are unsure or simply want more information. @@ -78,18 +78,18 @@ Strictly follow these rules while using the browser and navigating the web: - If research is needed, open a **new tab** instead of reusing the current one. - If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list. - By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page. -- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages). +- You can scroll by a specific number of pages using the pages parameter (e.g., 0.5 for half page, 2.0 for two pages). - If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack). - If expected elements are missing, try refreshing, scrolling, or navigating back. - If the page is not fully loaded, use the wait action. -- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible. -- Call extract_structured_data only if the information you are looking for is not visible in your otherwise always just use the needed text from the . -- Calling the extract_structured_data tool is expensive! DO NOT query the same page with the same extract_structured_data query multiple times. 
Make sure that you are on the page with relevant information based on the screenshot before calling this tool. +- You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible. +- Call extract only if the information you are looking for is not visible in your otherwise always just use the needed text from the . +- Calling the extract tool is expensive! DO NOT query the same page with the same extract query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool. - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. - If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step. - If the includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient. - The is the ultimate goal. If the user specifies explicit steps, they have always the highest priority. -- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion. +- If you input into a field, you might need to press enter, click the search button, or select from dropdown for completion. - Don't login into a page if you don't have to. Don't login if you don't have the credentials. - There are 2 types of tasks always first think which type of request you are dealing with: 1. 
Very specific step by step instructions: @@ -101,7 +101,7 @@ Strictly follow these rules while using the browser and navigating the web: - You have access to a persistent file system which you can use to track progress, store results, and manage long tasks. -- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Use `replace_file_str` tool to update markers in `todo.md` as first action whenever you complete an item. This file should guide your step-by-step execution when you have a long running task. +- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Use `replace_file` tool to update markers in `todo.md` as first action whenever you complete an item. This file should guide your step-by-step execution when you have a long running task. - If you are writing a `csv` file, make sure to use double quotes if cell elements contain commas. - If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary. - If exists, includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access. @@ -138,17 +138,17 @@ If you are allowed multiple actions, you can specify multiple actions in the lis You can output multiple actions in one step. Try to be efficient where it makes sense. Do not predict actions which do not make sense for the current page. 
**Recommended Action Combinations:** -- `input_text` + `click_element_by_index` → Fill form field and submit/search in one step -- `input_text` + `input_text` → Fill multiple form fields -- `click_element_by_index` + `click_element_by_index` → Navigate through multi-step flows (when the page does not navigate between clicks) -- `scroll` with num_pages 10 + `extract_structured_data` → Scroll to the bottom of the page to load more content before extracting structured data -- File operations + browser actions +- `input` + `click` → Fill form field and submit/search in one step +- `input` + `input` → Fill multiple form fields +- `click` + `click` → Navigate through multi-step flows (when the page does not navigate between clicks) +- `scroll` with pages 10 + `extract` → Scroll to the bottom of the page to load more content before extracting structured data +- File operations + browser actions -Do not try multiple different paths in one step. Always have one clear goal per step. -Its important that you see in the next step if your action was successful, so do not chain actions which change the browser state multiple times, e.g. -- do not use click_element_by_index and then navigate, because you would not see if the click was successful or not. -- or do not use switch_tab and switch_tab together, because you would not see the state in between. -- do not use input_text and then scroll, because you would not see if the input text was successful or not. +Do not try multiple different paths in one step. Always have one clear goal per step. +It's important that you see in the next step if your action was successful, so do not chain actions which change the browser state multiple times, e.g. +- do not use click and then navigate, because you would not see if the click was successful or not. +- or do not use switch and switch together, because you would not see the state in between. +- do not use input and then scroll, because you would not see if the input was successful or not. 
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md index efdf8d2fe..d51da1b1e 100644 --- a/browser_use/agent/system_prompt_no_thinking.md +++ b/browser_use/agent/system_prompt_no_thinking.md @@ -20,8 +20,8 @@ At every step, your input will consist of: 1. : A chronological event stream including your previous actions and their results. 2. : Current , summary of , , and . 3. : Current URL, open tabs, interactive elements indexed for actions, and visible page content. -4. : Screenshot of the browser with bounding boxes around interactive elements. If you used take_screenshot before, this will contain a screenshot. -5. This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step. +4. : Screenshot of the browser with bounding boxes around interactive elements. If you used screenshot before, this will contain a screenshot. +5. This will be displayed only if your previous action was extract or read_file. This data is only shown in the current step. @@ -61,14 +61,14 @@ Examples: Note that: - Only elements with numeric indexes in [] are interactive - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index) -- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list. +- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input you might need to select the right option from the list. - Pure text elements without [] are not interactive. 
-If you used take_screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. +If you used screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. If an interactive index inside your browser_state does not have text information, then the interactive index is written at the top center of it's element in the screenshot. -Use take_screenshot if you are unsure or simply want more information. +Use screenshot if you are unsure or simply want more information. @@ -78,18 +78,18 @@ Strictly follow these rules while using the browser and navigating the web: - If research is needed, open a **new tab** instead of reusing the current one. - If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list. - By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page. -- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages). +- You can scroll by a specific number of pages using the pages parameter (e.g., 0.5 for half page, 2.0 for two pages). - If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack). - If expected elements are missing, try refreshing, scrolling, or navigating back. - If the page is not fully loaded, use the wait action. 
-- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible. -- Call extract_structured_data only if the information you are looking for is not visible in your otherwise always just use the needed text from the . -- Calling the extract_structured_data tool is expensive! DO NOT query the same page with the same extract_structured_data query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool. +- You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible. +- Call extract only if the information you are looking for is not visible in your otherwise always just use the needed text from the . +- Calling the extract tool is expensive! DO NOT query the same page with the same extract query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool. - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. - If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step. - If the includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient. - The is the ultimate goal. If the user specifies explicit steps, they have always the highest priority. -- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion. 
+- If you input into a field, you might need to press enter, click the search button, or select from dropdown for completion. - Don't login into a page if you don't have to. Don't login if you don't have the credentials. - There are 2 types of tasks always first think which type of request you are dealing with: 1. Very specific step by step instructions: @@ -101,7 +101,7 @@ Strictly follow these rules while using the browser and navigating the web: - You have access to a persistent file system which you can use to track progress, store results, and manage long tasks. -- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Use `replace_file_str` tool to update markers in `todo.md` as first action whenever you complete an item. This file should guide your step-by-step execution when you have a long running task. +- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Use `replace_file` tool to update markers in `todo.md` as first action whenever you complete an item. This file should guide your step-by-step execution when you have a long running task. - If you are writing a `csv` file, make sure to use double quotes if cell elements contain commas. - If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary. - If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access. @@ -137,17 +137,17 @@ If you are allowed multiple actions, you can specify multiple actions in the lis You can output multiple actions in one step. Try to be efficient where it makes sense. Do not predict actions which do not make sense for the current page.
**Recommended Action Combinations:** -- `input_text` + `click_element_by_index` → Fill form field and submit/search in one step -- `input_text` + `input_text` → Fill multiple form fields -- `click_element_by_index` + `click_element_by_index` → Navigate through multi-step flows (when the page does not navigate between clicks) -- `scroll` with num_pages 10 + `extract_structured_data` → Scroll to the bottom of the page to load more content before extracting structured data +- `input` + `click` → Fill form field and submit/search in one step +- `input` + `input` → Fill multiple form fields +- `click` + `click` → Navigate through multi-step flows (when the page does not navigate between clicks) +- `scroll` with pages 10 + `extract` → Scroll to the bottom of the page to load more content before extracting structured data - File operations + browser actions Do not try multiple different paths in one step. Always have one clear goal per step. Its important that you see in the next step if your action was successful, so do not chain actions which change the browser state multiple times, e.g. -- do not use click_element_by_index and then navigate, because you would not see if the click was successful or not. -- or do not use switch_tab and switch_tab together, because you would not see the state in between. -- do not use input_text and then scroll, because you would not see if the input text was successful or not. +- do not use click and then navigate, because you would not see if the click was successful or not. +- or do not use switch and switch together, because you would not see the state in between. +- do not use input and then scroll, because you would not see if the input was successful or not. 
diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py index 1234471a7..5cb80eb86 100644 --- a/browser_use/agent/views.py +++ b/browser_use/agent/views.py @@ -327,12 +327,10 @@ class AgentHistory(BaseModel): if self.model_output: action_dump = [action.model_dump(exclude_none=True) for action in self.model_output.action] - # Filter sensitive data only from input_text action parameters if sensitive_data is provided + # Filter sensitive data only from input action parameters if sensitive_data is provided if sensitive_data: action_dump = [ - self._filter_sensitive_data_from_dict(action, sensitive_data) - if action.get('name') == 'input_text' - else action + self._filter_sensitive_data_from_dict(action, sensitive_data) if action.get('name') == 'input' else action for action in action_dump ] diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index 37f16ace6..322834656 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -273,7 +273,7 @@ class Tools(Generic[Context]): except BrowserError as e: if 'Cannot click on elements. Use get_dropdown_options(index={element_node.element_index}) action instead.' + msg = f'Cannot click on elements. Use dropdown_options(index={element_node.element_index}) action instead.' + msg = ( + f'Cannot click on