Merge branch 'main' into oracle_oci_integration

This commit is contained in:
Talapally Sandeep Kumar
2025-10-06 11:37:01 +05:30
committed by GitHub
39 changed files with 481 additions and 635 deletions

4
.gitignore vendored
View File

@@ -63,3 +63,7 @@ screenshot.png
all_github_issues_progress.md
all_github_issues.md
todo-input-token.md
TOOL_CHANGES_SUMMARY.md

View File

@@ -167,7 +167,7 @@ products = await page.extract_content(
### Element Methods (DOM Interactions)
- `click(button='left', click_count=1, modifiers=None)` - Click element with advanced fallbacks
- `fill(text: str, clear_existing=True)` - Fill input with text (clears first by default)
- `fill(text: str, clear=True)` - Fill input with text (clears first by default)
- `hover()` - Hover over element
- `focus()` - Focus the element
- `check()` - Toggle checkbox/radio button (clicks to change state)

View File

@@ -349,7 +349,7 @@ class Element:
# Extract key element info for error message
raise RuntimeError(f'Failed to click element: {e}')
async def fill(self, value: str, clear_existing: bool = True) -> None:
async def fill(self, value: str, clear: bool = True) -> None:
"""Fill the input element using proper CDP methods with improved focus handling."""
try:
# Use the existing CDP client and session
@@ -409,7 +409,7 @@ class Element:
)
# Step 2: Clear existing text if requested
if clear_existing and focused_successfully:
if clear and focused_successfully:
cleared_successfully = await self._clear_text_field(
object_id=object_id, cdp_client=cdp_client, session_id=session_id
)

View File

@@ -285,7 +285,7 @@ class MessageManager:
model_output: AgentOutput | None = None,
result: list[ActionResult] | None = None,
step_info: AgentStepInfo | None = None,
use_vision=True,
use_vision: bool | Literal['auto'] = 'auto',
page_filtered_actions: str | None = None,
sensitive_data=None,
available_file_paths: list[str] | None = None, # Always pass current available_file_paths
@@ -305,11 +305,37 @@ class MessageManager:
self.sensitive_data = effective_sensitive_data
self.sensitive_data_description = self._get_sensitive_data_description(browser_state_summary.url)
# Use only the current screenshot
# Use only the current screenshot, but check if action results request screenshot inclusion
screenshots = []
if browser_state_summary.screenshot:
include_screenshot_requested = False
# Check if any action results request screenshot inclusion
if result:
for action_result in result:
if action_result.metadata and action_result.metadata.get('include_screenshot'):
include_screenshot_requested = True
logger.debug('Screenshot inclusion requested by action result')
break
# Handle different use_vision modes:
# - "auto": Only include screenshot if explicitly requested by action (e.g., screenshot)
# - True: Always include screenshot
# - False: Never include screenshot
include_screenshot = False
if use_vision is True:
# Always include screenshot when use_vision=True
include_screenshot = True
elif use_vision == 'auto':
# Only include screenshot if explicitly requested by action when use_vision="auto"
include_screenshot = include_screenshot_requested
# else: use_vision is False, never include screenshot (include_screenshot stays False)
if include_screenshot and browser_state_summary.screenshot:
screenshots.append(browser_state_summary.screenshot)
# Use vision in the user message if screenshots are included
effective_use_vision = len(screenshots) > 0
# Create single state message with all content
assert browser_state_summary
state_message = AgentMessagePrompt(
@@ -327,7 +353,7 @@ class MessageManager:
vision_detail_level=self.vision_detail_level,
include_recent_events=self.include_recent_events,
sample_images=self.sample_images,
).get_user_message(use_vision)
).get_user_message(effective_use_vision)
# Set the state message with caching enabled
self._set_message_with_type(state_message, 'state')

View File

@@ -265,7 +265,9 @@ class AgentMessagePrompt:
# Check if current page is a PDF viewer and add appropriate message
pdf_message = ''
if self.browser_state.is_pdf_viewer:
pdf_message = 'PDF viewer cannot be rendered. In this page, DO NOT use the extract_structured_data action as PDF content cannot be rendered. '
pdf_message = (
'PDF viewer cannot be rendered. In this page, DO NOT use the extract action as PDF content cannot be rendered. '
)
pdf_message += 'Use the read_file action on the downloaded PDF in available_file_paths to read the full text content or scroll in the page to see images/figures if needed.\n\n'
# Add recent events if available and requested

View File

@@ -154,7 +154,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
register_should_stop_callback: Callable[[], Awaitable[bool]] | None = None,
# Agent settings
output_model_schema: type[AgentStructuredOutput] | None = None,
use_vision: bool = True,
use_vision: bool | Literal['auto'] = 'auto',
save_conversation_path: str | Path | None = None,
save_conversation_path_encoding: str | None = 'utf-8',
max_failures: int = 3,
@@ -255,7 +255,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
elif controller is not None:
self.tools = controller
else:
self.tools = Tools(display_files_in_done_text=display_files_in_done_text)
# Exclude screenshot tool when use_vision=False
exclude_actions = ['screenshot'] if use_vision is False else []
self.tools = Tools(exclude_actions=exclude_actions, display_files_in_done_text=display_files_in_done_text)
# Structured output
self.output_model_schema = output_model_schema
@@ -321,7 +323,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
initial_url = self._extract_url_from_task(self.task)
if initial_url:
self.logger.info(f'🔗 Found URL in task: {initial_url}, adding as initial action...')
initial_actions = [{'go_to_url': {'url': initial_url, 'new_tab': False}}]
initial_actions = [{'navigate': {'url': initial_url, 'new_tab': False}}]
self.initial_url = initial_url

View File

@@ -20,8 +20,8 @@ At every step, your input will consist of:
1. <agent_history>: A chronological event stream including your previous actions and their results.
2. <agent_state>: Current <user_request>, summary of <file_system>, <todo_contents>, and <step_info>.
3. <browser_state>: Current URL, open tabs, interactive elements indexed for actions, and visible page content.
4. <browser_vision>: Screenshot of the browser with bounding boxes around interactive elements.
5. <read_state> This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step.
4. <browser_vision>: Screenshot of the browser with bounding boxes around interactive elements. If you used screenshot before, this will contain a screenshot.
5. <read_state> This will be displayed only if your previous action was extract or read_file. This data is only shown in the current step.
</input>
<agent_history>
@@ -61,13 +61,14 @@ Examples:
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list.
- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input you might need to select the right option from the list.
- Pure text elements without [] are not interactive.
</browser_state>
<browser_vision>
You will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress.
If you used screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress.
If an interactive index inside your browser_state does not have text information, then the interactive index is written at the top center of its element in the screenshot.
Use screenshot if you are unsure or simply want more information.
</browser_vision>
<browser_rules>
@@ -77,18 +78,18 @@ Strictly follow these rules while using the browser and navigating the web:
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page.
- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages).
- You can scroll by a specific number of pages using the pages parameter (e.g., 0.5 for half page, 2.0 for two pages).
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
- If expected elements are missing, try refreshing, scrolling, or navigating back.
- If the page is not fully loaded, use the wait action.
- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract_structured_data only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the extract_structured_data tool is expensive! DO NOT query the same page with the same extract_structured_data query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the extract tool is expensive! DO NOT query the same page with the same extract query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
- The <user_request> is the ultimate goal. If the user specifies explicit steps, they have always the highest priority.
- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion.
- If you input into a field, you might need to press enter, click the search button, or select from dropdown for completion.
- Don't login into a page if you don't have to. Don't login if you don't have the credentials.
- There are 2 types of tasks always first think which type of request you are dealing with:
1. Very specific step by step instructions:
@@ -100,7 +101,7 @@ Strictly follow these rules while using the browser and navigating the web:
<file_system>
- You have access to a persistent file system which you can use to track progress, store results, and manage long tasks.
- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Use `replace_file_str` tool to update markers in `todo.md` as first action whenever you complete an item. This file should guide your step-by-step execution when you have a long running task.
- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Use `replace_file` tool to update markers in `todo.md` as first action whenever you complete an item. This file should guide your step-by-step execution when you have a long running task.
- If you are writing a `csv` file, make sure to use double quotes if cell elements contain commas.
- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary.
- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
@@ -137,17 +138,17 @@ If you are allowed multiple actions, you can specify multiple actions in the lis
You can output multiple actions in one step. Try to be efficient where it makes sense. Do not predict actions which do not make sense for the current page.
**Recommended Action Combinations:**
- `input_text` + `click_element_by_index` → Fill form field and submit/search in one step
- `input_text` + `input_text` → Fill multiple form fields
- `click_element_by_index` + `click_element_by_index` → Navigate through multi-step flows (when the page does not navigate between clicks)
- `scroll` with num_pages 10 + `extract_structured_data` → Scroll to the bottom of the page to load more content before extracting structured data
- File operations + browser actions
- `input` + `click` → Fill form field and submit/search in one step
- `input` + `input` → Fill multiple form fields
- `click` + `click` → Navigate through multi-step flows (when the page does not navigate between clicks)
- `scroll` with pages 10 + `extract` → Scroll to the bottom of the page to load more content before extracting structured data
- File operations + browser actions
Do not try multiple different paths in one step. Always have one clear goal per step.
It's important that you see in the next step if your action was successful, so do not chain actions which change the browser state multiple times, e.g.
- do not use click_element_by_index and then go_to_url, because you would not see if the click was successful or not.
- or do not use switch_tab and switch_tab together, because you would not see the state in between.
- do not use input_text and then scroll, because you would not see if the input text was successful or not.
Do not try multiple different paths in one step. Always have one clear goal per step.
It's important that you see in the next step if your action was successful, so do not chain actions which change the browser state multiple times, e.g.
- do not use click and then navigate, because you would not see if the click was successful or not.
- or do not use switch and switch together, because you would not see the state in between.
- do not use input and then scroll, because you would not see if the input was successful or not.
</efficiency_guidelines>
<reasoning_rules>
@@ -209,7 +210,7 @@ You must ALWAYS respond with a valid JSON in this exact format:
"evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
"memory": "1-3 sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
"next_goal": "State the next immediate goal and action to achieve it, in one clear sentence."
"action":[{{"go_to_url": {{ "url": "url_value"}}}}, // ... more actions in sequence]
"action":[{{"navigate": {{ "url": "url_value"}}}}, // ... more actions in sequence]
}}
Action list should NEVER be empty.

View File

@@ -1,177 +1,35 @@
You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
<intro>
You excel at following tasks:
1. Navigating complex websites and extracting precise information
2. Automating form submissions and interactive web actions
3. Gathering and saving information
4. Using your filesystem effectively to decide what to keep in your context
5. Operate effectively in an agent loop
6. Efficiently performing diverse web tasks
</intro>
<language_settings>
- Default working language: **English**
- Always respond in the same language as the user request
Default: English. Match user's language.
</language_settings>
<input>
At every step, your input will consist of:
1. <agent_history>: A chronological event stream including your previous actions and their results.
2. <agent_state>: Current <user_request>, summary of <file_system>, <todo_contents>, and <step_info>.
3. <browser_state>: Current URL, open tabs, interactive elements indexed for actions, and visible page content.
4. <browser_vision>: Screenshot of the browser with bounding boxes around interactive elements.
5. <read_state> This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step.
</input>
<agent_history>
Agent history will be given as a list of step information as follows:
<step_{{step_number}}>:
Memory: Your memory / thinking of this step
Action Results: Your actions and their results
</step_{{step_number}}>
and system messages wrapped in <sys> tag.
</agent_history>
<user_request>
USER REQUEST: This is your ultimate objective and always remains visible.
- This has the highest priority. Make the user happy.
- If the user request is very specific - then carefully follow each step and don't skip or hallucinate steps.
- If the task is open ended you can plan yourself how to get it done.
Ultimate objective. Specific tasks: follow each step. Open-ended: plan approach.
</user_request>
<browser_state>
1. Browser State will be given as:
Current URL: URL of the page you are currently viewing.
Open Tabs: Open tabs with their ids.
Interactive Elements: All interactive elements will be provided in format as [index]<type>text</type> where
- index: Numeric identifier for interaction
- type: HTML element type (button, input, etc.)
- text: Element description
Examples:
[33]<div>User form</div>
\t*[35]<button aria-label='Submit form'>Submit</button>
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list.
- Pure text elements without [] are not interactive.
Elements: [index]<type>text</type>. Only [indexed] are interactive. Indentation=child. *[=new.
</browser_state>
<browser_vision>
You will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress.
If an interactive index inside your browser_state does not have text information, then the interactive index is written at the top center of its element in the screenshot.
</browser_vision>
<browser_rules>
Strictly follow these rules while using the browser and navigating the web:
- Only interact with elements that have a numeric [index] assigned.
- Only use indexes that are explicitly provided.
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page.
- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages).
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
- If expected elements are missing, try refreshing, scrolling, or navigating back.
- If the page is not fully loaded, use the wait action.
- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract_structured_data only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the extract_structured_data tool is expensive! DO NOT query the same page with the same extract_structured_data query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
- The <user_request> is the ultimate goal. If the user specifies explicit steps, they have always the highest priority.
- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion.
- Don't login into a page if you don't have to. Don't login if you don't have the credentials.
- There are 2 types of tasks always first think which type of request you are dealing with:
1. Very specific step by step instructions:
- Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
- If you get stuck, e.g. with logins or captchas in open-ended tasks, you can re-evaluate the task and try alternative ways, e.g. sometimes a login popup appears accidentally even though some part of the page is still accessible, or you can get the information via web search.
- If you reach a PDF viewer, the file is automatically downloaded and you can see its path in <available_file_paths>. You can either read the file or scroll in the page to see more.
</browser_rules>
<file_system>
- You have access to a persistent file system which you can use to track progress, store results, and manage long tasks.
- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Use `replace_file_str` tool to update markers in `todo.md` as first action whenever you complete an item. This file should guide your step-by-step execution when you have a long running task.
- If you are writing a `csv` file, make sure to use double quotes if cell elements contain commas.
- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary.
- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
- If the task is really long, initialize a `results.md` file to accumulate your results.
- DO NOT use the file system if the task is less than 10 steps!
- PDFs auto-download to available_file_paths. Read file or scroll viewer.
Persistent file system for progress tracking.
Long tasks (>10 steps): use todo.md as a checklist for subtasks; update it with replace_file when completing items.
CSV: use double quotes for commas.
available_file_paths: downloaded/user files (read/upload only).
</file_system>
<task_completion_rules>
You must call the `done` action in one of three cases:
- When you have fully completed the USER REQUEST.
- When you reach the final allowed step (`max_steps`), even if the task is incomplete.
- If it is ABSOLUTELY IMPOSSIBLE to continue.
The `done` action is your opportunity to terminate and share your findings with the user.
- Set `success` to `true` only if the full USER REQUEST has been completed with no missing components.
- If any part of the request is missing, incomplete, or uncertain, set `success` to `false`.
- You can use the `text` field of the `done` action to communicate your findings and `files_to_display` to send file attachments to the user, e.g. `["results.md"]`.
- Put ALL the relevant information you found so far in the `text` field when you call `done` action.
- Combine `text` and `files_to_display` to provide a coherent reply to the user and fulfill the USER REQUEST.
- You are ONLY ALLOWED to call `done` as a single action. Don't call it together with other actions.
- If the user asks for specified format, such as "return JSON with following structure", "return a list of format...", MAKE sure to use the right format in your answer.
- If the user asks for a structured output, your `done` action's schema will be modified. Take this schema into account when solving the task!
</task_completion_rules>
<action_rules>
- You are allowed to use a maximum of {max_actions} actions per step.
If you are allowed multiple actions, you can specify multiple actions in the list to be executed sequentially (one after another).
- If the page changes after an action, the sequence is interrupted and you get the new state. You can see this in your agent history when this happens.
</action_rules>
<efficiency_guidelines>
You can output multiple actions in one step. Try to be efficient where it makes sense. Do not predict actions which do not make sense for the current page.
**Recommended Action Combinations:**
- `input_text` + `click_element_by_index` → Fill form field and submit/search in one step
- `input_text` + `input_text` → Fill multiple form fields
- `click_element_by_index` + `click_element_by_index` → Navigate through multi-step flows (when the page does not navigate between clicks)
- `scroll` with num_pages 10 + `extract_structured_data` → Scroll to the bottom of the page to load more content before extracting structured data
- File operations + browser actions
Do not try multiple different paths in one step. Always have one clear goal per step.
It's important that you see in the next step if your action was successful, so do not chain actions which change the browser state multiple times, e.g.
- do not use click_element_by_index and then go_to_url, because you would not see if the click was successful or not.
- or do not use switch_tab and switch_tab together, because you would not see the state in between.
- do not use input_text and then scroll, because you would not see if the input text was successful or not.
</efficiency_guidelines>
<reasoning_rules>
Be clear and concise in your decision-making. Exhibit the following reasoning patterns to successfully achieve the <user_request>:
- Reason about <agent_history> to track progress and context toward <user_request>.
- Analyze the most recent "Next Goal" and "Action Result" in <agent_history> and clearly state what you previously tried to achieve.
- Analyze all relevant items in <agent_history>, <browser_state>, <read_state>, <file_system> and the screenshot to understand your state.
- Explicitly judge success/failure/uncertainty of the last action. Never assume an action succeeded just because it appears to be executed in your last step in <agent_history>. For example, you might have "Action 1/1: Input '2025-05-05' into element 3." in your history even though inputting text failed. Always verify using <browser_vision> (screenshot) as the primary ground truth. If a screenshot is unavailable, fall back to <browser_state>. If the expected change is missing, mark the last action as failed (or uncertain) and plan a recovery.
- If todo.md is empty and the task is multi-step, generate a stepwise plan in todo.md using file tools.
- Analyze `todo.md` to guide and track your progress.
- If any todo.md items are finished, mark them as complete in the file.
- Analyze whether you are stuck, e.g. when you repeat the same actions multiple times without any progress. Then consider alternative approaches e.g. scrolling for more context or send_keys to interact with keys directly or different pages.
- Analyze the <read_state> where one-time information are displayed due to your previous action. Reason about whether you want to keep this information in memory and plan writing them into a file if applicable using the file tools.
- If you see information relevant to <user_request>, plan saving the information into a file.
- Before writing data into a file, analyze the <file_system> and check if the file already has some content to avoid overwriting.
- Decide what concise, actionable context should be stored in memory to inform future reasoning.
- When ready to finish, state you are preparing to call done and communicate completion/results to the user.
- Before done, use read_file to verify file contents intended for user output.
- Always reason about the <user_request>. Make sure to carefully analyze the specific steps and information required. E.g. specific filters, specific form fields, specific information to search. Make sure to always compare the current trajectory with the user request and think carefully if that's how the user requested it.
</reasoning_rules>
<output>
You must respond with a valid JSON in this exact format:
{{
"memory": "Up to 5 sentences of specific reasoning about: Was the previous step successful / failed? What do we need to remember from the current state for the task? Plan ahead what are the best next actions. What's the next immediate goal? Depending on the complexity think longer. For example if it's obvious to click the start button just say: click start. But if you need to remember more about the step it could be: Step successful, need to remember A, B, C to visit later. Next click on A.",
"action":[{{"go_to_url": {{ "url": "url_value"}}}}]
"action":[{{"navigate": {{ "url": "url_value"}}}}]
}}
Action list should NEVER be empty.
</output>

View File

@@ -20,8 +20,8 @@ At every step, your input will consist of:
1. <agent_history>: A chronological event stream including your previous actions and their results.
2. <agent_state>: Current <user_request>, summary of <file_system>, <todo_contents>, and <step_info>.
3. <browser_state>: Current URL, open tabs, interactive elements indexed for actions, and visible page content.
4. <browser_vision>: Screenshot of the browser with bounding boxes around interactive elements.
5. <read_state> This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step.
4. <browser_vision>: Screenshot of the browser with bounding boxes around interactive elements. If you used screenshot before, this will contain a screenshot.
5. <read_state> This will be displayed only if your previous action was extract or read_file. This data is only shown in the current step.
</input>
<agent_history>
@@ -61,13 +61,14 @@ Examples:
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list.
- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input you might need to select the right option from the list.
- Pure text elements without [] are not interactive.
</browser_state>
<browser_vision>
You will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress.
If you used screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress.
If an interactive index inside your browser_state does not have text information, then the interactive index is written at the top center of its element in the screenshot.
Use screenshot if you are unsure or simply want more information.
</browser_vision>
<browser_rules>
@@ -77,18 +78,18 @@ Strictly follow these rules while using the browser and navigating the web:
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page.
- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages).
- You can scroll by a specific number of pages using the pages parameter (e.g., 0.5 for half page, 2.0 for two pages).
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
- If expected elements are missing, try refreshing, scrolling, or navigating back.
- If the page is not fully loaded, use the wait action.
- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract_structured_data only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the extract_structured_data tool is expensive! DO NOT query the same page with the same extract_structured_data query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the extract tool is expensive! DO NOT query the same page with the same extract query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
- The <user_request> is the ultimate goal. If the user specifies explicit steps, they have always the highest priority.
- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion.
- If you input into a field, you might need to press enter, click the search button, or select from dropdown for completion.
- Don't login into a page if you don't have to. Don't login if you don't have the credentials.
- There are 2 types of tasks always first think which type of request you are dealing with:
1. Very specific step by step instructions:
@@ -100,7 +101,7 @@ Strictly follow these rules while using the browser and navigating the web:
<file_system>
- You have access to a persistent file system which you can use to track progress, store results, and manage long tasks.
- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Use `replace_file_str` tool to update markers in `todo.md` as first action whenever you complete an item. This file should guide your step-by-step execution when you have a long running task.
- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Use `replace_file` tool to update markers in `todo.md` as first action whenever you complete an item. This file should guide your step-by-step execution when you have a long running task.
- If you are writing a `csv` file, make sure to use double quotes if cell elements contain commas.
- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary.
- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
@@ -136,17 +137,17 @@ If you are allowed multiple actions, you can specify multiple actions in the lis
You can output multiple actions in one step. Try to be efficient where it makes sense. Do not predict actions which do not make sense for the current page.
**Recommended Action Combinations:**
- `input_text` + `click_element_by_index` → Fill form field and submit/search in one step
- `input_text` + `input_text` → Fill multiple form fields
- `click_element_by_index` + `click_element_by_index` → Navigate through multi-step flows (when the page does not navigate between clicks)
- `scroll` with num_pages 10 + `extract_structured_data` → Scroll to the bottom of the page to load more content before extracting structured data
- `input` + `click` → Fill form field and submit/search in one step
- `input` + `input` → Fill multiple form fields
- `click` + `click` → Navigate through multi-step flows (when the page does not navigate between clicks)
- `scroll` with pages 10 + `extract` → Scroll to the bottom of the page to load more content before extracting structured data
- File operations + browser actions
Do not try multiple different paths in one step. Always have one clear goal per step.
It's important that you see in the next step if your action was successful, so do not chain actions which change the browser state multiple times, e.g.
- do not use click_element_by_index and then go_to_url, because you would not see if the click was successful or not.
- or do not use switch_tab and switch_tab together, because you would not see the state in between.
- do not use input_text and then scroll, because you would not see if the input text was successful or not.
- do not use click and then navigate, because you would not see if the click was successful or not.
- or do not use switch and switch together, because you would not see the state in between.
- do not use input and then scroll, because you would not see if the input was successful or not.
</efficiency_guidelines>
<reasoning_rules>
@@ -205,7 +206,7 @@ You must ALWAYS respond with a valid JSON in this exact format:
"evaluation_previous_goal": "One-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
"memory": "1-3 sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
"next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
"action":[{{"go_to_url": {{ "url": "url_value"}}}}, // ... more actions in sequence]
"action":[{{"navigate": {{ "url": "url_value"}}}}, // ... more actions in sequence]
}}
Action list should NEVER be empty.

View File

@@ -33,7 +33,7 @@ logger = logging.getLogger(__name__)
class AgentSettings(BaseModel):
"""Configuration options for the Agent"""
use_vision: bool = True
use_vision: bool | Literal['auto'] = 'auto'
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto'
save_conversation_path: str | Path | None = None
save_conversation_path_encoding: str | None = 'utf-8'
@@ -155,7 +155,6 @@ class AgentOutput(BaseModel):
next_goal: str | None = None
action: list[ActionModel] = Field(
...,
description='List of actions to execute',
json_schema_extra={'min_items': 1}, # Ensure at least one action is provided
)
@@ -188,7 +187,6 @@ class AgentOutput(BaseModel):
),
__module__=AgentOutput.__module__,
)
model_.__doc__ = 'AgentOutput model with custom actions'
return model_
@staticmethod
@@ -208,12 +206,11 @@ class AgentOutput(BaseModel):
__base__=AgentOutputNoThinking,
action=(
list[custom_actions], # type: ignore
Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}),
Field(..., json_schema_extra={'min_items': 1}),
),
__module__=AgentOutputNoThinking.__module__,
)
model.__doc__ = 'AgentOutput model with custom actions'
return model
@staticmethod
@@ -237,12 +234,11 @@ class AgentOutput(BaseModel):
__base__=AgentOutputFlashMode,
action=(
list[custom_actions], # type: ignore
Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}),
Field(..., json_schema_extra={'min_items': 1}),
),
__module__=AgentOutputFlashMode.__module__,
)
model.__doc__ = 'AgentOutput model with custom actions'
return model
@@ -331,12 +327,10 @@ class AgentHistory(BaseModel):
if self.model_output:
action_dump = [action.model_dump(exclude_none=True) for action in self.model_output.action]
# Filter sensitive data only from input_text action parameters if sensitive_data is provided
# Filter sensitive data only from input action parameters if sensitive_data is provided
if sensitive_data:
action_dump = [
self._filter_sensitive_data_from_dict(action, sensitive_data)
if action.get('name') == 'input_text'
else action
self._filter_sensitive_data_from_dict(action, sensitive_data) if 'input' in action else action
for action in action_dump
]

View File

@@ -130,7 +130,7 @@ class ClickElementEvent(ElementSelectedEvent[dict[str, Any] | None]):
button: Literal['left', 'right', 'middle'] = 'left'
while_holding_ctrl: bool = Field(
default=False,
description='Set True to open any link clicked in a new tab in the background, can use switch_tab(tab_id=None) after to focus it',
description='Set True to open any link clicked in a new tab in the background, can use switch(tab_id=None) after to focus it',
)
# click_count: int = 1 # TODO
# expect_download: bool = False # moved to downloads_watchdog.py
@@ -143,7 +143,7 @@ class TypeTextEvent(ElementSelectedEvent[dict | None]):
node: 'EnhancedDOMTreeNode'
text: str
clear_existing: bool = True
clear: bool = True
is_sensitive: bool = False # Flag to indicate if text contains sensitive data
sensitive_key_name: str | None = None # Name of the sensitive key being typed (e.g., 'username', 'password')

View File

@@ -103,7 +103,7 @@ class DefaultActionWatchdog(BaseWatchdog):
# so we need to switch to the new tab to make the agent aware of the surprise new tab that was opened.
# when while_holding_ctrl=True we dont actually want to switch to it,
# we should match human expectations of ctrl+click which opens in the background,
# so in multi_act it usually already sends [click_element_by_index(123, while_holding_ctrl=True), switch_tab(tab_id=None)] anyway
# so in multi_act it usually already sends [click_element_by_index(123, while_holding_ctrl=True), switch(tab_id=None)] anyway
from browser_use.browser.events import SwitchTabEvent
new_target_id = new_target_ids.pop()
@@ -144,7 +144,7 @@ class DefaultActionWatchdog(BaseWatchdog):
input_metadata = await self._input_text_element_node_impl(
element_node,
event.text,
clear_existing=event.clear_existing or (not event.text),
clear=event.clear or (not event.text),
is_sensitive=event.is_sensitive,
)
# Log with sensitive data protection
@@ -258,7 +258,9 @@ class DefaultActionWatchdog(BaseWatchdog):
element_type = element_node.attributes.get('type', '').lower() if element_node.attributes else ''
if tag_name == 'select':
msg = f'Cannot click on <select> elements. Use get_dropdown_options(index={element_node.element_index}) action instead.'
msg = (
f'Cannot click on <select> elements. Use dropdown_options(index={element_node.element_index}) action instead.'
)
self.logger.warning(msg)
raise BrowserError(
message=msg,
@@ -992,7 +994,7 @@ class DefaultActionWatchdog(BaseWatchdog):
return False
async def _input_text_element_node_impl(
self, element_node: EnhancedDOMTreeNode, text: str, clear_existing: bool = True, is_sensitive: bool = False
self, element_node: EnhancedDOMTreeNode, text: str, clear: bool = True, is_sensitive: bool = False
) -> dict | None:
"""
Input text into an element using pure CDP with improved focus fallbacks.
@@ -1055,7 +1057,7 @@ class DefaultActionWatchdog(BaseWatchdog):
)
# Step 2: Clear existing text if requested
if clear_existing and focused_successfully:
if clear and focused_successfully:
cleared_successfully = await self._clear_text_field(object_id=object_id, cdp_session=cdp_session)
if not cleared_successfully:
self.logger.warning('⚠️ Text field clearing failed, typing may append to existing text')
@@ -2001,7 +2003,9 @@ class DefaultActionWatchdog(BaseWatchdog):
msg = f'Found {dropdown_type} dropdown ({element_info}):\n' + '\n'.join(formatted_options)
else:
msg = f'Found {dropdown_type} dropdown in {source_info} ({element_info}):\n' + '\n'.join(formatted_options)
msg += f'\n\nUse the exact text or value string (without quotes) in select_dropdown_option(index={index_for_logging}, text=...)'
msg += (
f'\n\nUse the exact text or value string (without quotes) in select_dropdown(index={index_for_logging}, text=...)'
)
if source_info == 'target':
self.logger.info(f'📋 Found {len(dropdown_data["options"])} dropdown options for index {index_for_logging}')

View File

@@ -677,7 +677,7 @@ class DOMTreeSerializer:
# 5. Keep if has role suggesting interactivity
if node.original_node.attributes:
role = node.original_node.attributes.get('role')
if role in ['button', 'link', 'checkbox', 'radio', 'tab', 'menuitem']:
if role in ['button', 'link', 'checkbox', 'radio', 'tab', 'menuitem', 'option']:
return False
# Default: exclude this child

View File

@@ -255,7 +255,7 @@ class ChatGoogle(BaseChatModel):
self.logger.debug(f'🔧 Requesting structured output for {output_format.__name__}')
config['response_mime_type'] = 'application/json'
# Convert Pydantic model to Gemini-compatible schema
optimized_schema = SchemaOptimizer.create_optimized_json_schema(output_format)
optimized_schema = SchemaOptimizer.create_gemini_optimized_schema(output_format)
gemini_schema = self._fix_gemini_schema(optimized_schema)
config['response_schema'] = gemini_schema

View File

@@ -48,9 +48,10 @@ class SchemaOptimizer:
if key == 'title' and not in_properties:
continue
# Preserve FULL descriptions without truncation
# Preserve FULL descriptions without truncation, skip empty ones
elif key == 'description':
optimized[key] = value
if value: # Only include non-empty descriptions
optimized[key] = value
# Handle type field
elif key == 'type':
@@ -159,3 +160,31 @@ class SchemaOptimizer:
elif isinstance(schema, list):
for item in schema:
SchemaOptimizer._make_strict_compatible(item)
@staticmethod
def create_gemini_optimized_schema(model: type[BaseModel]) -> dict[str, Any]:
    """
    Create Gemini-optimized schema that removes 'required' arrays to save tokens.

    Starts from ``SchemaOptimizer.create_optimized_json_schema(model)`` and returns a
    rebuilt copy in which every JSON Schema ``required`` keyword (a list of field
    names) is dropped, at any nesting depth. Original rationale: Gemini can infer
    required fields from context since all fields are required.

    A model field that merely happens to be *named* ``required`` is preserved: its
    entry under ``properties`` maps to a schema dict, not a list, so it is not
    mistaken for the keyword.

    Args:
        model: The Pydantic model to optimize

    Returns:
        Optimized schema without required arrays
    """
    # Start with standard optimized schema
    schema = SchemaOptimizer.create_optimized_json_schema(model)

    def remove_required_arrays(obj: Any) -> Any:
        """Recursively rebuild ``obj`` without list-valued 'required' keys."""
        if isinstance(obj, dict):
            # Only the keyword form (a list of names) is removed; a dict-valued
            # 'required' entry is a property definition and must be kept.
            return {
                k: remove_required_arrays(v)
                for k, v in obj.items()
                if not (k == 'required' and isinstance(v, list))
            }
        if isinstance(obj, list):
            return [remove_required_arrays(item) for item in obj]
        return obj

    return remove_required_arrays(schema)

View File

@@ -811,7 +811,7 @@ class BrowserUseServer:
state = await self.browser_session.get_browser_state_summary()
# Use the extract_structured_data action
# Use the extract action
# Create a dynamic action model that matches the tools's expectations
from pydantic import create_model
@@ -819,13 +819,13 @@ class BrowserUseServer:
ExtractAction = create_model(
'ExtractAction',
__base__=ActionModel,
extract_structured_data=dict[str, Any],
extract=dict[str, Any],
)
# Use model_validate because Pyright does not understand the dynamic model
action = ExtractAction.model_validate(
{
'extract_structured_data': {'query': query, 'extract_links': extract_links},
'extract': {'query': query, 'extract_links': extract_links},
}
)
action_result = await self.tools.act(

View File

@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from collections.abc import Sequence
from dataclasses import asdict, dataclass
from typing import Any
from typing import Any, Literal
from browser_use.config import is_running_in_docker
@@ -29,7 +29,7 @@ class AgentTelemetryEvent(BaseTelemetryEvent):
model_provider: str
max_steps: int
max_actions_per_step: int
use_vision: bool
use_vision: bool | Literal['auto']
version: str
source: str
cdp_url: str | None

View File

@@ -350,12 +350,12 @@ class Registry(Generic[Context]):
'browser_session': browser_session,
'page_extraction_llm': page_extraction_llm,
'available_file_paths': available_file_paths,
'has_sensitive_data': action_name == 'input_text' and bool(sensitive_data),
'has_sensitive_data': action_name == 'input' and bool(sensitive_data),
'file_system': file_system,
}
# Only pass sensitive_data to actions that explicitly need it (input_text)
if action_name == 'input_text':
# Only pass sensitive_data to actions that explicitly need it (input)
if action_name == 'input':
special_context['sensitive_data'] = sensitive_data
# Add CDP-related parameters if browser_session is available
@@ -538,8 +538,6 @@ class Registry(Generic[Context]):
union_type = Union[tuple(individual_action_models)] # type: ignore : Typing doesn't understand that the length is >= 2 (by design)
class ActionModelUnion(RootModel[union_type]): # type: ignore
"""Union of all available action models that maintains ActionModel interface"""
def get_index(self) -> int | None:
"""Delegate get_index to the underlying action model"""
if hasattr(self.root, 'get_index'):

View File

@@ -38,8 +38,8 @@ from browser_use.tools.views import (
CloseTabAction,
DoneAction,
GetDropdownOptionsAction,
GoToUrlAction,
InputTextAction,
NavigateAction,
NoParamsAction,
ScrollAction,
SearchAction,
@@ -115,7 +115,7 @@ class Tools(Generic[Context]):
# Basic Navigation Actions
@self.registry.action(
'Search a query with search engine which defaults to DuckDuckGo. Dont specify search_engine unless user asks for different search engine. Available search engines: duckduckgo, google, bing.',
'',
param_model=SearchAction,
)
async def search(params: SearchAction, browser_session: BrowserSession):
@@ -131,10 +131,10 @@ class Tools(Generic[Context]):
'bing': f'https://www.bing.com/search?q={encoded_query}',
}
if params.search_engine.lower() not in search_engines:
return ActionResult(error=f'Unsupported search engine: {params.search_engine}. Options: duckduckgo, google, bing')
if params.engine.lower() not in search_engines:
return ActionResult(error=f'Unsupported search engine: {params.engine}. Options: duckduckgo, google, bing')
search_url = search_engines[params.search_engine.lower()]
search_url = search_engines[params.engine.lower()]
# Simple tab logic: use current tab by default
use_new_tab = False
@@ -149,19 +149,19 @@ class Tools(Generic[Context]):
)
await event
await event.event_result(raise_if_any=True, raise_if_none=False)
memory = f"Searched {params.search_engine.title()} for '{params.query}'"
memory = f"Searched {params.engine.title()} for '{params.query}'"
msg = f'🔍 {memory}'
logger.info(msg)
return ActionResult(extracted_content=memory, long_term_memory=memory)
except Exception as e:
logger.error(f'Failed to search {params.search_engine}: {e}')
return ActionResult(error=f'Failed to search {params.search_engine} for "{params.query}": {str(e)}')
logger.error(f'Failed to search {params.engine}: {e}')
return ActionResult(error=f'Failed to search {params.engine} for "{params.query}": {str(e)}')
@self.registry.action(
'Navigate to URL, optionally set new_tab=True to open in new tab, otherwise default is False.',
param_model=GoToUrlAction,
'',
param_model=NavigateAction,
)
async def go_to_url(params: GoToUrlAction, browser_session: BrowserSession):
async def navigate(params: NavigateAction, browser_session: BrowserSession):
try:
# Dispatch navigation event
event = browser_session.event_bus.dispatch(NavigateToUrlEvent(url=params.url, new_tab=params.new_tab))
@@ -204,7 +204,7 @@ class Tools(Generic[Context]):
# Return error in ActionResult instead of re-raising
return ActionResult(error=f'Navigation failed: {str(e)}')
@self.registry.action('Go back', param_model=NoParamsAction)
@self.registry.action('', param_model=NoParamsAction)
async def go_back(_: NoParamsAction, browser_session: BrowserSession):
try:
event = browser_session.event_bus.dispatch(GoBackEvent())
@@ -218,9 +218,7 @@ class Tools(Generic[Context]):
error_msg = f'Failed to go back: {str(e)}'
return ActionResult(error=error_msg)
@self.registry.action(
'Wait for x seconds (default 3) (max 30 seconds). This can be used to wait until the page is fully loaded.'
)
@self.registry.action('')
async def wait(seconds: int = 3):
# Cap wait time at maximum 30 seconds
# Reduce the wait time by 3 seconds to account for the llm call which takes at least 3 seconds
@@ -236,7 +234,7 @@ class Tools(Generic[Context]):
# Element Interaction Actions
@self.registry.action(
'Click an element by index. Only indices from your browser_state are allowed. Never use an index that is not inside your current browser_state. Optionally set ctrl=True to open any resulting navigation in a new tab.',
'',
param_model=ClickElementAction,
)
async def click(params: ClickElementAction, browser_session: BrowserSession):
@@ -275,7 +273,7 @@ class Tools(Generic[Context]):
except BrowserError as e:
if 'Cannot click on <select> elements.' in str(e):
try:
return await get_dropdown_options(
return await dropdown_options(
params=GetDropdownOptionsAction(index=params.index), browser_session=browser_session
)
except Exception as dropdown_error:
@@ -290,10 +288,10 @@ class Tools(Generic[Context]):
return ActionResult(error=error_msg)
@self.registry.action(
'Input text into an input interactive element. Only input text into indices that are inside your current browser_state and are valid input fields.',
'',
param_model=InputTextAction,
)
async def input_text(
async def input(
params: InputTextAction,
browser_session: BrowserSession,
has_sensitive_data: bool = False,
@@ -315,7 +313,7 @@ class Tools(Generic[Context]):
TypeTextEvent(
node=node,
text=params.text,
clear_existing=params.clear_existing,
clear=params.clear,
is_sensitive=has_sensitive_data,
sensitive_key_name=sensitive_key_name,
)
@@ -352,7 +350,7 @@ class Tools(Generic[Context]):
return ActionResult(error=error_msg)
@self.registry.action(
'Upload file to interactive element with file path. Only upload files to indices that are inside your current browser_state and are valid file upload fields.',
'',
param_model=UploadFileAction,
)
async def upload_file(
@@ -503,8 +501,8 @@ class Tools(Generic[Context]):
# Tab Management Actions
@self.registry.action('Switch to tab with tab_id.', param_model=SwitchTabAction)
async def switch_tab(params: SwitchTabAction, browser_session: BrowserSession):
@self.registry.action('', param_model=SwitchTabAction)
async def switch(params: SwitchTabAction, browser_session: BrowserSession):
# Simple switch tab logic
try:
target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
@@ -525,8 +523,8 @@ class Tools(Generic[Context]):
memory = f'Attempted to switch to tab #{params.tab_id}'
return ActionResult(extracted_content=memory, long_term_memory=memory)
@self.registry.action('Close an existing tab', param_model=CloseTabAction)
async def close_tab(params: CloseTabAction, browser_session: BrowserSession):
@self.registry.action('', param_model=CloseTabAction)
async def close(params: CloseTabAction, browser_session: BrowserSession):
# Simple close tab logic
try:
target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
@@ -557,27 +555,14 @@ class Tools(Generic[Context]):
# This action is temporarily disabled as it needs refactoring to use events
@self.registry.action(
"""This tool sends the markdown of the current page with the query to an LLM to extract structured, semantic data (e.g. product description, price, all information about XYZ) from the markdown of the current webpage based on a query.
Only use when:
- You are sure that you are on the right page for the query
- You know exactly the information you need to extract from the page
- You did not previously call this tool on the same page
You can not use this tool to:
- Get interactive elements like buttons, links, dropdowns, menus, etc.
- If you previously asked extract_structured_data on the same page with the same query, you should not call it again.
Set extract_links=True only if your query requires extracting links/URLs from the page.
Use start_from_char to start extraction from a specific character position (use if extraction was previously truncated and you want more content).
If this tool does not return the desired outcome, do not call it again, use scroll_to_text or scroll to find the desired information.
""",
"""LLM extracts structured data from page markdown. Use when: on right page, know what to extract, haven't called before on same page+query. Can't get interactive elements. Set extract_links=True for URLs. Use start_from_char if truncated. If fails, use find_text/scroll instead.""",
)
async def extract_structured_data(
async def extract(
query: str,
extract_links: bool,
browser_session: BrowserSession,
page_extraction_llm: BaseChatModel,
file_system: FileSystem,
extract_links: bool = False,
start_from_char: int = 0,
):
# Constants
@@ -690,11 +675,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
raise RuntimeError(str(e))
@self.registry.action(
"""Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 10.0 for ten pages, etc.).
Default behavior is to scroll by one page. This is enough for most cases.
Optionally, if there are multiple scroll containers, use frame_element_index parameter with an element inside the container you want to scroll in. For that you must use indices that exist in your browser_state (works well for dropdowns and custom UI components).
If you need to get to the bottom of the page, use a high number of pages at once like 10 to get to the bottom of the page.
Note: For multiple pages (>=1.0), scrolls are performed one page at a time to ensure reliability. Page height is detected from viewport, fallback is 1000px per page.""",
"""Scroll by pages (down=True/False, pages=0.5-10.0, default 1.0). Use index for scroll containers (dropdowns/custom UI). High pages (10) reaches bottom. Multi-page scrolls sequentially. Viewport-based height, fallback 1000px/page.""",
param_model=ScrollAction,
)
async def scroll(params: ScrollAction, browser_session: BrowserSession):
@@ -702,19 +683,15 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en
# Look up the node from the selector map if index is provided
# Special case: index 0 means scroll the whole page (root/body element)
node = None
if params.frame_element_index is not None and params.frame_element_index != 0:
node = await browser_session.get_element_by_index(params.frame_element_index)
if params.index is not None and params.index != 0:
node = await browser_session.get_element_by_index(params.index)
if node is None:
# Element does not exist
msg = f'Element index {params.frame_element_index} not found in browser state'
msg = f'Element index {params.index} not found in browser state'
return ActionResult(error=msg)
direction = 'down' if params.down else 'up'
target = (
'the page'
if params.frame_element_index is None or params.frame_element_index == 0
else f'element {params.frame_element_index}'
)
target = 'the page' if params.index is None or params.index == 0 else f'element {params.index}'
# Get actual viewport height for more accurate scrolling
try:
@@ -734,11 +711,11 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en
logger.debug(f'Failed to get viewport height, using fallback 1000px: {e}')
# For multiple pages (>=1.0), scroll one page at a time to ensure each scroll completes
if params.num_pages >= 1.0:
if params.pages >= 1.0:
import asyncio
num_full_pages = int(params.num_pages)
remaining_fraction = params.num_pages - num_full_pages
num_full_pages = int(params.pages)
remaining_fraction = params.pages - num_full_pages
completed_scrolls = 0
@@ -780,19 +757,19 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en
except Exception as e:
logger.warning(f'Fractional scroll failed: {e}')
if params.num_pages == 1.0:
if params.pages == 1.0:
long_term_memory = f'Scrolled {direction} {target} by one page ({viewport_height}px)'
else:
long_term_memory = f'Scrolled {direction} {target} by {completed_scrolls:.1f} pages (requested: {params.num_pages}, {viewport_height}px per page)'
long_term_memory = f'Scrolled {direction} {target} by {completed_scrolls:.1f} pages (requested: {params.pages}, {viewport_height}px per page)'
else:
# For fractional pages <1.0, do single scroll
pixels = int(params.num_pages * viewport_height)
pixels = int(params.pages * viewport_height)
event = browser_session.event_bus.dispatch(
ScrollEvent(direction='down' if params.down else 'up', amount=pixels, node=node)
)
await event
await event.event_result(raise_if_any=True, raise_if_none=False)
long_term_memory = f'Scrolled {direction} {target} by {params.num_pages} pages ({viewport_height}px per page)'
long_term_memory = f'Scrolled {direction} {target} by {params.pages} pages ({viewport_height}px per page)'
msg = f'🔍 {long_term_memory}'
logger.info(msg)
@@ -803,7 +780,7 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en
return ActionResult(error=error_msg)
@self.registry.action(
'Send strings of special keys to use e.g. Escape, Backspace, Insert, PageDown, Delete, Enter, or Shortcuts such as `Control+o`, `Control+Shift+T`',
'',
param_model=SendKeysAction,
)
async def send_keys(params: SendKeysAction, browser_session: BrowserSession):
@@ -821,10 +798,8 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en
error_msg = f'Failed to send keys: {str(e)}'
return ActionResult(error=error_msg)
@self.registry.action(
description='Scroll to a text in the current page. This helps you to be efficient. Prefer this tool over scrolling step by step if you know what to scroll to.',
)
async def scroll_to_text(text: str, browser_session: BrowserSession): # type: ignore
@self.registry.action('')
async def find_text(text: str, browser_session: BrowserSession): # type: ignore
# Dispatch scroll to text event
event = browser_session.event_bus.dispatch(ScrollToTextEvent(text=text))
@@ -844,13 +819,26 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en
long_term_memory=f"Tried scrolling to text '{text}' but it was not found",
)
@self.registry.action('')
async def screenshot():
"""Request that a screenshot be included in the next observation"""
memory = 'Requested screenshot for next observation'
msg = f'📸 {memory}'
logger.info(msg)
# Return flag in metadata to signal that screenshot should be included
return ActionResult(
extracted_content=memory,
metadata={'include_screenshot': True},
)
# Dropdown Actions
@self.registry.action(
'Get list of values for a dropdown input field. Only works on dropdown-style form elements (<select>, Semantic UI/aria-labeled select, etc.). Do not use this tool for none dropdown elements.',
'',
param_model=GetDropdownOptionsAction,
)
async def get_dropdown_options(params: GetDropdownOptionsAction, browser_session: BrowserSession):
async def dropdown_options(params: GetDropdownOptionsAction, browser_session: BrowserSession):
"""Get all options from a native dropdown or ARIA menu"""
# Look up the node from the selector map
node = await browser_session.get_element_by_index(params.index)
@@ -873,10 +861,10 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en
)
@self.registry.action(
'Select dropdown option by exact text from any dropdown type (native <select>, ARIA menus, or custom dropdowns). Searches target element and children to find selectable options.',
'',
param_model=SelectDropdownOptionAction,
)
async def select_dropdown_option(params: SelectDropdownOptionAction, browser_session: BrowserSession):
async def select_dropdown(params: SelectDropdownOptionAction, browser_session: BrowserSession):
"""Select dropdown option by the text of the option you want to select"""
# Look up the node from the selector map
node = await browser_session.get_element_by_index(params.index)
@@ -916,9 +904,7 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en
return ActionResult(error=error_msg)
# File System Actions
@self.registry.action(
'Write or append content to file_name in file system. Allowed extensions are .md, .txt, .json, .csv, .pdf. For .pdf files, write the content in markdown format and it will automatically be converted to a properly formatted PDF document.'
)
@self.registry.action('')
async def write_file(
file_name: str,
content: str,
@@ -938,15 +924,13 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en
logger.info(f'💾 {result}')
return ActionResult(extracted_content=result, long_term_memory=result)
@self.registry.action(
'Replace old_str with new_str in file_name. old_str must exactly match the string to replace in original text. Recommended tool to mark completed items in todo.md or change specific contents in a file.'
)
async def replace_file_str(file_name: str, old_str: str, new_str: str, file_system: FileSystem):
@self.registry.action('')
async def replace_file(file_name: str, old_str: str, new_str: str, file_system: FileSystem):
result = await file_system.replace_file_str(file_name, old_str, new_str)
logger.info(f'💾 {result}')
return ActionResult(extracted_content=result, long_term_memory=result)
@self.registry.action('Read file_name from file system')
@self.registry.action('')
async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem):
if available_file_paths and file_name in available_file_paths:
result = await file_system.read_file(file_name, external_file=True)
@@ -976,60 +960,9 @@ Note: For multiple pages (>=1.0), scrolls are performed one page at a time to en
)
@self.registry.action(
"""This JavaScript code gets executed with Runtime.evaluate and 'returnByValue': True, 'awaitPromise': True
SYNTAX RULES - FAILURE TO FOLLOW CAUSES "Uncaught at line 0" ERRORS:
- ALWAYS wrap your code in IIFE: (function(){ ... })() or (async function(){ ... })() for async code
- ALWAYS add try-catch blocks to prevent execution errors
- ALWAYS use proper semicolons and valid JavaScript syntax
- NEVER write multiline code without proper IIFE wrapping
- ALWAYS validate elements exist before accessing them
EXAMPLES:
Use this tool when other tools do not work on the first try as expected or when a more general tool is needed, e.g. for filling a form all at once, hovering, dragging, extracting only links, extracting content from the page, press and hold, hovering, clicking on coordinates, zooming, use this if the user provides custom selectors which you can otherwise not interact with ....
You can also use it to explore the website.
- Write code to solve problems you could not solve with other tools.
- Don't write comments in here, no human reads that.
- Write only valid js code.
- use this to e.g. extract + filter links, convert the page to json into the format you need etc...
- limit the output otherwise your context will explode
- think if you deal with special elements like iframes / shadow roots etc
- Adopt your strategy for React Native Web, React, Angular, Vue, MUI pages etc.
- e.g. with synthetic events, keyboard simulation, shadow DOM, etc.
PROPER SYNTAX EXAMPLES:
CORRECT: (function(){ try { const el = document.querySelector('#id'); return el ? el.value : 'not found'; } catch(e) { return 'Error: ' + e.message; } })()
CORRECT: (async function(){ try { await new Promise(r => setTimeout(r, 100)); return 'done'; } catch(e) { return 'Error: ' + e.message; } })()
WRONG: const el = document.querySelector('#id'); el ? el.value : '';
WRONG: document.querySelector('#id').value
WRONG: Multiline code without IIFE wrapping
SHADOW DOM ACCESS EXAMPLE:
(function(){
try {
const hosts = document.querySelectorAll('*');
for (let host of hosts) {
if (host.shadowRoot) {
const el = host.shadowRoot.querySelector('#target');
if (el) return el.textContent;
}
}
return 'Not found';
} catch(e) {
return 'Error: ' + e.message;
}
})()
## Return values:
- Async functions (with await, promises, timeouts) are automatically handled
- Returns strings, numbers, booleans, and serialized objects/arrays
- Use JSON.stringify() for complex objects: JSON.stringify(Array.from(document.querySelectorAll('a')).map(el => el.textContent.trim()))
""",
"""Execute JS. MUST: wrap in IIFE (function(){...})() or (async function(){...})(), add try-catch, validate elements exist. Check null before accessing properties. Use for: hover, drag, custom selectors, forms, extract/filter links, iframes, shadow DOM, React/Vue/Angular. Limit output. Examples: (function(){try{const el=document.querySelector('#id');return el?el.value:'not found'}catch(e){return 'Error: '+e.message}})() ✓ | document.querySelector('#id').value ✗. Shadow: iterate hosts, check shadowRoot. Return JSON.stringify() for objects. Do not use comments""",
)
async def execute_js(code: str, browser_session: BrowserSession):
async def evaluate(code: str, browser_session: BrowserSession):
# Execute JavaScript with proper error handling and promise support
cdp_session = await browser_session.get_or_create_cdp_session()
@@ -1192,7 +1125,7 @@ SHADOW DOM ACCESS EXAMPLE:
self.display_files_in_done_text = display_files_in_done_text
@self.registry.action(
'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached',
'Complete task with structured output.',
param_model=StructuredOutputAction[output_model],
)
async def done(params: StructuredOutputAction):
@@ -1214,7 +1147,7 @@ SHADOW DOM ACCESS EXAMPLE:
else:
@self.registry.action(
'Complete task - provide a summary of results for the user. Set success=True if task completed successfully, false otherwise. Text should be your response to the user summarizing results. Include files you would like to display to the user in files_to_display.',
'Complete task.',
param_model=DoneAction,
)
async def done(params: DoneAction, file_system: FileSystem):

View File

@@ -6,70 +6,70 @@ from pydantic import BaseModel, ConfigDict, Field
# Action Input Models
class SearchAction(BaseModel):
query: str
search_engine: str = 'duckduckgo' # Options: 'duckduckgo', 'google', 'bing'
engine: str = Field(
default='duckduckgo', description='duckduckgo, google, bing (use duckduckgo by default because less captchas)'
)
# Backward compatibility alias
SearchAction = SearchAction
class GoToUrlAction(BaseModel):
class NavigateAction(BaseModel):
url: str
new_tab: bool = False # True to open in new tab, False to navigate in current tab
new_tab: bool = Field(default=False)
# Backward compatibility alias
GoToUrlAction = NavigateAction
class ClickElementAction(BaseModel):
index: int = Field(ge=1, description='index of the element to click')
index: int = Field(ge=1, description='from browser_state')
ctrl: bool | None = Field(
default=None,
description='Set to True to open the navigation in a new background tab (Ctrl+Click behavior). Optional.',
description='True=New background tab (Ctrl+Click)',
)
# expect_download: bool = Field(default=False, description='set True if expecting a download, False otherwise') # moved to downloads_watchdog.py
# click_count: int = 1 # TODO
class InputTextAction(BaseModel):
index: int = Field(ge=0, description='index of the element to input text into, 0 is the page')
index: int = Field(ge=0, description='from browser_state')
text: str
clear_existing: bool = Field(default=True, description='set True to clear existing text, False to append to existing text')
clear: bool = Field(default=True, description='1=clear, 0=append')
class DoneAction(BaseModel):
text: str
success: bool
files_to_display: list[str] | None = []
text: str = Field(description='summary for user')
success: bool = Field(description='True if user_request completed successfully')
files_to_display: list[str] | None = Field(default=[])
T = TypeVar('T', bound=BaseModel)
class StructuredOutputAction(BaseModel, Generic[T]):
success: bool = True
success: bool = Field(default=True, description='1=done')
data: T
class SwitchTabAction(BaseModel):
tab_id: str = Field(
min_length=4,
max_length=4,
description="tab_id to switch to which is displayed as 'Tab <tab_id>' in the browser_state.",
) # last 4 chars of TargetID
tab_id: str = Field(min_length=4, max_length=4, description='4-char id')
class CloseTabAction(BaseModel):
tab_id: str = Field(
min_length=4, max_length=4, description="tab_id to close which is displayed as 'Tab <tab_id>' in the browser_state."
) # last 4 chars of TargetID
tab_id: str = Field(min_length=4, max_length=4, description='4-char id')
class ScrollAction(BaseModel):
down: bool # True to scroll down, False to scroll up
num_pages: float = 1.0 # Number of pages to scroll (0.5 = half page, 1.0 = one page, etc.)
frame_element_index: int | None = None # Optional element index to find scroll container for
down: bool = Field(description='1=down, 0=up')
pages: float = Field(default=1.0, description='0.5=half, 1=pg, 10=bottom')
index: int | None = Field(default=None, description='Use to scroll in specific container with that element')
class SendKeysAction(BaseModel):
keys: str
keys: str = Field(description='keys (Escape, Enter, PageDown) or shortcuts (Control+o)')
class UploadFileAction(BaseModel):
@@ -82,19 +82,13 @@ class ExtractPageContentAction(BaseModel):
class NoParamsAction(BaseModel):
"""
Accepts absolutely anything in the incoming data
and discards it, so the final parsed model is empty.
"""
model_config = ConfigDict(extra='ignore')
# No fields defined - all inputs are ignored automatically
class GetDropdownOptionsAction(BaseModel):
index: int = Field(ge=1, description='index of the dropdown element to get the option values for')
index: int
class SelectDropdownOptionAction(BaseModel):
index: int = Field(ge=1, description='index of the dropdown element to select an option for')
text: str = Field(description='the text or exact value of the option to select')
index: int
text: str = Field(description='exact text/value')

View File

@@ -64,7 +64,7 @@ Individual DOM element interactions.
### Interactions
- `click(button='left', click_count=1, modifiers=None)` - Click element
- `fill(text: str, clear_existing=True)` - Fill input
- `fill(text: str, clear=True)` - Fill input
- `hover()`, `focus()` - Mouse/focus actions
- `check()` - Toggle checkbox/radio
- `select_option(values: str | list[str])` - Select dropdown options

View File

@@ -13,7 +13,7 @@ mode: "wide"
- `output_model_schema`: Pydantic model class for structured output validation. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py)
### Vision & Processing
- `use_vision` (default: `True`): Enable/disable vision capabilities for processing screenshots
- `use_vision` (default: `"auto"`): Vision mode. `"auto"` makes the screenshot tool available but only includes a screenshot when the agent requests one; `True` always includes screenshots; `False` never includes screenshots and removes the screenshot tool.
- `vision_detail_level` (default: `'auto'`): Screenshot detail level - `'low'`, `'high'`, or `'auto'`
- `page_extraction_llm`: Separate LLM model for page content extraction. You can choose a small & fast model because it only needs to extract text from the page (default: same as `llm`)

View File

@@ -12,7 +12,7 @@ Prompting can drastically improve performance and solve existing limitations of th
```python
task = """
1. Go to https://quotes.toscrape.com/
2. Use extract_structured_data action with the query "first 3 quotes with their authors"
2. Use extract action with the query "first 3 quotes with their authors"
3. Save results to quotes.csv using write_file action
4. Do a google search for the first quote and find when it was written
"""
@@ -30,11 +30,11 @@ When you know exactly what the agent should do, reference actions by name:
```python
task = """
1. Use search action to find "Python tutorials"
2. Use click_element_by_index to open first result in a new tab
2. Use click to open first result in a new tab
3. Use scroll action to scroll down 2 pages
4. Use extract_structured_data to extract the names of the first 5 items
4. Use extract to extract the names of the first 5 items
5. Wait for 2 seconds if the page is not loaded, refresh it and wait 10 sec
6. Use send_keys action with "Tab Tab ArrowDown Enter"
6. Use send_keys action with "Tab Tab ArrowDown Enter"
"""
```

View File

@@ -252,7 +252,7 @@ llm = ChatOllama(model="llama3.1:8b")
## Qwen [example](https://github.com/browser-use/browser-use/blob/main/examples/models/qwen.py)
Currently, only `qwen-vl-max` is recommended for Browser Use. Other Qwen models, including `qwen-max`, have issues with the action schema format.
Smaller Qwen models may return incorrect action schema formats (e.g., `actions: [{"go_to_url": "google.com"}]` instead of `[{"go_to_url": {"url": "google.com"}}]`). If you want to use other models, add concrete examples of the correct action format to your prompt.
Smaller Qwen models may return incorrect action schema formats (e.g., `actions: [{"navigate": "google.com"}]` instead of `[{"navigate": {"url": "google.com"}}]`). If you want to use other models, add concrete examples of the correct action format to your prompt.
```python
from browser_use import Agent, ChatOpenAI

View File

@@ -9,37 +9,40 @@ mode: "wide"
### Navigation & Browser Control
- **`search`** - Search queries in Google
- **`go_to_url`** - Navigate to URLs
- **`search`** - Search queries (DuckDuckGo, Google, Bing)
- **`navigate`** - Navigate to URLs
- **`go_back`** - Go back in browser history
- **`wait`** - Wait for specified seconds
### Page Interaction
- **`click_element_by_index`** - Click elements by their index
- **`input_text`** - Input text into form fields
- **`upload_file_to_element`** - Upload files to file inputs
- **`click`** - Click elements by their index
- **`input`** - Input text into form fields
- **`upload_file`** - Upload files to file inputs
- **`scroll`** - Scroll the page up/down
- **`scroll_to_text`** - Scroll to specific text on page
- **`find_text`** - Scroll to specific text on page
- **`send_keys`** - Send special keys (Enter, Escape, etc.)
### JavaScript Execution
- **`execute_js`** - Execute custom JavaScript code on the page (for advanced interactions, shadow DOM, custom selectors, data extraction)
- **`evaluate`** - Execute custom JavaScript code on the page (for advanced interactions, shadow DOM, custom selectors, data extraction)
### Tab Management
- **`switch_tab`** - Switch between browser tabs
- **`close_tab`** - Close browser tabs
### Tab Management
- **`switch`** - Switch between browser tabs
- **`close`** - Close browser tabs
### Content Extraction
- **`extract_structured_data`** - Extract data from webpages using LLM
- **`extract`** - Extract data from webpages using LLM
### Visual Analysis
- **`screenshot`** - Request a screenshot in your next browser state for visual confirmation
### Form Controls
- **`get_dropdown_options`** - Get dropdown option values
- **`select_dropdown_option`** - Select dropdown options
- **`dropdown_options`** - Get dropdown option values
- **`select_dropdown`** - Select dropdown options
### File Operations
- **`write_file`** - Write content to files
- **`read_file`** - Read file contents
- **`replace_file_str`** - Replace text in files
- **`read_file`** - Read file contents
- **`replace_file`** - Replace text in files
### Task Completion
- **`done`** - Complete the task (always available)

View File

@@ -13,8 +13,8 @@ from browser_use import Agent, ChatOpenAI
llm = ChatOpenAI(model='gpt-4.1-mini')
initial_actions = [
{'go_to_url': {'url': 'https://www.google.com', 'new_tab': True}},
{'go_to_url': {'url': 'https://en.wikipedia.org/wiki/Randomness', 'new_tab': True}},
{'navigate': {'url': 'https://www.google.com', 'new_tab': True}},
{'navigate': {'url': 'https://en.wikipedia.org/wiki/Randomness', 'new_tab': True}},
]
agent = Agent(
task='What theories are displayed on the page?',

View File

@@ -24,7 +24,7 @@ Then, use append_file to add the first sentence of the article to "data.md"
Then, read the file to see its content and make sure it's correct.
Finally, share the file with me.
NOTE: DO NOT USE extract_structured_data action - everything is visible in browser state.
NOTE: DO NOT USE extract action - everything is visible in browser state.
""".strip('\n')
llm = ChatOpenAI(model='gpt-4.1-mini')

View File

@@ -13,7 +13,7 @@ base_url = 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1'
# so far we only had success with qwen-vl-max
# other models, even qwen-max, do not return the right output format. They confuse the action schema.
# E.g. they return actions: [{"go_to_url": "google.com"}] instead of [{"go_to_url": {"url": "google.com"}}]
# E.g. they return actions: [{"navigate": "google.com"}] instead of [{"navigate": {"url": "google.com"}}]
# If you want to use smaller models and you see they mix up the action schema, add concrete examples to your prompt of the right format.
llm = ChatOpenAI(model='qwen-vl-max', api_key=api_key, base_url=base_url)

View File

@@ -29,7 +29,7 @@ dependencies = [
"authlib>=1.6.0",
"google-genai>=1.29.0,<2.0.0",
"openai>=1.99.2,<2.0.0",
"anthropic>=0.58.2,<1.0.0",
"anthropic>=0.68.1,<1.0.0",
"groq>=0.30.0",
"ollama>=0.5.1",
"google-api-python-client>=2.174.0",

View File

@@ -130,14 +130,14 @@ class TestClickElementEvent:
)
# Navigate to the clickable elements test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/clickable', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/clickable', new_tab=False)}
from browser_use.agent.views import ActionModel
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Wait for the page to load
await asyncio.sleep(0.5) # Give page time to load
@@ -218,14 +218,14 @@ class TestClickElementEvent:
)
# Navigate to the new tab test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/newTab', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/newTab', new_tab=False)}
from browser_use.agent.views import ActionModel
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await asyncio.sleep(1) # Wait for page to load
# Count initial tabs
@@ -306,14 +306,14 @@ class TestClickElementEvent:
)
# Navigate to the comparison test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/comparison', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/comparison', new_tab=False)}
from browser_use.agent.views import ActionModel
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await asyncio.sleep(1)
tabs = await browser_session.get_tabs()
@@ -342,7 +342,7 @@ class TestClickElementEvent:
assert len(tabs) == initial_tab_count
# Navigate back to comparison page for second test
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await asyncio.sleep(1)
# Test new tab click (ctrl=True) - should open in new background tab
@@ -393,14 +393,14 @@ class TestClickElementEvent:
)
# Navigate to the page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/inline_offscreen', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/inline_offscreen', new_tab=False)}
from browser_use.agent.views import ActionModel
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await asyncio.sleep(0.5)
# Get the clickable elements
@@ -475,14 +475,14 @@ class TestClickElementEvent:
)
# Navigate to the page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/block_in_inline', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/block_in_inline', new_tab=False)}
from browser_use.agent.views import ActionModel
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await asyncio.sleep(0.5)
# Get the clickable elements
@@ -563,14 +563,14 @@ class TestClickElementEvent:
)
# Navigate to the page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/covered_element', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/covered_element', new_tab=False)}
from browser_use.agent.views import ActionModel
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await asyncio.sleep(0.5)
# Get the clickable elements
@@ -623,14 +623,14 @@ class TestClickElementEvent:
)
# Navigate to the page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/file_input', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/file_input', new_tab=False)}
from browser_use.agent.views import ActionModel
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await asyncio.sleep(0.5)
# Get the clickable elements
@@ -684,14 +684,14 @@ class TestClickElementEvent:
)
# Navigate to the page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/select_dropdown', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/select_dropdown', new_tab=False)}
from browser_use.agent.views import ActionModel
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await asyncio.sleep(0.5)
# Get the clickable elements
@@ -1081,14 +1081,14 @@ class TestClickElementEvent:
)
# Navigate to the file upload test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/fileupload', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/fileupload', new_tab=False)}
from browser_use.agent.views import ActionModel
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Wait for the page to load
await asyncio.sleep(0.5)
@@ -1227,13 +1227,13 @@ class TestClickElementEvent:
)
# Navigate to the test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/upload-test', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/upload-test', new_tab=False)}
from browser_use.agent.views import ActionModel
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await asyncio.sleep(0.5)
# Get browser state to populate selector map

View File

@@ -278,12 +278,12 @@ class TestGetDropdownOptionsEvent:
async def test_native_select_dropdown(self, tools, browser_session: BrowserSession, base_url):
"""Test get_dropdown_options with native HTML select element."""
# Navigate to the native dropdown test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Initialize the DOM state to populate the selector map
await browser_session.get_browser_state_summary()
@@ -319,7 +319,7 @@ class TestGetDropdownOptionsEvent:
assert option in result.extracted_content, f"Option '{option}' not found in result content"
# Verify instruction is included
assert 'Use the exact text string' in result.extracted_content and 'select_dropdown_option' in result.extracted_content
assert 'Use the exact text string' in result.extracted_content and 'select_dropdown' in result.extracted_content
# Also test direct event dispatch
node = await browser_session.get_element_by_index(dropdown_index)
@@ -336,12 +336,12 @@ class TestGetDropdownOptionsEvent:
async def test_aria_menu_dropdown(self, tools, browser_session: BrowserSession, base_url):
"""Test get_dropdown_options with ARIA role='menu' element."""
# Navigate to the ARIA menu test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Initialize the DOM state
await browser_session.get_browser_state_summary()
@@ -398,12 +398,12 @@ class TestGetDropdownOptionsEvent:
async def test_custom_dropdown(self, tools, browser_session: BrowserSession, base_url):
"""Test get_dropdown_options with custom dropdown implementation."""
# Navigate to the custom dropdown test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/custom-dropdown', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/custom-dropdown', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Initialize the DOM state
await browser_session.get_browser_state_summary()
@@ -456,12 +456,12 @@ class TestGetDropdownOptionsEvent:
async def test_element_not_found_error(self, tools, browser_session: BrowserSession, base_url):
"""Test get_dropdown_options with invalid element index."""
# Navigate to any test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0)
# Try to get dropdown options with invalid index
@@ -486,12 +486,12 @@ class TestSelectDropdownOptionEvent:
async def test_select_native_dropdown_option(self, tools, browser_session: BrowserSession, base_url):
"""Test select_dropdown_option with native HTML select element."""
# Navigate to the native dropdown test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0)
# Initialize the DOM state
@@ -534,12 +534,12 @@ class TestSelectDropdownOptionEvent:
async def test_select_aria_menu_option(self, tools, browser_session: BrowserSession, base_url):
"""Test select_dropdown_option with ARIA menu."""
# Navigate to the ARIA menu test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0)
# Initialize the DOM state
@@ -586,12 +586,12 @@ class TestSelectDropdownOptionEvent:
async def test_select_custom_dropdown_option(self, tools, browser_session: BrowserSession, base_url):
"""Test select_dropdown_option with custom dropdown."""
# Navigate to the custom dropdown test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/custom-dropdown', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/custom-dropdown', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0)
# Initialize the DOM state
@@ -634,12 +634,12 @@ class TestSelectDropdownOptionEvent:
async def test_select_invalid_option_error(self, tools, browser_session: BrowserSession, base_url):
"""Test select_dropdown_option with non-existent option text."""
# Navigate to the native dropdown test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0)
# Initialize the DOM state

View File

@@ -152,12 +152,12 @@ class TestARIAMenuDropdown:
async def test_get_dropdown_options_with_aria_menu(self, tools, browser_session: BrowserSession, base_url):
"""Test that get_dropdown_options can retrieve options from ARIA menus."""
# Navigate to the ARIA menu test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Wait for the page to load
from browser_use.browser.events import NavigationCompleteEvent
@@ -212,19 +212,19 @@ class TestARIAMenuDropdown:
for option in expected_options:
assert option in result.extracted_content, f"Option '{option}' not found in result content"
# Verify the instruction for using the text in select_dropdown_option is included
assert 'Use the exact text string in select_dropdown_option' in result.extracted_content
# Verify the instruction for using the text in select_dropdown is included
assert 'Use the exact text string in select_dropdown' in result.extracted_content
@pytest.mark.skip(reason='TODO: fix')
async def test_select_dropdown_option_with_aria_menu(self, tools, browser_session: BrowserSession, base_url):
"""Test that select_dropdown_option can select an option from ARIA menus."""
# Navigate to the ARIA menu test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Wait for the page to load
from browser_use.browser.events import NavigationCompleteEvent
@@ -289,12 +289,12 @@ class TestARIAMenuDropdown:
async def test_get_dropdown_options_with_nested_aria_menu(self, tools, browser_session: BrowserSession, base_url):
"""Test that get_dropdown_options can handle nested ARIA menus (like Sort submenu)."""
# Navigate to the ARIA menu test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Wait for the page to load
from browser_use.browser.events import NavigationCompleteEvent
@@ -347,4 +347,4 @@ class TestARIAMenuDropdown:
assert result.extracted_content is not None
# The action should return some menu options
assert 'Use the exact text string in select_dropdown_option' in result.extracted_content
assert 'Use the exact text string in select_dropdown' in result.extracted_content

View File

@@ -59,17 +59,17 @@ def tools():
class TestNavigateToUrlEvent:
"""Test NavigateToUrlEvent and go_to_url action functionality."""
"""Test NavigateToUrlEvent and navigate action functionality."""
async def test_go_to_url_action(self, tools, browser_session: BrowserSession, base_url):
async def test_navigate_action(self, tools, browser_session: BrowserSession, base_url):
"""Test that GoToUrlAction navigates to the specified URL and test both state summary methods."""
# Test successful navigation to a valid page
action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
action_data = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
action_model = GoToUrlActionModel(**action_data)
action_model = NavigateActionModel(**action_data)
result = await tools.act(action_model, browser_session)
# Verify the successful navigation result
@@ -77,16 +77,16 @@ class TestNavigateToUrlEvent:
assert result.extracted_content is not None
assert f'Navigated to {base_url}' in result.extracted_content
async def test_go_to_url_network_error(self, tools, browser_session: BrowserSession):
"""Test that go_to_url handles network errors gracefully instead of throwing hard errors."""
# Create action model for go_to_url with an invalid domain
action_data = {'go_to_url': GoToUrlAction(url='https://www.nonexistentdndbeyond.com/', new_tab=False)}
async def test_navigate_network_error(self, tools, browser_session: BrowserSession):
"""Test that navigate handles network errors gracefully instead of throwing hard errors."""
# Create action model for navigate with an invalid domain
action_data = {'navigate': GoToUrlAction(url='https://www.nonexistentdndbeyond.com/', new_tab=False)}
# Create the ActionModel instance
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
action_model = GoToUrlActionModel(**action_data)
action_model = NavigateActionModel(**action_data)
# Execute the action - should return soft error instead of throwing
result = await tools.act(action_model, browser_session)
@@ -123,19 +123,19 @@ class TestNavigateToUrlEvent:
current_url = await browser_session.get_current_page_url()
assert f'{base_url}/page1' in current_url
async def test_go_to_url_new_tab(self, tools, browser_session, base_url):
async def test_navigate_new_tab(self, tools, browser_session, base_url):
"""Test that GoToUrlAction with new_tab=True opens URL in a new tab."""
# Get initial tab count
initial_tabs = await browser_session.get_tabs()
initial_tab_count = len(initial_tabs)
# Navigate to URL in new tab
action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/page2', new_tab=True)}
action_data = {'navigate': GoToUrlAction(url=f'{base_url}/page2', new_tab=True)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
result = await tools.act(GoToUrlActionModel(**action_data), browser_session)
result = await tools.act(NavigateActionModel(**action_data), browser_session)
await asyncio.sleep(0.5)
# Verify result
@@ -155,16 +155,16 @@ class TestNavigateToUrlEvent:
async def test_navigate_javascript_url(self, tools, browser_session, base_url):
"""Test that javascript: URLs are handled appropriately."""
# Navigate to a normal page first
action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
action_data = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**action_data), browser_session)
await tools.act(NavigateActionModel(**action_data), browser_session)
# Try to navigate to javascript: URL (should be handled gracefully)
js_action = {'go_to_url': GoToUrlAction(url='javascript:alert("test")', new_tab=False)}
result = await tools.act(GoToUrlActionModel(**js_action), browser_session)
js_action = {'navigate': GoToUrlAction(url='javascript:alert("test")', new_tab=False)}
result = await tools.act(NavigateActionModel(**js_action), browser_session)
# Should either succeed or fail gracefully
assert isinstance(result, ActionResult)
@@ -174,12 +174,12 @@ class TestNavigateToUrlEvent:
# Create a simple data URL
data_url = 'data:text/html,<html><head><title>Data URL Test</title></head><body><h1>Data URL Content</h1></body></html>'
action_data = {'go_to_url': GoToUrlAction(url=data_url, new_tab=False)}
action_data = {'navigate': GoToUrlAction(url=data_url, new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
result = await tools.act(GoToUrlActionModel(**action_data), browser_session)
result = await tools.act(NavigateActionModel(**action_data), browser_session)
# Verify navigation
assert isinstance(result, ActionResult)
@@ -210,12 +210,12 @@ class TestNavigateToUrlEvent:
)
# Navigate to page with hash
action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/page-with-anchors#section1', new_tab=False)}
action_data = {'navigate': GoToUrlAction(url=f'{base_url}/page-with-anchors#section1', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
result = await tools.act(GoToUrlActionModel(**action_data), browser_session)
result = await tools.act(NavigateActionModel(**action_data), browser_session)
# Verify navigation
assert isinstance(result, ActionResult)
@@ -247,12 +247,12 @@ class TestNavigateToUrlEvent:
)
# Navigate with query parameters
action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/search?q=test+query&page=1', new_tab=False)}
action_data = {'navigate': GoToUrlAction(url=f'{base_url}/search?q=test+query&page=1', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
result = await tools.act(GoToUrlActionModel(**action_data), browser_session)
result = await tools.act(NavigateActionModel(**action_data), browser_session)
# Verify navigation
assert isinstance(result, ActionResult)
@@ -267,20 +267,20 @@ class TestNavigateToUrlEvent:
async def test_navigate_multiple_tabs(self, tools, browser_session, base_url):
"""Test navigating in multiple tabs sequentially."""
# Navigate to first page in current tab
action1 = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
action1 = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**action1), browser_session)
await tools.act(NavigateActionModel(**action1), browser_session)
# Open second page in new tab
action2 = {'go_to_url': GoToUrlAction(url=f'{base_url}/page2', new_tab=True)}
await tools.act(GoToUrlActionModel(**action2), browser_session)
action2 = {'navigate': GoToUrlAction(url=f'{base_url}/page2', new_tab=True)}
await tools.act(NavigateActionModel(**action2), browser_session)
# Open home page in yet another new tab
action3 = {'go_to_url': GoToUrlAction(url=base_url, new_tab=True)}
await tools.act(GoToUrlActionModel(**action3), browser_session)
action3 = {'navigate': GoToUrlAction(url=base_url, new_tab=True)}
await tools.act(NavigateActionModel(**action3), browser_session)
# Should have 3 tabs now
tabs = await browser_session.get_tabs()
@@ -296,13 +296,13 @@ class TestNavigateToUrlEvent:
# Using a private IP that's unlikely to respond
timeout_url = 'http://192.0.2.1:8080/timeout'
action_data = {'go_to_url': GoToUrlAction(url=timeout_url, new_tab=False)}
action_data = {'navigate': GoToUrlAction(url=timeout_url, new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
# This should complete without hanging indefinitely
result = await tools.act(GoToUrlActionModel(**action_data), browser_session)
result = await tools.act(NavigateActionModel(**action_data), browser_session)
# Should get a result (possibly with error)
assert isinstance(result, ActionResult)
@@ -317,12 +317,12 @@ class TestNavigateToUrlEvent:
)
# Navigate to redirect URL
action_data = {'go_to_url': GoToUrlAction(url=f'{base_url}/redirect', new_tab=False)}
action_data = {'navigate': GoToUrlAction(url=f'{base_url}/redirect', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
result = await tools.act(GoToUrlActionModel(**action_data), browser_session)
result = await tools.act(NavigateActionModel(**action_data), browser_session)
# Verify navigation succeeded
assert isinstance(result, ActionResult)

View File

@@ -83,15 +83,15 @@ class TestScrollActions:
"""Test basic scroll action functionality."""
# Navigate to scrollable page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/scrollable', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/scrollable', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Test 1: Basic page scroll down
scroll_action = {'scroll': ScrollAction(down=True, num_pages=1.0)}
scroll_action = {'scroll': ScrollAction(down=True, pages=1.0)}
class ScrollActionModel(ActionModel):
scroll: ScrollAction | None = None
@@ -106,7 +106,7 @@ class TestScrollActions:
assert 'the page' in result.extracted_content
# Test 2: Basic page scroll up
scroll_up_action = {'scroll': ScrollAction(down=False, num_pages=0.5)}
scroll_up_action = {'scroll': ScrollAction(down=False, pages=0.5)}
result = await tools.act(ScrollActionModel(**scroll_up_action), browser_session)
assert isinstance(result, ActionResult)
@@ -116,7 +116,7 @@ class TestScrollActions:
assert '0.5 pages' in result.extracted_content
# Test 3: Test with invalid element index (should error)
invalid_scroll_action = {'scroll': ScrollAction(down=True, num_pages=1.0, frame_element_index=999)}
invalid_scroll_action = {'scroll': ScrollAction(down=True, pages=1.0, index=999)}
result = await tools.act(ScrollActionModel(**invalid_scroll_action), browser_session)
# This should fail with error about element not found
@@ -125,15 +125,15 @@ class TestScrollActions:
assert 'Element index 999 not found' in result.error or 'Failed to execute scroll' in result.error
# Test 4: Model parameter validation
scroll_with_index = ScrollAction(down=True, num_pages=1.0, frame_element_index=5)
scroll_with_index = ScrollAction(down=True, pages=1.0, index=5)
assert scroll_with_index.down is True
assert scroll_with_index.num_pages == 1.0
assert scroll_with_index.frame_element_index == 5
assert scroll_with_index.pages == 1.0
assert scroll_with_index.index == 5
scroll_without_index = ScrollAction(down=False, num_pages=0.25)
scroll_without_index = ScrollAction(down=False, pages=0.25)
assert scroll_without_index.down is False
assert scroll_without_index.num_pages == 0.25
assert scroll_without_index.frame_element_index is None
assert scroll_without_index.pages == 0.25
assert scroll_without_index.index is None
async def test_scroll_with_cross_origin_disabled(self, browser_session, base_url):
"""Test that scroll works when cross_origin_iframes is disabled."""

View File

@@ -60,7 +60,7 @@ def interactive_llm(httpserver_url):
"next_goal": "Navigate to the URL",
"action": [
{{
"go_to_url": {{
"navigate": {{
"url": "{httpserver_url}",
"new_tab": false
}}

View File

@@ -161,11 +161,11 @@ class TestDownloadUploadFullCircle:
base_url = f'http://{download_upload_server.host}:{download_upload_server.port}'
# Step 1: Navigate to download page
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
result = await tools.act(
GoToUrlActionModel(go_to_url=GoToUrlAction(url=f'{base_url}/download-page', new_tab=False)), browser_session
NavigateActionModel(navigate=GoToUrlAction(url=f'{base_url}/download-page', new_tab=False)), browser_session
)
assert result.error is None, f'Navigation to download page failed: {result.error}'
@@ -228,7 +228,7 @@ class TestDownloadUploadFullCircle:
for i, tab in enumerate(tabs_before):
print(f' Tab {i}: {tab.url}')
result = await tools.act(
GoToUrlActionModel(go_to_url=GoToUrlAction(url=f'{base_url}/upload-page', new_tab=True)), browser_session
NavigateActionModel(navigate=GoToUrlAction(url=f'{base_url}/upload-page', new_tab=True)), browser_session
)
assert result.error is None, f'Navigation to upload page failed: {result.error}'
print(f'✅ Navigation result: {result.extracted_content}')

View File

@@ -96,14 +96,14 @@ class TestToolsIntegration:
"""Test that the registry contains the expected default actions."""
# Check that common actions are registered
common_actions = [
'go_to_url',
'navigate',
'search',
'click',
'input_text',
'input',
'scroll',
'go_back',
'switch_tab',
'close_tab',
'switch',
'close',
'wait',
]
@@ -125,12 +125,12 @@ class TestToolsIntegration:
return ActionResult(extracted_content=f'Custom action executed with: {params.text} on {current_url}')
# Navigate to a page first
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Create the custom action model
custom_action_data = {'custom_action': CustomParams(text='test_value')}
@@ -209,20 +209,20 @@ class TestToolsIntegration:
async def test_go_back_action(self, tools, browser_session, base_url):
"""Test that go_back action navigates to the previous page."""
# Navigate to first page
goto_action1 = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
goto_action1 = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action1), browser_session)
await tools.act(NavigateActionModel(**goto_action1), browser_session)
# Store the first page URL
first_url = await browser_session.get_current_page_url()
print(f'First page URL: {first_url}')
# Navigate to second page
goto_action2 = {'go_to_url': GoToUrlAction(url=f'{base_url}/page2', new_tab=False)}
await tools.act(GoToUrlActionModel(**goto_action2), browser_session)
goto_action2 = {'navigate': GoToUrlAction(url=f'{base_url}/page2', new_tab=False)}
await tools.act(NavigateActionModel(**goto_action2), browser_session)
# Verify we're on the second page
second_url = await browser_session.get_current_page_url()
@@ -259,12 +259,12 @@ class TestToolsIntegration:
# Navigate to each page in sequence
for url in urls:
action_data = {'go_to_url': GoToUrlAction(url=url, new_tab=False)}
action_data = {'navigate': GoToUrlAction(url=url, new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**action_data), browser_session)
await tools.act(NavigateActionModel(**action_data), browser_session)
# Verify current page
current_url = await browser_session.get_current_page_url()
@@ -293,7 +293,7 @@ class TestToolsIntegration:
assert 'scroll' not in excluded_tools.registry.registry.actions
# But other actions are still there
assert 'go_to_url' in excluded_tools.registry.registry.actions
assert 'navigate' in excluded_tools.registry.registry.actions
assert 'click' in excluded_tools.registry.registry.actions
async def test_search_action(self, tools, browser_session, base_url):
@@ -325,12 +325,12 @@ class TestToolsIntegration:
file_system = FileSystem(temp_dir)
# First navigate to a page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/page1', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
success_done_message = 'Successfully completed task'
@@ -392,12 +392,12 @@ class TestToolsIntegration:
)
# Navigate to the dropdown test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/dropdown1', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/dropdown1', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Wait for the page to load using CDP
cdp_session = browser_session.agent_focus
@@ -429,13 +429,13 @@ class TestToolsIntegration:
f'Could not find select element in selector map. Available elements: {[f"{idx}: {element.tag_name}" for idx, element in selector_map.items()]}'
)
# Create a model for the standard get_dropdown_options action
class GetDropdownOptionsModel(ActionModel):
get_dropdown_options: dict[str, int]
# Create a model for the standard dropdown_options action
class DropdownOptionsModel(ActionModel):
dropdown_options: dict[str, int]
# Execute the action with the dropdown index
result = await tools.act(
action=GetDropdownOptionsModel(get_dropdown_options={'index': dropdown_index}),
action=DropdownOptionsModel(dropdown_options={'index': dropdown_index}),
browser_session=browser_session,
)
@@ -454,11 +454,8 @@ class TestToolsIntegration:
for option in expected_options[1:]: # Skip the placeholder option
assert option['text'] in result.extracted_content, f"Option '{option['text']}' not found in result content"
# Verify the instruction for using the text in select_dropdown_option is included
assert (
'Use the exact text or value string' in result.extracted_content
and 'select_dropdown_option' in result.extracted_content
)
# Verify the instruction for using the text in select_dropdown is included
assert 'Use the exact text or value string' in result.extracted_content and 'select_dropdown' in result.extracted_content
# Verify the actual dropdown options in the DOM using CDP
dropdown_options_result = await cdp_session.cdp_client.send.Runtime.evaluate(
@@ -519,12 +516,12 @@ class TestToolsIntegration:
)
# Navigate to the dropdown test page
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/dropdown2', new_tab=False)}
goto_action = {'navigate': GoToUrlAction(url=f'{base_url}/dropdown2', new_tab=False)}
class GoToUrlActionModel(ActionModel):
go_to_url: GoToUrlAction | None = None
class NavigateActionModel(ActionModel):
navigate: GoToUrlAction | None = None
await tools.act(GoToUrlActionModel(**goto_action), browser_session)
await tools.act(NavigateActionModel(**goto_action), browser_session)
# Wait for the page to load using CDP
cdp_session = browser_session.agent_focus
@@ -556,13 +553,13 @@ class TestToolsIntegration:
f'Could not find select element in selector map. Available elements: {[f"{idx}: {element.tag_name}" for idx, element in selector_map.items()]}'
)
# Create a model for the standard select_dropdown_option action
class SelectDropdownOptionModel(ActionModel):
select_dropdown_option: dict
# Create a model for the standard select_dropdown action
class SelectDropdownModel(ActionModel):
select_dropdown: dict
# Execute the action with the dropdown index
result = await tools.act(
SelectDropdownOptionModel(select_dropdown_option={'index': dropdown_index, 'text': 'Second Option'}),
SelectDropdownModel(select_dropdown={'index': dropdown_index, 'text': 'Second Option'}),
browser_session,
)

View File

@@ -93,7 +93,7 @@ class TestUrlShorteningOutputProcessing:
'evaluation_previous_goal': 'Successfully processed the request',
'memory': f'Found useful info at {shortened_url}',
'next_goal': 'Complete the documentation review',
'action': [{'go_to_url': {'url': shortened_url, 'new_tab': False}}],
'action': [{'navigate': {'url': shortened_url, 'new_tab': False}}],
}
# Create properly typed AgentOutput with custom actions
@@ -109,7 +109,7 @@ class TestUrlShorteningOutputProcessing:
assert SUPER_LONG_URL in (agent_output.thinking or '')
assert SUPER_LONG_URL in (agent_output.memory or '')
action_data = agent_output.action[0].model_dump()
assert action_data['go_to_url']['url'] == SUPER_LONG_URL
assert action_data['navigate']['url'] == SUPER_LONG_URL
class TestUrlShorteningEndToEnd:
@@ -137,7 +137,7 @@ class TestUrlShorteningEndToEnd:
'evaluation_previous_goal': 'Starting documentation extraction',
'memory': f'Target URL: {shortened_url}',
'next_goal': 'Extract API documentation',
'action': [{'go_to_url': {'url': shortened_url, 'new_tab': True}}],
'action': [{'navigate': {'url': shortened_url, 'new_tab': True}}],
}
# Create AgentOutput with custom actions
@@ -153,8 +153,8 @@ class TestUrlShorteningEndToEnd:
assert SUPER_LONG_URL in (agent_output.thinking or '')
assert SUPER_LONG_URL in (agent_output.memory or '')
action_data = agent_output.action[0].model_dump()
assert action_data['go_to_url']['url'] == SUPER_LONG_URL
assert action_data['go_to_url']['new_tab'] is True
assert action_data['navigate']['url'] == SUPER_LONG_URL
assert action_data['navigate']['new_tab'] is True
# Verify original shortened content is no longer present
assert shortened_url not in (agent_output.thinking or '')

View File

@@ -38,7 +38,7 @@ async def debug_iframe_scrolling():
"next_goal": "Navigate to the iframe test page",
"action": [
{
"go_to_url": {
"navigate": {
"url": "https://browser-use.github.io/stress-tests/challenges/iframe-inception-level1.html",
"new_tab": false
}