Merge branch 'main' into bot-detection-batch

This commit is contained in:
neo773
2025-04-02 02:00:50 +05:30
committed by GitHub
25 changed files with 340 additions and 31 deletions

View File

@@ -23,3 +23,10 @@ repos:
- id: detect-private-key
- id: mixed-line-ending
- id: fix-byte-order-marker
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell # See pyproject.toml for args
additional_dependencies:
- tomli

View File

@@ -137,7 +137,7 @@ Tell your computer what to do, and it gets it done.
### Rerunning tasks
- [ ] LLM as fallback
- [ ] Make it easy to define workfow templates where LLM fills in the details
- [ ] Make it easy to define workflow templates where LLM fills in the details
- [ ] Return playwright script from the agent
### Datasets

View File

@@ -62,7 +62,7 @@ class MessageManager:
self._add_message_with_tokens(task_message, message_type='init')
if self.settings.sensitive_data:
info = f'Here are placeholders for sensitve data: {list(self.settings.sensitive_data.keys())}'
info = f'Here are placeholders for sensitive data: {list(self.settings.sensitive_data.keys())}'
info += 'To use them, write <secret>the placeholder name</secret>'
info_message = HumanMessage(content=info)
self._add_message_with_tokens(info_message, message_type='init')

View File

@@ -49,7 +49,7 @@ Common action sequences:
5. TASK COMPLETION:
- Use the done action as the last action as soon as the ultimate task is complete
- Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps.
- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completly finished set success to true. If not everything the user asked for is completed set success in done to false!
- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false!
- If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
- Don't hallucinate actions
- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.

View File

@@ -110,7 +110,7 @@ class Browser:
"""
Playwright browser on steroids.
This is persistant browser factory that can spawn multiple browser contexts.
This is a persistent browser factory that can spawn multiple browser contexts.
It is recommended to use only one instance of Browser per application (RAM usage will grow otherwise).
"""
@@ -180,7 +180,7 @@ class Browser:
# Check if browser is already running
response = requests.get('http://localhost:9222/json/version', timeout=2)
if response.status_code == 200:
logger.info('🔌 Re-using existing browser found running on http://localhost:9222')
logger.info('🔌 Reusing existing browser found running on http://localhost:9222')
browser_class = getattr(playwright, self.config.browser_class)
browser = await browser_class.connect_over_cdp(
endpoint_url='http://localhost:9222',

View File

@@ -71,7 +71,7 @@ CHROME_DETERMINISTIC_RENDERING_ARGS = [
# chrome://gpu
'--enable-webgl', # enable web-gl graphics support
'--font-render-hinting=none', # make rendering more deterministic by ignoring OS font hints, may also need css override, try: * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;}
'--force-color-profile=srgb', # make rendering more deterministic by using consitent color profile, if browser looks weird, try: generic-rgb
'--force-color-profile=srgb', # make rendering more deterministic by using consistent color profile, if browser looks weird, try: generic-rgb
'--disable-partial-raster', # make rendering more deterministic (TODO: verify if still needed)
'--disable-skia-runtime-opts', # make rendering more deterministic by avoiding Skia hot path runtime optimizations
'--disable-2d-canvas-clip-aa', # make rendering more deterministic by disabling antialiasing on 2d canvas clips

View File

@@ -106,6 +106,10 @@ class BrowserContextConfig(BaseModel):
include_dynamic_attributes: bool = True
Include dynamic attributes in the CSS selector. If you want to reuse the css_selectors, it might be better to set this to False.
http_credentials: None
Dictionary with HTTP basic authentication credentials for corporate intranets (only supports one set of credentials for all URLs at the moment), e.g.
{"username": "bill", "password": "pa55w0rd"}
is_mobile: None
Whether the meta viewport tag is taken into account and touch events are enabled.
@@ -153,6 +157,7 @@ class BrowserContextConfig(BaseModel):
viewport_expansion: int = 500
allowed_domains: list[str] | None = None
include_dynamic_attributes: bool = True
http_credentials: dict[str, str] | None = None
keep_alive: bool = Field(default=False, alias='_force_keep_context_alive') # used to be called _force_keep_context_alive
is_mobile: bool | None = None
@@ -417,6 +422,7 @@ class BrowserContext:
record_video_size=self.config.browser_window_size,
record_har_path=self.config.save_har_path,
locale=self.config.locale,
http_credentials=self.config.http_credentials,
is_mobile=self.config.is_mobile,
has_touch=self.config.has_touch,
geolocation=self.config.geolocation,
@@ -873,7 +879,7 @@ class BrowserContext:
# Get all cross-origin iframes within the page and open them in new tabs
# mark the titles of the new tabs so the LLM knows to check them for additional content
# unfortunately too buggy for now, too many sites use invisible cross-origin iframes for ads, tracking, youtube videos, social media, etc.
# and it distracts the bot by openeing a lot of new tabs
# and it distracts the bot by opening a lot of new tabs
# iframe_urls = await dom_service.get_cross_origin_iframes()
# for url in iframe_urls:
# if url in [tab.url for tab in tabs_info]:
@@ -1363,7 +1369,7 @@ class BrowserContext:
try:
tab_info = TabInfo(page_id=page_id, url=page.url, title=await asyncio.wait_for(page.title(), timeout=1))
except asyncio.TimeoutError:
# page.title() can hang forever on tabs that are crashed/dissapeared/about:blank
# page.title() can hang forever on tabs that are crashed/disappeared/about:blank
# we don't want to try automating those tabs because they will hang the whole script
logger.debug('⚠ Failed to get tab info for tab #%s: %s (ignoring)', page_id, page.url)
tab_info = TabInfo(page_id=page_id, url='about:blank', title='ignore this tab and do not use it')

View File

@@ -4,10 +4,11 @@ import enum
import json
import logging
import re
from typing import Dict, Generic, Optional, Type, TypeVar
from typing import Dict, Generic, Optional, Tuple, Type, TypeVar, cast
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import PromptTemplate
from playwright.async_api import ElementHandle, Page
# from lmnr.sdk.laminar import Laminar
from pydantic import BaseModel
@@ -22,10 +23,12 @@ from browser_use.controller.views import (
ClickElementByXpathAction,
CloseTabAction,
DoneAction,
DragDropAction,
GoToUrlAction,
InputTextAction,
NoParamsAction,
OpenTabAction,
Position,
ScrollAction,
SearchGoogleAction,
SendKeysAction,
@@ -57,7 +60,7 @@ class Controller(Generic[Context]):
data: output_model
@self.registry.action(
'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached',
'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached',
param_model=ExtendedOutputModel,
)
async def done(params: ExtendedOutputModel):
@@ -73,7 +76,7 @@ class Controller(Generic[Context]):
else:
@self.registry.action(
'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached',
'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached',
param_model=DoneAction,
)
async def done(params: DoneAction):
@@ -299,7 +302,7 @@ class Controller(Generic[Context]):
# Content Actions
@self.registry.action(
'Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links',
'Extract page content to retrieve specific information from the page, e.g. all company names, a specific description, all information about, links with companies in structured format or simply links',
)
async def extract_content(
goal: str, should_strip_link_urls: bool, browser: BrowserContext, page_extraction_llm: BaseChatModel
@@ -627,6 +630,220 @@ class Controller(Generic[Context]):
logger.error(msg)
return ActionResult(error=msg, include_in_memory=True)
@self.registry.action(
    'Drag and drop elements or between coordinates on the page - useful for canvas drawing, sortable lists, sliders, file uploads, and UI rearrangement',
    param_model=DragDropAction,
)
async def drag_drop(params: DragDropAction, browser: BrowserContext) -> ActionResult:
    """
    Perform a drag and drop between two elements or two points on the current page.

    Two modes are supported, checked in this order:

    1. Element mode: both ``element_source`` and ``element_target`` selectors are set.
       Each end of the drag is the centre of the element's bounding box or, when the
       matching ``element_*_offset`` is given, that offset measured from the top-left
       corner of the bounding box (as the field descriptions promise).
    2. Coordinate mode: all four ``coord_*`` values are set and used as given.

    ``steps`` and ``delay_ms`` fall back to 10 and 5 only when they are ``None``; an
    explicit ``delay_ms=0`` is honoured. ``steps`` is clamped to at least 1 and
    ``delay_ms`` to at least 0.

    Returns:
        ActionResult with ``extracted_content`` describing the drag on success, or with
        ``error`` set when elements/coordinates cannot be resolved or the drag fails.
        Exceptions are caught and reported through ``error`` rather than propagated.
    """

    async def get_drag_elements(
        page: Page,
        source_selector: str,
        target_selector: str,
    ) -> Tuple[Optional[ElementHandle], Optional[ElementHandle]]:
        """Resolve both selectors to element handles; a missing element yields None."""
        source_element = None
        target_element = None

        try:
            # page.locator() auto-detects CSS and XPath
            source_locator = page.locator(source_selector)
            target_locator = page.locator(target_selector)

            # Check if elements exist before asking for a handle
            source_count = await source_locator.count()
            target_count = await target_locator.count()

            if source_count > 0:
                source_element = await source_locator.first.element_handle()
                logger.debug(f'Found source element with selector: {source_selector}')
            else:
                logger.warning(f'Source element not found: {source_selector}')

            if target_count > 0:
                target_element = await target_locator.first.element_handle()
                logger.debug(f'Found target element with selector: {target_selector}')
            else:
                logger.warning(f'Target element not found: {target_selector}')
        except Exception as e:
            # Best effort: report whatever was resolved before the failure.
            logger.error(f'Error finding elements: {str(e)}')

        return source_element, target_element

    async def get_point_in_element(
        element: ElementHandle,
        offset: Optional[Position],
    ) -> Optional[Tuple[int, int]]:
        """Return a point inside the element: box origin + offset, or the box centre."""
        box = await element.bounding_box()
        if not box:
            # No bounding box available for this element, so there is nothing to aim at.
            return None
        if offset is not None:
            # Offsets are documented as pixels from the element's top-left corner.
            return int(box['x'] + offset.x), int(box['y'] + offset.y)
        return int(box['x'] + box['width'] / 2), int(box['y'] + box['height'] / 2)

    async def get_element_coordinates(
        source_element: ElementHandle,
        target_element: ElementHandle,
        source_position: Optional[Position],
        target_position: Optional[Position],
    ) -> Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]:
        """Compute the start and end points of the drag; an unresolved end is None."""
        source_coords = None
        target_coords = None

        try:
            source_coords = await get_point_in_element(source_element, source_position)
            target_coords = await get_point_in_element(target_element, target_position)
        except Exception as e:
            logger.error(f'Error getting element coordinates: {str(e)}')

        return source_coords, target_coords

    async def execute_drag_operation(
        page: Page,
        source_x: int,
        source_y: int,
        target_x: int,
        target_y: int,
        steps: int,
        delay_ms: int,
    ) -> Tuple[bool, str]:
        """Press at the source, move in `steps` increments to the target, release."""
        button_held = False
        try:
            # Try to move to source position
            try:
                await page.mouse.move(source_x, source_y)
                logger.debug(f'Moved to source position ({source_x}, {source_y})')
            except Exception as e:
                logger.error(f'Failed to move to source position: {str(e)}')
                return False, f'Failed to move to source position: {str(e)}'

            # Press mouse button down
            await page.mouse.down()
            button_held = True

            # Move to target position with intermediate steps
            for i in range(1, steps + 1):
                ratio = i / steps
                intermediate_x = int(source_x + (target_x - source_x) * ratio)
                intermediate_y = int(source_y + (target_y - source_y) * ratio)

                await page.mouse.move(intermediate_x, intermediate_y)

                if delay_ms > 0:
                    await asyncio.sleep(delay_ms / 1000)

            # Move to final target position
            await page.mouse.move(target_x, target_y)

            # Move again to ensure dragover events are properly triggered
            await page.mouse.move(target_x, target_y)

            # Release mouse button
            await page.mouse.up()
            button_held = False

            return True, 'Drag operation completed successfully'
        except Exception as e:
            if button_held:
                # Do not leave the button pressed for whatever action runs next.
                try:
                    await page.mouse.up()
                except Exception as release_error:
                    logger.debug(f'Failed to release mouse button after drag error: {str(release_error)}')
            return False, f'Error during drag operation: {str(e)}'

    page = await browser.get_current_page()

    try:
        # Normalize parameters. Only None falls back to the default, so that an
        # explicit 0 ("0 for fastest") is not silently replaced.
        steps = max(1, params.steps if params.steps is not None else 10)
        delay_ms = max(0, params.delay_ms if params.delay_ms is not None else 5)

        # Case 1: Element selectors provided
        if params.element_source and params.element_target:
            logger.debug('Using element-based approach with selectors')

            source_element, target_element = await get_drag_elements(
                page,
                params.element_source,
                params.element_target,
            )

            if not source_element or not target_element:
                error_msg = f'Failed to find {"source" if not source_element else "target"} element'
                return ActionResult(error=error_msg, include_in_memory=True)

            source_coords, target_coords = await get_element_coordinates(
                source_element, target_element, params.element_source_offset, params.element_target_offset
            )

            if not source_coords or not target_coords:
                error_msg = f'Failed to determine {"source" if not source_coords else "target"} coordinates'
                return ActionResult(error=error_msg, include_in_memory=True)

            source_x, source_y = source_coords
            target_x, target_y = target_coords

        # Case 2: Coordinates provided directly
        elif all(
            coord is not None
            for coord in [params.coord_source_x, params.coord_source_y, params.coord_target_x, params.coord_target_y]
        ):
            logger.debug('Using coordinate-based approach')
            # The all() check above guarantees these are ints; cast only informs the type checker.
            source_x = cast(int, params.coord_source_x)
            source_y = cast(int, params.coord_source_y)
            target_x = cast(int, params.coord_target_x)
            target_y = cast(int, params.coord_target_y)
        else:
            error_msg = 'Must provide either source/target selectors or source/target coordinates'
            return ActionResult(error=error_msg, include_in_memory=True)

        # Perform the drag operation
        success, message = await execute_drag_operation(
            page,
            source_x,
            source_y,
            target_x,
            target_y,
            steps,
            delay_ms,
        )

        if not success:
            logger.error(f'Drag operation failed: {message}')
            return ActionResult(error=message, include_in_memory=True)

        # Create descriptive message
        if params.element_source and params.element_target:
            msg = f"🖱️ Dragged element '{params.element_source}' to '{params.element_target}'"
        else:
            msg = f'🖱️ Dragged from ({source_x}, {source_y}) to ({target_x}, {target_y})'

        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    except Exception as e:
        error_msg = f'Failed to perform drag and drop: {str(e)}'
        logger.error(error_msg)
        return ActionResult(error=error_msg, include_in_memory=True)
# Register ---------------------------------------------------------------
def action(self, description: str, **kwargs):

View File

@@ -96,3 +96,30 @@ class NoParamsAction(BaseModel):
def ignore_all_inputs(cls, values):
# No matter what the user sends, discard it and return empty.
return {}
class Position(BaseModel):
    # A 2D point expressed as two integers. DragDropAction uses it for its
    # element_source_offset / element_target_offset fields, which are described
    # there as pixel positions.
    x: int  # horizontal component
    y: int  # vertical component
class DragDropAction(BaseModel):
    # Parameters for the drag-and-drop action. Every field is optional; the
    # descriptions below are the text attached to each field.

    # --- Element-based approach -------------------------------------------
    element_source: Optional[str] = Field(
        default=None,
        description='CSS selector or XPath of the element to drag from',
    )
    element_target: Optional[str] = Field(
        default=None,
        description='CSS selector or XPath of the element to drop onto',
    )
    element_source_offset: Optional[Position] = Field(
        default=None,
        description='Precise position within the source element to start drag (in pixels from top-left corner)',
    )
    element_target_offset: Optional[Position] = Field(
        default=None,
        description='Precise position within the target element to drop (in pixels from top-left corner)',
    )

    # --- Coordinate-based approach (used if selectors not provided) --------
    coord_source_x: Optional[int] = Field(
        default=None,
        description='Absolute X coordinate on page to start drag from (in pixels)',
    )
    coord_source_y: Optional[int] = Field(
        default=None,
        description='Absolute Y coordinate on page to start drag from (in pixels)',
    )
    coord_target_x: Optional[int] = Field(
        default=None,
        description='Absolute X coordinate on page to drop at (in pixels)',
    )
    coord_target_y: Optional[int] = Field(
        default=None,
        description='Absolute Y coordinate on page to drop at (in pixels)',
    )

    # --- Common options ------------------------------------------------------
    steps: Optional[int] = Field(
        default=10,
        description='Number of intermediate points for smoother movement (5-20 recommended)',
    )
    delay_ms: Optional[int] = Field(
        default=5,
        description='Delay in milliseconds between steps (0 for fastest, 10-20 for more natural)',
    )

View File

@@ -93,7 +93,7 @@ class SignalHandler:
except Exception:
# there are situations where signal handlers are not supported, e.g.
# - when running in a thread other than the main thread
# - some opearating systems
# - some operating systems
# - inside jupyter notebooks
pass

View File

@@ -47,7 +47,8 @@ agent = Agent(
- Disable to reduce costs or use models without vision support
- For GPT-4o, image processing costs approximately 800-1000 tokens (~$0.002 USD) per image (but this depends on the defined screen size)
- `save_conversation_path`: Path to save the complete conversation history. Useful for debugging.
- `system_prompt_class`: Custom system prompt class. See <a href="/customize/system-prompt">System Prompt</a> for customization options.
- `override_system_message`: Completely replace the default system prompt with a custom one.
- `extend_system_message`: Add additional instructions to the default system prompt.
<Note>
Vision capabilities are recommended for better web interaction understanding,
@@ -183,7 +184,7 @@ agent = Agent(
You can configure the agent and provide a separate message to help the LLM understand the task better.
```python
```python
from langchain_openai import ChatOpenAI
agent = Agent(

View File

@@ -161,8 +161,8 @@ async def run_search():
Highlight interactive elements on the screen with colorful bounding boxes.
- **viewport_expansion** (default: `500`)
Viewport expansion in pixels. With this you can controll how much of the page is included in the context of the LLM. If set to -1, all elements from the entire page will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included.
Default is 500 pixels, that means that we inlcude a little bit more than the visible viewport inside the context.
Viewport expansion in pixels. With this you can control how much of the page is included in the context of the LLM. If set to -1, all elements from the entire page will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included.
Default is 500 pixels, that means that we include a little bit more than the visible viewport inside the context.
### Restrict URLs

View File

@@ -55,7 +55,7 @@ from browser_use import Browser, Controller, ActionResult
controller = Controller()
@controller.action('Open website')
async def open_website(url: str, browser: Browser):
page = browser.get_current_page()
page = await browser.get_current_page()
await page.goto(url)
return ActionResult(extracted_content='Website opened')
```

View File

@@ -43,7 +43,7 @@ if __name__ == '__main__':
In this example:
1. The model only sees `x_name` and `x_password` as placeholders.
2. When the model wants to use your password it outputs x_password - and we replace it with the actual value.
3. When your password is visable on the current page, we replace it in the LLM input - so that the model never has it in its state.
3. When your password is visible on the current page, we replace it in the LLM input - so that the model never has it in its state.
Warning: Vision models still see the image of the page - where the sensitive data might be visible.

View File

@@ -64,7 +64,7 @@ async def main():
llm = ChatOpenAI(model_name='gpt-4o')
# Create the agent
agent = Agent( # disco mode will not be triggered on apple.com because the LLM wont be able to see that action available, it should work on Google.com though.
agent = Agent( # disco mode will not be triggered on apple.com because the LLM won't be able to see that action available, it should work on Google.com though.
task="""
Go to apple.com and trigger disco mode (if dont know how to do that, then just move on).
Then go to google.com and trigger disco mode.

View File

@@ -21,7 +21,7 @@ async def done(text: str):
# To send emails use
# STEP 1: go to https://support.google.com/accounts/answer/185833
# STEP 2: Create an app password (you cant use here your normal gmail password)
# STEP 2: Create an app password (you can't use your normal Gmail password here)
# STEP 3: Use the app password in the code below for the password
yag = yagmail.SMTP('your_email@gmail.com', 'your_app_password')
yag.send(

View File

@@ -0,0 +1,46 @@
import asyncio
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import SecretStr
from browser_use import Agent
# Pull settings from a local .env file (if present) before reading the key.
load_dotenv()

# Fail fast with a clear message instead of an opaque auth error later on.
api_key = os.getenv('GEMINI_API_KEY')
if not api_key:
    raise ValueError('GEMINI_API_KEY is not set')

llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))

# Element-to-element drag on a sortable list; this is the task run_search() uses.
task_1 = """
Navigate to: https://sortablejs.github.io/Sortable/.
Then scroll down to the first example with title "Simple list example".
Drag the element with name "item 1" to below the element with name "item 3".
"""

# Coordinate-based drag on a canvas. Not used by default: pass task_2 to the
# Agent instead of task_1 to try it.
task_2 = """
Navigate to: https://excalidraw.com/.
Click on the pencil icon (with index 40).
Then draw a triangle in the canvas.
Draw the triangle starting from coordinate (400,400).
You can use the drag and drop action to draw the triangle.
"""
async def run_search():
    """Build an agent for task_1 and let it run for at most 25 steps."""
    agent_options = {
        'task': task_1,
        'llm': llm,
        'max_actions_per_step': 1,
        'use_vision': True,
    }
    demo_agent = Agent(**agent_options)
    await demo_agent.run(max_steps=25)


if __name__ == '__main__':
    asyncio.run(run_search())

View File

@@ -13,7 +13,7 @@ from browser_use.browser.browser import Browser, BrowserConfig
llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
task = (
'go to google.com and search for openai.com and click on the first link then extract content and scroll down - whats there?'
"go to google.com and search for openai.com and click on the first link then extract content and scroll down - what's there?"
)
allowed_domains = ['google.com']

View File

@@ -1,5 +1,5 @@
"""
Demostrate output validator.
Demonstrate output validator.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""

View File

@@ -29,7 +29,7 @@ Five Steps to create and invite a Discord bot:
* Click “Authorize”.
--> Note: The person adding the bot needs "Manage Server" permissions.
6. Run the code below to start the bot with your bot token.
7. Write e.g. "/bu whats the weather in Tokyo?" to start a browser-use task and get a response inside the Discord channel.
7. Write e.g. "/bu what's the weather in Tokyo?" to start a browser-use task and get a response inside the Discord channel.
"""
import os

View File

@@ -38,14 +38,14 @@ Steps to create and configure a Slack bot:
6. Invite the bot to a channel:
* Use the `/invite @your-bot-name` command in the Slack channel where you want the bot to be active.
7. Run the code in `examples/slack_example.py` to start the bot with your bot token and signing secret.
8. Write e.g. "$bu whats the weather in Tokyo?" to start a browser-use task and get a response inside the Slack channel.
8. Write e.g. "$bu what's the weather in Tokyo?" to start a browser-use task and get a response inside the Slack channel.
## Installing and Starting ngrok
To expose your local server to the internet, you can use ngrok. Follow these steps to install and start ngrok:
1. Download ngrok from the official website: https://ngrok.com/download
2. Create a free account and follow the offical steps to install ngrok.
2. Create a free account and follow the official steps to install ngrok.
3. Start ngrok by running the following command in your terminal:
```sh
ngrok http 3000

View File

@@ -102,7 +102,7 @@ async def input_selected_cell_text(browser: BrowserContext, text: str):
page = await browser.get_current_page()
await page.keyboard.type(text, delay=0.1)
await page.keyboard.press('Enter') # make sure to commit the input so it doesnt get overwritten by the next action
await page.keyboard.press('Enter') # make sure to commit the input so it doesn't get overwritten by the next action
await page.keyboard.press('ArrowUp')
return ActionResult(extracted_content=f'Inputted text {text}', include_in_memory=False)
@@ -128,7 +128,7 @@ async def update_range_contents(browser: BrowserContext, range: str, new_content
# - https://github.com/philc/sheetkeys/blob/master/content_scripts/commands.js
# - https://support.google.com/docs/answer/181110?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cmac-shortcuts
# Tip: LLM is bad at spacial reasoning, don't make it navigate with arrow keys relative to current cell
# Tip: LLM is bad at spatial reasoning, don't make it navigate with arrow keys relative to current cell
# if given arrow keys, it will try to jump from G1 to A2 by pressing Down, without realizing needs to go Down+LeftLeftLeftLeft

View File

@@ -77,7 +77,7 @@ At this stage, check the basket on the top right (indicates the price) and check
- If the total order **is below CHF 99**, add **a liquid soap refill** to reach the minimum. If it's still below the minimum, you can buy some bread or dark chocolate.
- At this step, check if you have bought MORE items than needed. If the price is more than CHF 200, you MUST remove items.
- If an item is not available, choose an alternative.
- if an age verification is needed, remove alchoholic products, we haven't verified yet.
- if an age verification is needed, remove alcoholic products, we haven't verified yet.
---

View File

@@ -49,6 +49,10 @@ dependencies = [
urls = { "Repository" = "https://github.com/browser-use/browser-use" }
[tool.codespell]
ignore-words-list = "bu"
skip = "*.json"
[tool.ruff]
line-length = 130
fix = true
@@ -89,4 +93,5 @@ dev-dependencies = [
"langchain-fireworks>=0.2.6",
"ipdb>=0.13.13",
"pre-commit>=4.2.0",
"codespell>=2.4.1",
]

View File

@@ -23,7 +23,7 @@ controller = Controller()
# use this test to ask the model questions about the page like
# which color do you see for bbox labels, list all with their label
# whats the smallest bboxes with labels and
# what's the smallest bboxes with labels and
@controller.registry.action(description='explain what you see on the screen and ask user for input')
@@ -40,7 +40,7 @@ async def done(text: str) -> str:
agent = Agent(
task='call explain_screen all the time the user asks you questions e.g. about the page like bbox which you see are labels - your task is to expalin it and get the next question',
task='call explain_screen all the time the user asks you questions e.g. about the page like bbox which you see are labels - your task is to explain it and get the next question',
llm=llm,
controller=controller,
browser=Browser(config=BrowserConfig(disable_security=True, headless=False)),