mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
Merge branch 'main' into bot-detection-batch
This commit is contained in:
@@ -23,3 +23,10 @@ repos:
|
||||
- id: detect-private-key
|
||||
- id: mixed-line-ending
|
||||
- id: fix-byte-order-marker
|
||||
|
||||
- repo: https://github.com/codespell-project/codespell
|
||||
rev: v2.4.1
|
||||
hooks:
|
||||
- id: codespell # See pyproject.toml for args
|
||||
additional_dependencies:
|
||||
- tomli
|
||||
|
||||
@@ -137,7 +137,7 @@ Tell your computer what to do, and it gets it done.
|
||||
### Rerunning tasks
|
||||
|
||||
- [ ] LLM as fallback
|
||||
- [ ] Make it easy to define workfow templates where LLM fills in the details
|
||||
- [ ] Make it easy to define workflow templates where LLM fills in the details
|
||||
- [ ] Return playwright script from the agent
|
||||
|
||||
### Datasets
|
||||
|
||||
@@ -62,7 +62,7 @@ class MessageManager:
|
||||
self._add_message_with_tokens(task_message, message_type='init')
|
||||
|
||||
if self.settings.sensitive_data:
|
||||
info = f'Here are placeholders for sensitve data: {list(self.settings.sensitive_data.keys())}'
|
||||
info = f'Here are placeholders for sensitive data: {list(self.settings.sensitive_data.keys())}'
|
||||
info += 'To use them, write <secret>the placeholder name</secret>'
|
||||
info_message = HumanMessage(content=info)
|
||||
self._add_message_with_tokens(info_message, message_type='init')
|
||||
|
||||
@@ -49,7 +49,7 @@ Common action sequences:
|
||||
5. TASK COMPLETION:
|
||||
- Use the done action as the last action as soon as the ultimate task is complete
|
||||
- Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps.
|
||||
- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completly finished set success to true. If not everything the user asked for is completed set success in done to false!
|
||||
- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false!
|
||||
- If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
|
||||
- Don't hallucinate actions
|
||||
- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.
|
||||
|
||||
@@ -110,7 +110,7 @@ class Browser:
|
||||
"""
|
||||
Playwright browser on steroids.
|
||||
|
||||
This is persistant browser factory that can spawn multiple browser contexts.
|
||||
This is persistent browser factory that can spawn multiple browser contexts.
|
||||
It is recommended to use only one instance of Browser per your application (RAM usage will grow otherwise).
|
||||
"""
|
||||
|
||||
@@ -180,7 +180,7 @@ class Browser:
|
||||
# Check if browser is already running
|
||||
response = requests.get('http://localhost:9222/json/version', timeout=2)
|
||||
if response.status_code == 200:
|
||||
logger.info('🔌 Re-using existing browser found running on http://localhost:9222')
|
||||
logger.info('🔌 Reusing existing browser found running on http://localhost:9222')
|
||||
browser_class = getattr(playwright, self.config.browser_class)
|
||||
browser = await browser_class.connect_over_cdp(
|
||||
endpoint_url='http://localhost:9222',
|
||||
|
||||
@@ -71,7 +71,7 @@ CHROME_DETERMINISTIC_RENDERING_ARGS = [
|
||||
# chrome://gpu
|
||||
'--enable-webgl', # enable web-gl graphics support
|
||||
'--font-render-hinting=none', # make rendering more deterministic by ignoring OS font hints, may also need css override, try: * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;}
|
||||
'--force-color-profile=srgb', # make rendering more deterministic by using consitent color profile, if browser looks weird, try: generic-rgb
|
||||
'--force-color-profile=srgb', # make rendering more deterministic by using consistent color profile, if browser looks weird, try: generic-rgb
|
||||
'--disable-partial-raster', # make rendering more deterministic (TODO: verify if still needed)
|
||||
'--disable-skia-runtime-opts', # make rendering more deterministic by avoiding Skia hot path runtime optimizations
|
||||
'--disable-2d-canvas-clip-aa', # make rendering more deterministic by disabling antialiasing on 2d canvas clips
|
||||
|
||||
@@ -106,6 +106,10 @@ class BrowserContextConfig(BaseModel):
|
||||
include_dynamic_attributes: bool = True
|
||||
Include dynamic attributes in the CSS selector. If you want to reuse the css_selectors, it might be better to set this to False.
|
||||
|
||||
http_credentials: None
|
||||
Dictionary with HTTP basic authentication credentials for corporate intranets (only supports one set of credentials for all URLs at the moment), e.g.
|
||||
{"username": "bill", "password": "pa55w0rd"}
|
||||
|
||||
is_mobile: None
|
||||
Whether the meta viewport tag is taken into account and touch events are enabled.
|
||||
|
||||
@@ -153,6 +157,7 @@ class BrowserContextConfig(BaseModel):
|
||||
viewport_expansion: int = 500
|
||||
allowed_domains: list[str] | None = None
|
||||
include_dynamic_attributes: bool = True
|
||||
http_credentials: dict[str, str] | None = None
|
||||
|
||||
keep_alive: bool = Field(default=False, alias='_force_keep_context_alive') # used to be called _force_keep_context_alive
|
||||
is_mobile: bool | None = None
|
||||
@@ -417,6 +422,7 @@ class BrowserContext:
|
||||
record_video_size=self.config.browser_window_size,
|
||||
record_har_path=self.config.save_har_path,
|
||||
locale=self.config.locale,
|
||||
http_credentials=self.config.http_credentials,
|
||||
is_mobile=self.config.is_mobile,
|
||||
has_touch=self.config.has_touch,
|
||||
geolocation=self.config.geolocation,
|
||||
@@ -873,7 +879,7 @@ class BrowserContext:
|
||||
# Get all cross-origin iframes within the page and open them in new tabs
|
||||
# mark the titles of the new tabs so the LLM knows to check them for additional content
|
||||
# unfortunately too buggy for now, too many sites use invisible cross-origin iframes for ads, tracking, youtube videos, social media, etc.
|
||||
# and it distracts the bot by openeing a lot of new tabs
|
||||
# and it distracts the bot by opening a lot of new tabs
|
||||
# iframe_urls = await dom_service.get_cross_origin_iframes()
|
||||
# for url in iframe_urls:
|
||||
# if url in [tab.url for tab in tabs_info]:
|
||||
@@ -1363,7 +1369,7 @@ class BrowserContext:
|
||||
try:
|
||||
tab_info = TabInfo(page_id=page_id, url=page.url, title=await asyncio.wait_for(page.title(), timeout=1))
|
||||
except asyncio.TimeoutError:
|
||||
# page.title() can hang forever on tabs that are crashed/dissapeared/about:blank
|
||||
# page.title() can hang forever on tabs that are crashed/disappeared/about:blank
|
||||
# we don't want to try automating those tabs because they will hang the whole script
|
||||
logger.debug('⚠ Failed to get tab info for tab #%s: %s (ignoring)', page_id, page.url)
|
||||
tab_info = TabInfo(page_id=page_id, url='about:blank', title='ignore this tab and do not use it')
|
||||
|
||||
@@ -4,10 +4,11 @@ import enum
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Dict, Generic, Optional, Type, TypeVar
|
||||
from typing import Dict, Generic, Optional, Tuple, Type, TypeVar, cast
|
||||
|
||||
from langchain_core.language_models.chat_models import BaseChatModel
|
||||
from langchain_core.prompts import PromptTemplate
|
||||
from playwright.async_api import ElementHandle, Page
|
||||
|
||||
# from lmnr.sdk.laminar import Laminar
|
||||
from pydantic import BaseModel
|
||||
@@ -22,10 +23,12 @@ from browser_use.controller.views import (
|
||||
ClickElementByXpathAction,
|
||||
CloseTabAction,
|
||||
DoneAction,
|
||||
DragDropAction,
|
||||
GoToUrlAction,
|
||||
InputTextAction,
|
||||
NoParamsAction,
|
||||
OpenTabAction,
|
||||
Position,
|
||||
ScrollAction,
|
||||
SearchGoogleAction,
|
||||
SendKeysAction,
|
||||
@@ -57,7 +60,7 @@ class Controller(Generic[Context]):
|
||||
data: output_model
|
||||
|
||||
@self.registry.action(
|
||||
'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached',
|
||||
'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached',
|
||||
param_model=ExtendedOutputModel,
|
||||
)
|
||||
async def done(params: ExtendedOutputModel):
|
||||
@@ -73,7 +76,7 @@ class Controller(Generic[Context]):
|
||||
else:
|
||||
|
||||
@self.registry.action(
|
||||
'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached',
|
||||
'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached',
|
||||
param_model=DoneAction,
|
||||
)
|
||||
async def done(params: DoneAction):
|
||||
@@ -299,7 +302,7 @@ class Controller(Generic[Context]):
|
||||
|
||||
# Content Actions
|
||||
@self.registry.action(
|
||||
'Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links',
|
||||
'Extract page content to retrieve specific information from the page, e.g. all company names, a specific description, all information about, links with companies in structured format or simply links',
|
||||
)
|
||||
async def extract_content(
|
||||
goal: str, should_strip_link_urls: bool, browser: BrowserContext, page_extraction_llm: BaseChatModel
|
||||
@@ -627,6 +630,220 @@ class Controller(Generic[Context]):
|
||||
logger.error(msg)
|
||||
return ActionResult(error=msg, include_in_memory=True)
|
||||
|
||||
@self.registry.action(
    'Drag and drop elements or between coordinates on the page - useful for canvas drawing, sortable lists, sliders, file uploads, and UI rearrangement',
    param_model=DragDropAction,
)
async def drag_drop(params: DragDropAction, browser: BrowserContext) -> ActionResult:
    """
    Performs a precise drag and drop operation between elements or coordinates.

    Two modes are supported:

    - Element mode: used when both ``element_source`` and ``element_target`` are set.
      Each end of the drag is the centre of the element's bounding box, or, when the
      matching ``element_*_offset`` is given, that offset measured from the
      bounding box's top-left corner.
    - Coordinate mode: used when all four ``coord_*`` values are set; they are passed
      to the mouse unchanged.

    Returns an ActionResult with ``error`` set when the drag could not be performed,
    otherwise with ``extracted_content`` describing what was dragged.
    """

    async def get_drag_elements(
        page: Page,
        source_selector: str,
        target_selector: str,
    ) -> Tuple[Optional[ElementHandle], Optional[ElementHandle]]:
        """Get source and target elements with appropriate error handling.

        Either entry of the returned pair is None when the selector matched nothing
        or when the lookup raised (the error is logged, not propagated).
        """
        source_element = None
        target_element = None

        try:
            # page.locator() auto-detects CSS and XPath
            source_locator = page.locator(source_selector)
            target_locator = page.locator(target_selector)

            # Check if elements exist
            source_count = await source_locator.count()
            target_count = await target_locator.count()

            if source_count > 0:
                source_element = await source_locator.first.element_handle()
                logger.debug(f'Found source element with selector: {source_selector}')
            else:
                logger.warning(f'Source element not found: {source_selector}')

            if target_count > 0:
                target_element = await target_locator.first.element_handle()
                logger.debug(f'Found target element with selector: {target_selector}')
            else:
                logger.warning(f'Target element not found: {target_selector}')

        except Exception as e:
            logger.error(f'Error finding elements: {str(e)}')

        return source_element, target_element

    async def get_point_in_element(
        element: ElementHandle,
        offset: Optional[Position],
    ) -> Optional[Tuple[int, int]]:
        """Resolve one drag end point from an element and an optional offset.

        Returns None when the element has no bounding box.
        """
        box = await element.bounding_box()
        if not box:
            return None

        if offset is not None:
            # The offset is documented as pixels from the element's top-left corner,
            # so it has to be added to the box origin instead of being used as-is.
            return int(box['x'] + offset.x), int(box['y'] + offset.y)

        # No offset requested: aim for the centre of the element.
        return int(box['x'] + box['width'] / 2), int(box['y'] + box['height'] / 2)

    async def get_element_coordinates(
        source_element: ElementHandle,
        target_element: ElementHandle,
        source_position: Optional[Position],
        target_position: Optional[Position],
    ) -> Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]:
        """Get coordinates from elements with appropriate error handling.

        Either entry of the returned pair is None when it could not be determined;
        exceptions are logged and leave the remaining entries as None.
        """
        source_coords = None
        target_coords = None

        try:
            source_coords = await get_point_in_element(source_element, source_position)
            target_coords = await get_point_in_element(target_element, target_position)
        except Exception as e:
            logger.error(f'Error getting element coordinates: {str(e)}')

        return source_coords, target_coords

    async def execute_drag_operation(
        page: Page,
        source_x: int,
        source_y: int,
        target_x: int,
        target_y: int,
        steps: int,
        delay_ms: int,
    ) -> Tuple[bool, str]:
        """Execute the drag operation with comprehensive error handling.

        Returns ``(True, message)`` on success and ``(False, reason)`` on failure.
        The mouse button is released even when the drag fails part-way.
        """
        button_is_down = False
        try:
            # Try to move to source position
            try:
                await page.mouse.move(source_x, source_y)
                logger.debug(f'Moved to source position ({source_x}, {source_y})')
            except Exception as e:
                logger.error(f'Failed to move to source position: {str(e)}')
                return False, f'Failed to move to source position: {str(e)}'

            # Press mouse button down
            await page.mouse.down()
            button_is_down = True

            # Move to target position with intermediate steps
            for i in range(1, steps + 1):
                ratio = i / steps
                intermediate_x = int(source_x + (target_x - source_x) * ratio)
                intermediate_y = int(source_y + (target_y - source_y) * ratio)

                await page.mouse.move(intermediate_x, intermediate_y)

                if delay_ms > 0:
                    await asyncio.sleep(delay_ms / 1000)

            # Move to final target position
            await page.mouse.move(target_x, target_y)

            # Move again to ensure dragover events are properly triggered
            await page.mouse.move(target_x, target_y)

            # Release mouse button
            await page.mouse.up()
            button_is_down = False

            return True, 'Drag operation completed successfully'

        except Exception as e:
            return False, f'Error during drag operation: {str(e)}'

        finally:
            # A failure between down() and up() would otherwise leave the button
            # pressed and the page stuck mid-drag for every following action.
            if button_is_down:
                try:
                    await page.mouse.up()
                except Exception as release_error:
                    logger.debug(f'Failed to release mouse after aborted drag: {str(release_error)}')

    page = await browser.get_current_page()

    try:
        # Initialize variables
        source_x: Optional[int] = None
        source_y: Optional[int] = None
        target_x: Optional[int] = None
        target_y: Optional[int] = None

        # Normalize parameters
        # steps: None or 0 falls back to the default of 10, never fewer than one step.
        steps = max(1, params.steps or 10)
        # delay_ms: only None falls back to the default; an explicit 0 ("fastest") is honoured.
        delay_ms = max(0, params.delay_ms if params.delay_ms is not None else 5)

        # Case 1: Element selectors provided
        if params.element_source and params.element_target:
            logger.debug('Using element-based approach with selectors')

            source_element, target_element = await get_drag_elements(
                page,
                params.element_source,
                params.element_target,
            )

            if not source_element or not target_element:
                error_msg = f'Failed to find {"source" if not source_element else "target"} element'
                return ActionResult(error=error_msg, include_in_memory=True)

            source_coords, target_coords = await get_element_coordinates(
                source_element, target_element, params.element_source_offset, params.element_target_offset
            )

            if not source_coords or not target_coords:
                error_msg = f'Failed to determine {"source" if not source_coords else "target"} coordinates'
                return ActionResult(error=error_msg, include_in_memory=True)

            source_x, source_y = source_coords
            target_x, target_y = target_coords

        # Case 2: Coordinates provided directly
        elif all(
            coord is not None
            for coord in [params.coord_source_x, params.coord_source_y, params.coord_target_x, params.coord_target_y]
        ):
            logger.debug('Using coordinate-based approach')
            source_x = params.coord_source_x
            source_y = params.coord_source_y
            target_x = params.coord_target_x
            target_y = params.coord_target_y
        else:
            error_msg = 'Must provide either source/target selectors or source/target coordinates'
            return ActionResult(error=error_msg, include_in_memory=True)

        # Validate coordinates
        if any(coord is None for coord in [source_x, source_y, target_x, target_y]):
            error_msg = 'Failed to determine source or target coordinates'
            return ActionResult(error=error_msg, include_in_memory=True)

        # Perform the drag operation
        # cast() only narrows the Optional[int] types for the type checker; the
        # None case was already rejected by the validation above.
        success, message = await execute_drag_operation(
            page,
            cast(int, source_x),
            cast(int, source_y),
            cast(int, target_x),
            cast(int, target_y),
            steps,
            delay_ms,
        )

        if not success:
            logger.error(f'Drag operation failed: {message}')
            return ActionResult(error=message, include_in_memory=True)

        # Create descriptive message
        if params.element_source and params.element_target:
            msg = f"🖱️ Dragged element '{params.element_source}' to '{params.element_target}'"
        else:
            msg = f'🖱️ Dragged from ({source_x}, {source_y}) to ({target_x}, {target_y})'

        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    except Exception as e:
        error_msg = f'Failed to perform drag and drop: {str(e)}'
        logger.error(error_msg)
        return ActionResult(error=error_msg, include_in_memory=True)
|
||||
|
||||
# Register ---------------------------------------------------------------
|
||||
|
||||
def action(self, description: str, **kwargs):
|
||||
|
||||
@@ -96,3 +96,30 @@ class NoParamsAction(BaseModel):
|
||||
def ignore_all_inputs(cls, values):
|
||||
# No matter what the user sends, discard it and return empty.
|
||||
return {}
|
||||
|
||||
|
||||
class Position(BaseModel):
    # A 2D point in integer pixels; DragDropAction uses it for the optional
    # element_source_offset / element_target_offset fields.
    # (Deliberately documented with comments rather than a docstring so the
    # generated model schema is left unchanged.)

    x: int  # horizontal component
    y: int  # vertical component
|
||||
|
||||
|
||||
class DragDropAction(BaseModel):
    # Parameters of the drag-and-drop action. Every field is optional with a default.
    # (Documented with comments rather than a docstring so the generated model
    # schema is left unchanged.)

    # --- Element-based approach -------------------------------------------
    element_source: Optional[str] = Field(
        default=None,
        description='CSS selector or XPath of the element to drag from',
    )
    element_target: Optional[str] = Field(
        default=None,
        description='CSS selector or XPath of the element to drop onto',
    )
    element_source_offset: Optional[Position] = Field(
        default=None,
        description='Precise position within the source element to start drag (in pixels from top-left corner)',
    )
    element_target_offset: Optional[Position] = Field(
        default=None,
        description='Precise position within the target element to drop (in pixels from top-left corner)',
    )

    # --- Coordinate-based approach (used if selectors not provided) --------
    coord_source_x: Optional[int] = Field(
        default=None,
        description='Absolute X coordinate on page to start drag from (in pixels)',
    )
    coord_source_y: Optional[int] = Field(
        default=None,
        description='Absolute Y coordinate on page to start drag from (in pixels)',
    )
    coord_target_x: Optional[int] = Field(
        default=None,
        description='Absolute X coordinate on page to drop at (in pixels)',
    )
    coord_target_y: Optional[int] = Field(
        default=None,
        description='Absolute Y coordinate on page to drop at (in pixels)',
    )

    # --- Common options ------------------------------------------------------
    steps: Optional[int] = Field(
        default=10,
        description='Number of intermediate points for smoother movement (5-20 recommended)',
    )
    delay_ms: Optional[int] = Field(
        default=5,
        description='Delay in milliseconds between steps (0 for fastest, 10-20 for more natural)',
    )
|
||||
|
||||
@@ -93,7 +93,7 @@ class SignalHandler:
|
||||
except Exception:
|
||||
# there are situations where signal handlers are not supported, e.g.
|
||||
# - when running in a thread other than the main thread
|
||||
# - some opearating systems
|
||||
# - some operating systems
|
||||
# - inside jupyter notebooks
|
||||
pass
|
||||
|
||||
|
||||
@@ -47,7 +47,8 @@ agent = Agent(
|
||||
- Disable to reduce costs or use models without vision support
|
||||
- For GPT-4o, image processing costs approximately 800-1000 tokens (~$0.002 USD) per image (but this depends on the defined screen size)
|
||||
- `save_conversation_path`: Path to save the complete conversation history. Useful for debugging.
|
||||
- `system_prompt_class`: Custom system prompt class. See <a href="/customize/system-prompt">System Prompt</a> for customization options.
|
||||
- `override_system_message`: Completely replace the default system prompt with a custom one.
|
||||
- `extend_system_message`: Add additional instructions to the default system prompt.
|
||||
|
||||
<Note>
|
||||
Vision capabilities are recommended for better web interaction understanding,
|
||||
@@ -183,7 +184,7 @@ agent = Agent(
|
||||
|
||||
You can configure the agent and provide a separate message to help the LLM understand the task better.
|
||||
|
||||
```python
|
||||
```python
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
agent = Agent(
|
||||
|
||||
@@ -161,8 +161,8 @@ async def run_search():
|
||||
Highlight interactive elements on the screen with colorful bounding boxes.
|
||||
|
||||
- **viewport_expansion** (default: `500`)
|
||||
Viewport expansion in pixels. With this you can controll how much of the page is included in the context of the LLM. If set to -1, all elements from the entire page will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included.
|
||||
Default is 500 pixels, that means that we inlcude a little bit more than the visible viewport inside the context.
|
||||
Viewport expansion in pixels. With this you can control how much of the page is included in the context of the LLM. If set to -1, all elements from the entire page will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included.
|
||||
Default is 500 pixels, that means that we include a little bit more than the visible viewport inside the context.
|
||||
|
||||
### Restrict URLs
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ from browser_use import Browser, Controller, ActionResult
|
||||
controller = Controller()
|
||||
@controller.action('Open website')
|
||||
async def open_website(url: str, browser: Browser):
|
||||
page = browser.get_current_page()
|
||||
page = await browser.get_current_page()
|
||||
await page.goto(url)
|
||||
return ActionResult(extracted_content='Website opened')
|
||||
```
|
||||
|
||||
@@ -43,7 +43,7 @@ if __name__ == '__main__':
|
||||
In this example:
|
||||
1. The model only sees `x_name` and `x_password` as placeholders.
|
||||
2. When the model wants to use your password it outputs x_password - and we replace it with the actual value.
|
||||
3. When your password is visable on the current page, we replace it in the LLM input - so that the model never has it in its state.
|
||||
3. When your password is visible on the current page, we replace it in the LLM input - so that the model never has it in its state.
|
||||
|
||||
Warning: Vision models still see the image of the page - where the sensitive data might be visible.
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ async def main():
|
||||
llm = ChatOpenAI(model_name='gpt-4o')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent( # disco mode will not be triggered on apple.com because the LLM wont be able to see that action available, it should work on Google.com though.
|
||||
agent = Agent( # disco mode will not be triggered on apple.com because the LLM won't be able to see that action available, it should work on Google.com though.
|
||||
task="""
|
||||
Go to apple.com and trigger disco mode (if dont know how to do that, then just move on).
|
||||
Then go to google.com and trigger disco mode.
|
||||
|
||||
@@ -21,7 +21,7 @@ async def done(text: str):
|
||||
|
||||
# To send emails use
|
||||
# STEP 1: go to https://support.google.com/accounts/answer/185833
|
||||
# STEP 2: Create an app password (you cant use here your normal gmail password)
|
||||
# STEP 2: Create an app password (you can't use here your normal gmail password)
|
||||
# STEP 3: Use the app password in the code below for the password
|
||||
yag = yagmail.SMTP('your_email@gmail.com', 'your_app_password')
|
||||
yag.send(
|
||||
|
||||
46
examples/features/drag_drop.py
Normal file
46
examples/features/drag_drop.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from langchain_google_genai import ChatGoogleGenerativeAI
|
||||
from pydantic import SecretStr
|
||||
|
||||
from browser_use import Agent
|
||||
|
||||
load_dotenv()
|
||||
api_key = os.getenv('GEMINI_API_KEY')
|
||||
if not api_key:
|
||||
raise ValueError('GEMINI_API_KEY is not set')
|
||||
|
||||
llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
|
||||
|
||||
|
||||
# Prompt for the sortable-list demo: the agent has to reorder a list item by dragging it.
task_1 = """
Navigate to: https://sortablejs.github.io/Sortable/.
Then scroll down to the first example with title "Simple list example".
Drag the element with name "item 1" to below the element with name "item 3".
"""
|
||||
|
||||
|
||||
task_2 = """
|
||||
Navigate to: https://excalidraw.com/.
|
||||
Click on the pencil icon (with index 40).
|
||||
Then draw a triangle in the canvas.
|
||||
Draw the triangle starting from coordinate (400,400).
|
||||
You can use the drag and drop action to draw the triangle.
|
||||
"""
|
||||
|
||||
|
||||
async def run_search():
    """Create an Agent for ``task_1`` and run it for at most 25 steps."""
    # Options are gathered first and then unpacked into the Agent constructor.
    agent_options = {
        'task': task_1,
        'llm': llm,
        'max_actions_per_step': 1,
        'use_vision': True,
    }
    drag_agent = Agent(**agent_options)

    await drag_agent.run(max_steps=25)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.run(run_search())
|
||||
@@ -13,7 +13,7 @@ from browser_use.browser.browser import Browser, BrowserConfig
|
||||
|
||||
llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
|
||||
task = (
|
||||
'go to google.com and search for openai.com and click on the first link then extract content and scroll down - whats there?'
|
||||
"go to google.com and search for openai.com and click on the first link then extract content and scroll down - what's there?"
|
||||
)
|
||||
|
||||
allowed_domains = ['google.com']
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
Demostrate output validator.
|
||||
Demonstrate output validator.
|
||||
|
||||
@dev You need to add OPENAI_API_KEY to your environment variables.
|
||||
"""
|
||||
|
||||
@@ -29,7 +29,7 @@ Five Steps to create and invite a Discord bot:
|
||||
* Click “Authorize”.
|
||||
--> Note: The person adding the bot needs "Manage Server" permissions.
|
||||
6. Run the code below to start the bot with your bot token.
|
||||
7. Write e.g. "/bu whats the weather in Tokyo?" to start a browser-use task and get a response inside the Discord channel.
|
||||
7. Write e.g. "/bu what's the weather in Tokyo?" to start a browser-use task and get a response inside the Discord channel.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
@@ -38,14 +38,14 @@ Steps to create and configure a Slack bot:
|
||||
6. Invite the bot to a channel:
|
||||
* Use the `/invite @your-bot-name` command in the Slack channel where you want the bot to be active.
|
||||
7. Run the code in `examples/slack_example.py` to start the bot with your bot token and signing secret.
|
||||
8. Write e.g. "$bu whats the weather in Tokyo?" to start a browser-use task and get a response inside the Slack channel.
|
||||
8. Write e.g. "$bu what's the weather in Tokyo?" to start a browser-use task and get a response inside the Slack channel.
|
||||
|
||||
## Installing and Starting ngrok
|
||||
|
||||
To expose your local server to the internet, you can use ngrok. Follow these steps to install and start ngrok:
|
||||
|
||||
1. Download ngrok from the official website: https://ngrok.com/download
|
||||
2. Create a free account and follow the offical steps to install ngrok.
|
||||
2. Create a free account and follow the official steps to install ngrok.
|
||||
3. Start ngrok by running the following command in your terminal:
|
||||
```sh
|
||||
ngrok http 3000
|
||||
|
||||
@@ -102,7 +102,7 @@ async def input_selected_cell_text(browser: BrowserContext, text: str):
|
||||
page = await browser.get_current_page()
|
||||
|
||||
await page.keyboard.type(text, delay=0.1)
|
||||
await page.keyboard.press('Enter') # make sure to commit the input so it doesnt get overwritten by the next action
|
||||
await page.keyboard.press('Enter') # make sure to commit the input so it doesn't get overwritten by the next action
|
||||
await page.keyboard.press('ArrowUp')
|
||||
return ActionResult(extracted_content=f'Inputted text {text}', include_in_memory=False)
|
||||
|
||||
@@ -128,7 +128,7 @@ async def update_range_contents(browser: BrowserContext, range: str, new_content
|
||||
# - https://github.com/philc/sheetkeys/blob/master/content_scripts/commands.js
|
||||
# - https://support.google.com/docs/answer/181110?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cmac-shortcuts
|
||||
|
||||
# Tip: LLM is bad at spacial reasoning, don't make it navigate with arrow keys relative to current cell
|
||||
# Tip: LLM is bad at spatial reasoning, don't make it navigate with arrow keys relative to current cell
|
||||
# if given arrow keys, it will try to jump from G1 to A2 by pressing Down, without realizing needs to go Down+LeftLeftLeftLeft
|
||||
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ At this stage, check the basket on the top right (indicates the price) and check
|
||||
- If the total order **is below CHF 99**, add **a liquid soap refill** to reach the minimum. If it's still below, you can buy some bread or dark chocolate.
|
||||
- At this step, check if you have bought MORE items than needed. If the price is more than CHF200, you MUST remove items.
|
||||
- If an item is not available, choose an alternative.
|
||||
- if an age verification is needed, remove alchoholic products, we haven't verified yet.
|
||||
- if an age verification is needed, remove alcoholic products, we haven't verified yet.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -49,6 +49,10 @@ dependencies = [
|
||||
urls = { "Repository" = "https://github.com/browser-use/browser-use" }
|
||||
|
||||
|
||||
[tool.codespell]
|
||||
ignore-words-list = "bu"
|
||||
skip = "*.json"
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 130
|
||||
fix = true
|
||||
@@ -89,4 +93,5 @@ dev-dependencies = [
|
||||
"langchain-fireworks>=0.2.6",
|
||||
"ipdb>=0.13.13",
|
||||
"pre-commit>=4.2.0",
|
||||
"codespell>=2.4.1",
|
||||
]
|
||||
|
||||
@@ -23,7 +23,7 @@ controller = Controller()
|
||||
|
||||
# use this test to ask the model questions about the page like
|
||||
# which color do you see for bbox labels, list all with their label
|
||||
# whats the smallest bboxes with labels and
|
||||
# what's the smallest bboxes with labels and
|
||||
|
||||
|
||||
@controller.registry.action(description='explain what you see on the screen and ask user for input')
|
||||
@@ -40,7 +40,7 @@ async def done(text: str) -> str:
|
||||
|
||||
|
||||
agent = Agent(
|
||||
task='call explain_screen all the time the user asks you questions e.g. about the page like bbox which you see are labels - your task is to expalin it and get the next question',
|
||||
task='call explain_screen all the time the user asks you questions e.g. about the page like bbox which you see are labels - your task is to explain it and get the next question',
|
||||
llm=llm,
|
||||
controller=controller,
|
||||
browser=Browser(config=BrowserConfig(disable_security=True, headless=False)),
|
||||
|
||||
Reference in New Issue
Block a user