From 4633eb5e14025d171195705c898ef32ef78ea71e Mon Sep 17 00:00:00 2001 From: Bartlomiej Wietrak Date: Thu, 27 Feb 2025 14:25:34 +0100 Subject: [PATCH 1/9] Add http_credentials to browser context. --- browser_use/browser/context.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/browser_use/browser/context.py b/browser_use/browser/context.py index 44b7595d5..5eb802317 100644 --- a/browser_use/browser/context.py +++ b/browser_use/browser/context.py @@ -107,6 +107,10 @@ class BrowserContextConfig: include_dynamic_attributes: bool = True Include dynamic attributes in the CSS selector. If you want to reuse the css_selectors, it might be better to set this to False. + + http_credentials: None + Dictionary with HTTP authentication credentials, e.g. + {"username": "bill", "password": "pa55w0rd"} """ cookies_file: str | None = None @@ -132,6 +136,7 @@ class BrowserContextConfig: viewport_expansion: int = 500 allowed_domains: list[str] | None = None include_dynamic_attributes: bool = True + http_credentials: dict[str, str] | None = None _force_keep_context_alive: bool = False @@ -326,6 +331,7 @@ class BrowserContext: record_video_dir=self.config.save_recording_path, record_video_size=self.config.browser_window_size, locale=self.config.locale, + http_credentials=self.config.http_credentials, ) if self.config.trace_path: From 6f96453794e7da521c7e6a760b15bf3ececb1c30 Mon Sep 17 00:00:00 2001 From: PaperBoardOfficial Date: Sat, 29 Mar 2025 11:01:09 +0530 Subject: [PATCH 2/9] added drag drop action --- browser_use/controller/service.py | 223 +++++++++++++++++++++++++++++- browser_use/controller/views.py | 21 +++ examples/features/drag_drop.py | 47 +++++++ 3 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 examples/features/drag_drop.py diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py index 7c3f78b45..f1faa8536 100644 --- a/browser_use/controller/service.py +++ b/browser_use/controller/service.py @@ -3,13 +3,13 @@ import 
enum import json import logging import re -from typing import Dict, Generic, Optional, Type, TypeVar +from typing import Dict, Generic, Optional, Tuple, Type, TypeVar, cast from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.prompts import PromptTemplate # from lmnr.sdk.laminar import Laminar -from pydantic import BaseModel +from pydantic import BaseModel, Field from browser_use.agent.views import ActionModel, ActionResult from browser_use.browser.context import BrowserContext @@ -21,11 +21,13 @@ from browser_use.controller.views import ( ClickElementByXpathAction, CloseTabAction, DoneAction, + DragDropAction, GoToUrlAction, GroupTabsAction, InputTextAction, NoParamsAction, OpenTabAction, + Position, ScrollAction, SearchGoogleAction, SendKeysAction, @@ -33,6 +35,9 @@ from browser_use.controller.views import ( UngroupTabsAction, ) from browser_use.utils import time_execution_sync +from playwright.async_api import BrowserContext as PlaywrightBrowserContext, Page, ElementHandle, Locator +from playwright.async_api import Position as PlaywrightPosition +from playwright.async_api import Error as PlaywrightError logger = logging.getLogger(__name__) @@ -552,6 +557,220 @@ class Controller(Generic[Context]): logger.error(msg) return ActionResult(error=msg, include_in_memory=True) + @self.registry.action( + 'Drag and drop elements or between coordinates on the page - useful for canvas drawing, sortable lists, sliders, file uploads, and UI rearrangement', + param_model=DragDropAction, + ) + async def drag_drop(params: DragDropAction, browser: BrowserContext) -> ActionResult: + """ + Performs a precise drag and drop operation between elements or coordinates. 
+ """ + async def get_drag_elements( + page: Page, + source_selector: str, + target_selector: str, + ) -> Tuple[Optional[ElementHandle], Optional[ElementHandle]]: + """Get source and target elements with appropriate error handling.""" + source_element = None + target_element = None + + try: + # page.locator() auto-detects CSS and XPath + source_locator = page.locator(source_selector) + target_locator = page.locator(target_selector) + + # Check if elements exist + source_count = await source_locator.count() + target_count = await target_locator.count() + + if source_count > 0: + source_element = await source_locator.first.element_handle() + logger.debug(f"Found source element with selector: {source_selector}") + else: + logger.warning(f"Source element not found: {source_selector}") + + if target_count > 0: + target_element = await target_locator.first.element_handle() + logger.debug(f"Found target element with selector: {target_selector}") + else: + logger.warning(f"Target element not found: {target_selector}") + + except Exception as e: + logger.error(f"Error finding elements: {str(e)}") + + return source_element, target_element + + async def get_element_coordinates( + source_element: ElementHandle, + target_element: ElementHandle, + source_position: Optional[Position], + target_position: Optional[Position] + ) -> Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]: + """Get coordinates from elements with appropriate error handling.""" + source_coords = None + target_coords = None + + try: + # Get source coordinates + if source_position: + source_coords = (source_position.x, source_position.y) + else: + source_box = await source_element.bounding_box() + if source_box: + source_coords = ( + int(source_box["x"] + source_box["width"] / 2), + int(source_box["y"] + source_box["height"] / 2) + ) + + # Get target coordinates + if target_position: + target_coords = (target_position.x, target_position.y) + else: + target_box = await target_element.bounding_box() + 
if target_box: + target_coords = ( + int(target_box["x"] + target_box["width"] / 2), + int(target_box["y"] + target_box["height"] / 2) + ) + except Exception as e: + logger.error(f"Error getting element coordinates: {str(e)}") + + return source_coords, target_coords + + async def execute_drag_operation( + page: Page, + source_x: int, + source_y: int, + target_x: int, + target_y: int, + steps: int, + delay_ms: int, + ) -> Tuple[bool, str]: + """Execute the drag operation with comprehensive error handling.""" + try: + # Try to move to source position + try: + await page.mouse.move(source_x, source_y) + logger.debug(f"Moved to source position ({source_x}, {source_y})") + except Exception as e: + logger.error(f"Failed to move to source position: {str(e)}") + return False, f"Failed to move to source position: {str(e)}" + + # Press mouse button down + await page.mouse.down() + + # Move to target position with intermediate steps + for i in range(1, steps + 1): + ratio = i / steps + intermediate_x = int(source_x + (target_x - source_x) * ratio) + intermediate_y = int(source_y + (target_y - source_y) * ratio) + + await page.mouse.move(intermediate_x, intermediate_y) + + if delay_ms > 0: + await asyncio.sleep(delay_ms / 1000) + + # Move to final target position + await page.mouse.move(target_x, target_y) + + # Move again to ensure dragover events are properly triggered + await page.mouse.move(target_x, target_y) + + # Release mouse button + await page.mouse.up() + + return True, "Drag operation completed successfully" + + except Exception as e: + return False, f"Error during drag operation: {str(e)}" + + page = await browser.get_current_page() + + try: + # Initialize variables + source_x: Optional[int] = None + source_y: Optional[int] = None + target_x: Optional[int] = None + target_y: Optional[int] = None + + # Normalize parameters + steps = max(1, params.steps or 10) + delay_ms = max(0, params.delay_ms or 5) + + # Case 1: Element selectors provided + if 
params.element_source and params.element_target: + logger.debug(f"Using element-based approach with selectors") + + source_element, target_element = await get_drag_elements( + page, + params.element_source, + params.element_target, + ) + + if not source_element or not target_element: + error_msg = f"Failed to find {'source' if not source_element else 'target'} element" + return ActionResult(error=error_msg, include_in_memory=True) + + source_coords, target_coords = await get_element_coordinates( + source_element, + target_element, + params.element_source_offset, + params.element_target_offset + ) + + if not source_coords or not target_coords: + error_msg = f"Failed to determine {'source' if not source_coords else 'target'} coordinates" + return ActionResult(error=error_msg, include_in_memory=True) + + source_x, source_y = source_coords + target_x, target_y = target_coords + + # Case 2: Coordinates provided directly + elif all(coord is not None for coord in [params.coord_source_x, params.coord_source_y, params.coord_target_x, params.coord_target_y]): + logger.debug(f"Using coordinate-based approach") + source_x = params.coord_source_x + source_y = params.coord_source_y + target_x = params.coord_target_x + target_y = params.coord_target_y + else: + error_msg = "Must provide either source/target selectors or source/target coordinates" + return ActionResult(error=error_msg, include_in_memory=True) + + # Validate coordinates + if any(coord is None for coord in [source_x, source_y, target_x, target_y]): + error_msg = "Failed to determine source or target coordinates" + return ActionResult(error=error_msg, include_in_memory=True) + + # Perform the drag operation + success, message = await execute_drag_operation( + page, + cast(int, source_x), + cast(int, source_y), + cast(int, target_x), + cast(int, target_y), + steps, + delay_ms, + ) + + if not success: + logger.error(f"Drag operation failed: {message}") + return ActionResult(error=message, include_in_memory=True) + + # 
Create descriptive message + if params.element_source and params.element_target: + msg = f"🖱️ Dragged element '{params.element_source}' to '{params.element_target}'" + else: + msg = f"🖱️ Dragged from ({source_x}, {source_y}) to ({target_x}, {target_y})" + + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + except Exception as e: + error_msg = f"Failed to perform drag and drop: {str(e)}" + logger.error(error_msg) + return ActionResult(error=error_msg, include_in_memory=True) + + # Register --------------------------------------------------------------- def action(self, description: str, **kwargs): diff --git a/browser_use/controller/views.py b/browser_use/controller/views.py index 31c3bd833..15c0dd291 100644 --- a/browser_use/controller/views.py +++ b/browser_use/controller/views.py @@ -86,3 +86,24 @@ class NoParamsAction(BaseModel): # If you want to silently allow unknown fields at top-level, # set extra = 'allow' as well: extra = 'allow' + +class Position(BaseModel): + x: int + y: int + +class DragDropAction(BaseModel): + # Element-based approach + element_source: Optional[str] = Field(None, description="CSS selector or XPath of the element to drag from") + element_target: Optional[str] = Field(None, description="CSS selector or XPath of the element to drop onto") + element_source_offset: Optional[Position] = Field(None, description="Precise position within the source element to start drag (in pixels from top-left corner)") + element_target_offset: Optional[Position] = Field(None, description="Precise position within the target element to drop (in pixels from top-left corner)") + + # Coordinate-based approach (used if selectors not provided) + coord_source_x: Optional[int] = Field(None, description="Absolute X coordinate on page to start drag from (in pixels)") + coord_source_y: Optional[int] = Field(None, description="Absolute Y coordinate on page to start drag from (in pixels)") + coord_target_x: Optional[int] = 
Field(None, description="Absolute X coordinate on page to drop at (in pixels)") + coord_target_y: Optional[int] = Field(None, description="Absolute Y coordinate on page to drop at (in pixels)") + + # Common options + steps: Optional[int] = Field(10, description="Number of intermediate points for smoother movement (5-20 recommended)") + delay_ms: Optional[int] = Field(5, description="Delay in milliseconds between steps (0 for fastest, 10-20 for more natural)") diff --git a/examples/features/drag_drop.py b/examples/features/drag_drop.py new file mode 100644 index 000000000..074bbfa88 --- /dev/null +++ b/examples/features/drag_drop.py @@ -0,0 +1,47 @@ +import asyncio +import os + +from dotenv import load_dotenv +from langchain_google_genai import ChatGoogleGenerativeAI +from pydantic import SecretStr + +from browser_use import Agent + + +load_dotenv() +api_key = os.getenv('GEMINI_API_KEY') +if not api_key: + raise ValueError('GEMINI_API_KEY is not set') + +llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key)) + + +task_1 = """ +Navigate to: https://sortablejs.github.io/Sortable/. +Then scroll down to the first examplw with title "Simple list example". +Drag the element with name "item 1" to below the element with name "item 3". +""" + + +task_2 = """ +Navigate to: https://excalidraw.com/. +Click on the pencil icon (with index 40). +Then draw a triangle in the canvas. +Draw the triangle starting from coordinate (400,400). +You can use the drag and drop action to draw the triangle. 
+""" + + +async def run_search(): + agent = Agent( + task=task_1, + llm=llm, + max_actions_per_step=1, + use_vision=True, + ) + + await agent.run(max_steps=25) + + +if __name__ == '__main__': + asyncio.run(run_search()) From 230eb5f62de4b3141611f2842602d6b196b36dea Mon Sep 17 00:00:00 2001 From: PaperBoardOfficial Date: Sat, 29 Mar 2025 11:10:31 +0530 Subject: [PATCH 3/9] added drag drop action --- browser_use/controller/service.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py index f1faa8536..e17fe8d44 100644 --- a/browser_use/controller/service.py +++ b/browser_use/controller/service.py @@ -9,7 +9,7 @@ from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.prompts import PromptTemplate # from lmnr.sdk.laminar import Laminar -from pydantic import BaseModel, Field +from pydantic import BaseModel from browser_use.agent.views import ActionModel, ActionResult from browser_use.browser.context import BrowserContext @@ -35,9 +35,7 @@ from browser_use.controller.views import ( UngroupTabsAction, ) from browser_use.utils import time_execution_sync -from playwright.async_api import BrowserContext as PlaywrightBrowserContext, Page, ElementHandle, Locator -from playwright.async_api import Position as PlaywrightPosition -from playwright.async_api import Error as PlaywrightError +from playwright.async_api import Page, ElementHandle logger = logging.getLogger(__name__) From e7c3f43808e177250947044e73fd3319a6d72770 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sat, 29 Mar 2025 17:44:19 +0100 Subject: [PATCH 4/9] Fix typos discovered by codespell --- .pre-commit-config.yaml | 7 +++++++ README.md | 2 +- browser_use/agent/message_manager/service.py | 2 +- browser_use/agent/system_prompt.md | 2 +- browser_use/browser/browser.py | 4 ++-- browser_use/browser/chrome.py | 2 +- browser_use/browser/context.py | 4 ++-- browser_use/controller/service.py | 
6 +++--- docs/customize/browser-settings.mdx | 4 ++-- docs/customize/sensitive-data.mdx | 2 +- examples/custom-functions/action_filters.py | 2 +- examples/custom-functions/notification.py | 2 +- examples/features/restrict_urls.py | 2 +- examples/features/validate_output.py | 2 +- examples/integrations/discord/discord_example.py | 2 +- examples/integrations/slack/README.md | 4 ++-- examples/use-cases/google_sheets.py | 4 ++-- examples/use-cases/shopping.py | 2 +- pyproject.toml | 4 ++++ tests/test_vision.py | 4 ++-- 20 files changed, 37 insertions(+), 26 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 68bfa89bc..92cc8ca59 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,3 +23,10 @@ repos: - id: detect-private-key - id: mixed-line-ending - id: fix-byte-order-marker + + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell # See pyproject.toml for args + additional_dependencies: + - tomli diff --git a/README.md b/README.md index ec3693706..0860f50f8 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ Tell your computer what to do, and it gets it done. 
### Rerunning tasks - [ ] LLM as fallback -- [ ] Make it easy to define workfow templates where LLM fills in the details +- [ ] Make it easy to define workflow templates where LLM fills in the details - [ ] Return playwright script from the agent ### Datasets diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index b75ea7d9d..76aab4955 100644 --- a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -62,7 +62,7 @@ class MessageManager: self._add_message_with_tokens(task_message, message_type='init') if self.settings.sensitive_data: - info = f'Here are placeholders for sensitve data: {list(self.settings.sensitive_data.keys())}' + info = f'Here are placeholders for sensitive data: {list(self.settings.sensitive_data.keys())}' info += 'To use them, write the placeholder name' info_message = HumanMessage(content=info) self._add_message_with_tokens(info_message, message_type='init') diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md index 0de6eab90..5f7eff0a1 100644 --- a/browser_use/agent/system_prompt.md +++ b/browser_use/agent/system_prompt.md @@ -49,7 +49,7 @@ Common action sequences: 5. TASK COMPLETION: - Use the done action as the last action as soon as the ultimate task is complete - Dont use "done" before you are done with everything the user asked you, except you reach the last step of max_steps. -- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completly finished set success to true. If not everything the user asked for is completed set success in done to false! +- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. 
If not everything the user asked for is completed set success in done to false! - If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step. - Don't hallucinate actions - Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task. diff --git a/browser_use/browser/browser.py b/browser_use/browser/browser.py index 782575799..28f9509b5 100644 --- a/browser_use/browser/browser.py +++ b/browser_use/browser/browser.py @@ -110,7 +110,7 @@ class Browser: """ Playwright browser on steroids. - This is persistant browser factory that can spawn multiple browser contexts. + This is persistent browser factory that can spawn multiple browser contexts. It is recommended to use only one instance of Browser per your application (RAM usage will grow otherwise). 
""" @@ -180,7 +180,7 @@ class Browser: # Check if browser is already running response = requests.get('http://localhost:9222/json/version', timeout=2) if response.status_code == 200: - logger.info('🔌 Re-using existing browser found running on http://localhost:9222') + logger.info('🔌 Reusing existing browser found running on http://localhost:9222') browser_class = getattr(playwright, self.config.browser_class) browser = await browser_class.connect_over_cdp( endpoint_url='http://localhost:9222', diff --git a/browser_use/browser/chrome.py b/browser_use/browser/chrome.py index 9e8ab34a6..60027b5c6 100644 --- a/browser_use/browser/chrome.py +++ b/browser_use/browser/chrome.py @@ -71,7 +71,7 @@ CHROME_DETERMINISTIC_RENDERING_ARGS = [ # chrome://gpu '--enable-webgl', # enable web-gl graphics support '--font-render-hinting=none', # make rendering more deterministic by ignoring OS font hints, may also need css override, try: * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;} - '--force-color-profile=srgb', # make rendering more deterministic by using consitent color profile, if browser looks weird, try: generic-rgb + '--force-color-profile=srgb', # make rendering more deterministic by using consistent color profile, if browser looks weird, try: generic-rgb '--disable-partial-raster', # make rendering more deterministic (TODO: verify if still needed) '--disable-skia-runtime-opts', # make rendering more deterministic by avoiding Skia hot path runtime optimizations '--disable-2d-canvas-clip-aa', # make rendering more deterministic by disabling antialiasing on 2d canvas clips diff --git a/browser_use/browser/context.py b/browser_use/browser/context.py index cb9f3f56e..1545374ee 100644 --- a/browser_use/browser/context.py +++ b/browser_use/browser/context.py @@ -889,7 +889,7 @@ class BrowserContext: # Get all cross-origin iframes within the page and open them in new tabs # mark the titles of the new tabs so the LLM knows to check them for 
additional content # unfortunately too buggy for now, too many sites use invisible cross-origin iframes for ads, tracking, youtube videos, social media, etc. - # and it distracts the bot by openeing a lot of new tabs + # and it distracts the bot by opening a lot of new tabs # iframe_urls = await dom_service.get_cross_origin_iframes() # for url in iframe_urls: # if url in [tab.url for tab in tabs_info]: @@ -1379,7 +1379,7 @@ class BrowserContext: try: tab_info = TabInfo(page_id=page_id, url=page.url, title=await asyncio.wait_for(page.title(), timeout=1)) except asyncio.TimeoutError: - # page.title() can hang forever on tabs that are crashed/dissapeared/about:blank + # page.title() can hang forever on tabs that are crashed/disappeared/about:blank # we dont want to try automating those tabs because they will hang the whole script logger.debug('⚠ Failed to get tab info for tab #%s: %s (ignoring)', page_id, page.url) tab_info = TabInfo(page_id=page_id, url='about:blank', title='ignore this tab and do not use it') diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py index dc526bc36..b167c676a 100644 --- a/browser_use/controller/service.py +++ b/browser_use/controller/service.py @@ -57,7 +57,7 @@ class Controller(Generic[Context]): data: output_model @self.registry.action( - 'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached', + 'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached', param_model=ExtendedOutputModel, ) async def done(params: ExtendedOutputModel): @@ -73,7 +73,7 @@ class Controller(Generic[Context]): else: @self.registry.action( - 'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached', + 'Complete task - with return text and if the 
task is finished (success=True) or not yet completely finished (success=False), because last step is reached', param_model=DoneAction, ) async def done(params: DoneAction): @@ -299,7 +299,7 @@ class Controller(Generic[Context]): # Content Actions @self.registry.action( - 'Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links', + 'Extract page content to retrieve specific information from the page, e.g. all company names, a specific description, all information about, links with companies in structured format or simply links', ) async def extract_content( goal: str, should_strip_link_urls: bool, browser: BrowserContext, page_extraction_llm: BaseChatModel diff --git a/docs/customize/browser-settings.mdx b/docs/customize/browser-settings.mdx index c5dfb6754..1d7903b9e 100644 --- a/docs/customize/browser-settings.mdx +++ b/docs/customize/browser-settings.mdx @@ -161,8 +161,8 @@ async def run_search(): Highlight interactive elements on the screen with colorful bounding boxes. - **viewport_expansion** (default: `500`) - Viewport expansion in pixels. With this you can controll how much of the page is included in the context of the LLM. If set to -1, all elements from the entire page will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included. - Default is 500 pixels, that means that we inlcude a little bit more than the visible viewport inside the context. + Viewport expansion in pixels. With this you can control how much of the page is included in the context of the LLM. If set to -1, all elements from the entire page will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included. 
+ Default is 500 pixels, that means that we include a little bit more than the visible viewport inside the context. ### Restrict URLs diff --git a/docs/customize/sensitive-data.mdx b/docs/customize/sensitive-data.mdx index 5b012d0dd..3bb5f6c66 100644 --- a/docs/customize/sensitive-data.mdx +++ b/docs/customize/sensitive-data.mdx @@ -43,7 +43,7 @@ if __name__ == '__main__': In this example: 1. The model only sees `x_name` and `x_password` as placeholders. 2. When the model wants to use your password it outputs x_password - and we replace it with the actual value. -3. When your password is visable on the current page, we replace it in the LLM input - so that the model never has it in its state. +3. When your password is visible on the current page, we replace it in the LLM input - so that the model never has it in its state. Warning: Vision models still see the image of the page - where the sensitive data might be visible. diff --git a/examples/custom-functions/action_filters.py b/examples/custom-functions/action_filters.py index 7976ab9d3..ef7c60280 100644 --- a/examples/custom-functions/action_filters.py +++ b/examples/custom-functions/action_filters.py @@ -64,7 +64,7 @@ async def main(): llm = ChatOpenAI(model_name='gpt-4o') # Create the agent - agent = Agent( # disco mode will not be triggered on apple.com because the LLM wont be able to see that action available, it should work on Google.com though. + agent = Agent( # disco mode will not be triggered on apple.com because the LLM won't be able to see that action available, it should work on Google.com though. task=""" Go to apple.com and trigger disco mode (if dont know how to do that, then just move on). Then go to google.com and trigger disco mode. 
diff --git a/examples/custom-functions/notification.py b/examples/custom-functions/notification.py index 3f73b9640..971697cc2 100644 --- a/examples/custom-functions/notification.py +++ b/examples/custom-functions/notification.py @@ -21,7 +21,7 @@ async def done(text: str): # To send emails use # STEP 1: go to https://support.google.com/accounts/answer/185833 - # STEP 2: Create an app password (you cant use here your normal gmail password) + # STEP 2: Create an app password (you can't use here your normal gmail password) # STEP 3: Use the app password in the code below for the password yag = yagmail.SMTP('your_email@gmail.com', 'your_app_password') yag.send( diff --git a/examples/features/restrict_urls.py b/examples/features/restrict_urls.py index 2f4ce0952..f481277e0 100644 --- a/examples/features/restrict_urls.py +++ b/examples/features/restrict_urls.py @@ -13,7 +13,7 @@ from browser_use.browser.browser import Browser, BrowserConfig llm = ChatOpenAI(model='gpt-4o', temperature=0.0) task = ( - 'go to google.com and search for openai.com and click on the first link then extract content and scroll down - whats there?' + "go to google.com and search for openai.com and click on the first link then extract content and scroll down - what's there?" ) allowed_domains = ['google.com'] diff --git a/examples/features/validate_output.py b/examples/features/validate_output.py index afe96083e..332c4fdda 100644 --- a/examples/features/validate_output.py +++ b/examples/features/validate_output.py @@ -1,5 +1,5 @@ """ -Demostrate output validator. +Demonstrate output validator. @dev You need to add OPENAI_API_KEY to your environment variables. 
""" diff --git a/examples/integrations/discord/discord_example.py b/examples/integrations/discord/discord_example.py index 259e68cc6..c7435d854 100644 --- a/examples/integrations/discord/discord_example.py +++ b/examples/integrations/discord/discord_example.py @@ -29,7 +29,7 @@ Five Steps to create and invite a Discord bot: * Click “Authorize”. --> Note: The person adding the bot needs "Manage Server" permissions. 6. Run the code below to start the bot with your bot token. -7. Write e.g. "/bu whats the weather in Tokyo?" to start a browser-use task and get a response inside the Discord channel. +7. Write e.g. "/bu what's the weather in Tokyo?" to start a browser-use task and get a response inside the Discord channel. """ import os diff --git a/examples/integrations/slack/README.md b/examples/integrations/slack/README.md index deea9dba4..3184dbc14 100644 --- a/examples/integrations/slack/README.md +++ b/examples/integrations/slack/README.md @@ -38,14 +38,14 @@ Steps to create and configure a Slack bot: 6. Invite the bot to a channel: * Use the `/invite @your-bot-name` command in the Slack channel where you want the bot to be active. 7. Run the code in `examples/slack_example.py` to start the bot with your bot token and signing secret. -8. Write e.g. "$bu whats the weather in Tokyo?" to start a browser-use task and get a response inside the Slack channel. +8. Write e.g. "$bu what's the weather in Tokyo?" to start a browser-use task and get a response inside the Slack channel. ## Installing and Starting ngrok To expose your local server to the internet, you can use ngrok. Follow these steps to install and start ngrok: 1. Download ngrok from the official website: https://ngrok.com/download -2. Create a free account and follow the offical steps to install ngrok. +2. Create a free account and follow the official steps to install ngrok. 3. 
Start ngrok by running the following command in your terminal: ```sh ngrok http 3000 diff --git a/examples/use-cases/google_sheets.py b/examples/use-cases/google_sheets.py index 73a2a2d87..5602c8122 100644 --- a/examples/use-cases/google_sheets.py +++ b/examples/use-cases/google_sheets.py @@ -102,7 +102,7 @@ async def input_selected_cell_text(browser: BrowserContext, text: str): page = await browser.get_current_page() await page.keyboard.type(text, delay=0.1) - await page.keyboard.press('Enter') # make sure to commit the input so it doesnt get overwritten by the next action + await page.keyboard.press('Enter') # make sure to commit the input so it doesn't get overwritten by the next action await page.keyboard.press('ArrowUp') return ActionResult(extracted_content=f'Inputted text {text}', include_in_memory=False) @@ -128,7 +128,7 @@ async def update_range_contents(browser: BrowserContext, range: str, new_content # - https://github.com/philc/sheetkeys/blob/master/content_scripts/commands.js # - https://support.google.com/docs/answer/181110?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cmac-shortcuts -# Tip: LLM is bad at spacial reasoning, don't make it navigate with arrow keys relative to current cell +# Tip: LLM is bad at spatial reasoning, don't make it navigate with arrow keys relative to current cell # if given arrow keys, it will try to jump from G1 to A2 by pressing Down, without realizing needs to go Down+LeftLeftLeftLeft diff --git a/examples/use-cases/shopping.py b/examples/use-cases/shopping.py index 4ba67ac99..cf6e80bef 100644 --- a/examples/use-cases/shopping.py +++ b/examples/use-cases/shopping.py @@ -77,7 +77,7 @@ At this stage, check the basket on the top right (indicates the price) and check - If the total order **is below CHF 99**, add **a liquid soap refill** to reach the minimum. If it;s still you can buy some bread, dark chockolate. - At this step, check if you have bought MORE items than needed. 
If the price is more then CHF200, you MUST remove items. - If an item is not available, choose an alternative. -- if an age verification is needed, remove alchoholic products, we haven't verified yet. +- if an age verification is needed, remove alcoholic products, we haven't verified yet. --- diff --git a/pyproject.toml b/pyproject.toml index 7030dfdb7..a2add2367 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,10 @@ dependencies = [ urls = { "Repository" = "https://github.com/browser-use/browser-use" } +[tool.codespell] +ignore-words-list = "bu" +skip = "*.json" + [tool.ruff] line-length = 130 fix = true diff --git a/tests/test_vision.py b/tests/test_vision.py index 91c01b667..9851b1a6f 100644 --- a/tests/test_vision.py +++ b/tests/test_vision.py @@ -23,7 +23,7 @@ controller = Controller() # use this test to ask the model questions about the page like # which color do you see for bbox labels, list all with their label -# whats the smallest bboxes with labels and +# what's the smallest bboxes with labels and @controller.registry.action(description='explain what you see on the screen and ask user for input') @@ -40,7 +40,7 @@ async def done(text: str) -> str: agent = Agent( - task='call explain_screen all the time the user asks you questions e.g. about the page like bbox which you see are labels - your task is to expalin it and get the next question', + task='call explain_screen all the time the user asks you questions e.g. 
about the page like bbox which you see are labels - your task is to explain it and get the next question', llm=llm, controller=controller, browser=Browser(config=BrowserConfig(disable_security=True, headless=False)), From 7604beab16d2c21719346f390b5e8447b3da223f Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Mon, 31 Mar 2025 07:50:12 +0200 Subject: [PATCH 5/9] uv add --dev codespell --- browser_use/utils.py | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/browser_use/utils.py b/browser_use/utils.py index a2acfe083..bd3bad1fb 100644 --- a/browser_use/utils.py +++ b/browser_use/utils.py @@ -93,7 +93,7 @@ class SignalHandler: except Exception: # there are situations where signal handlers are not supported, e.g. # - when running in a thread other than the main thread - # - some opearating systems + # - some operating systems # - inside jupyter notebooks pass diff --git a/pyproject.toml b/pyproject.toml index a2add2367..2e9bf2962 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,4 +91,5 @@ dev-dependencies = [ "langchain-fireworks>=0.2.6", "ipdb>=0.13.13", "pre-commit>=4.2.0", + "codespell>=2.4.1", ] From 6da643cff1826d31065c2a0ff8fbf38f989ec362 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 31 Mar 2025 14:34:41 -0400 Subject: [PATCH 6/9] Update browser_use/browser/context.py --- browser_use/browser/context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/browser/context.py b/browser_use/browser/context.py index ec69bffd8..7646ca759 100644 --- a/browser_use/browser/context.py +++ b/browser_use/browser/context.py @@ -107,7 +107,7 @@ class BrowserContextConfig(BaseModel): Include dynamic attributes in the CSS selector. If you want to reuse the css_selectors, it might be better to set this to False. http_credentials: None - Dictionary with HTTP authentication credentials, e.g. 
+ Dictionary with HTTP basic authentication credentials for corporate intranets (only supports one set of credentials for all URLs at the moment), e.g. {"username": "bill", "password": "pa55w0rd"} is_mobile: None From ae4a423e3ddf02d73b7bd5ee7408e0e8c570d3bb Mon Sep 17 00:00:00 2001 From: PaperBoardOfficial Date: Tue, 1 Apr 2025 08:14:15 +0530 Subject: [PATCH 7/9] lint --- browser_use/browser/context.py | 6 +- browser_use/controller/service.py | 148 +++++++++++++++--------------- browser_use/controller/views.py | 40 ++++---- examples/features/drag_drop.py | 1 - 4 files changed, 100 insertions(+), 95 deletions(-) diff --git a/browser_use/browser/context.py b/browser_use/browser/context.py index 697906596..70caddafe 100644 --- a/browser_use/browser/context.py +++ b/browser_use/browser/context.py @@ -105,10 +105,10 @@ class BrowserContextConfig(BaseModel): include_dynamic_attributes: bool = True Include dynamic attributes in the CSS selector. If you want to reuse the css_selectors, it might be better to set this to False. - + http_credentials: None - Dictionary with HTTP basic authentication credentials for corporate intranets (only supports one set of credentials for all URLs at the moment), e.g. - {"username": "bill", "password": "pa55w0rd"} + Dictionary with HTTP basic authentication credentials for corporate intranets (only supports one set of credentials for all URLs at the moment), e.g. + {"username": "bill", "password": "pa55w0rd"} is_mobile: None Whether the meta viewport tag is taken into account and touch events are enabled. 
diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py index efe243f09..2ccabe496 100644 --- a/browser_use/controller/service.py +++ b/browser_use/controller/service.py @@ -8,6 +8,7 @@ from typing import Dict, Generic, Optional, Tuple, Type, TypeVar, cast from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.prompts import PromptTemplate +from playwright.async_api import ElementHandle, Page # from lmnr.sdk.laminar import Laminar from pydantic import BaseModel @@ -35,7 +36,6 @@ from browser_use.controller.views import ( WaitForElementAction, ) from browser_use.utils import time_execution_sync -from playwright.async_api import Page, ElementHandle logger = logging.getLogger(__name__) @@ -637,52 +637,53 @@ class Controller(Generic[Context]): async def drag_drop(params: DragDropAction, browser: BrowserContext) -> ActionResult: """ Performs a precise drag and drop operation between elements or coordinates. - """ + """ + async def get_drag_elements( - page: Page, - source_selector: str, - target_selector: str, + page: Page, + source_selector: str, + target_selector: str, ) -> Tuple[Optional[ElementHandle], Optional[ElementHandle]]: """Get source and target elements with appropriate error handling.""" source_element = None target_element = None - + try: # page.locator() auto-detects CSS and XPath source_locator = page.locator(source_selector) target_locator = page.locator(target_selector) - + # Check if elements exist source_count = await source_locator.count() target_count = await target_locator.count() - + if source_count > 0: source_element = await source_locator.first.element_handle() - logger.debug(f"Found source element with selector: {source_selector}") + logger.debug(f'Found source element with selector: {source_selector}') else: - logger.warning(f"Source element not found: {source_selector}") - + logger.warning(f'Source element not found: {source_selector}') + if target_count > 0: target_element = 
await target_locator.first.element_handle() - logger.debug(f"Found target element with selector: {target_selector}") + logger.debug(f'Found target element with selector: {target_selector}') else: - logger.warning(f"Target element not found: {target_selector}") - + logger.warning(f'Target element not found: {target_selector}') + except Exception as e: - logger.error(f"Error finding elements: {str(e)}") - + logger.error(f'Error finding elements: {str(e)}') + return source_element, target_element async def get_element_coordinates( source_element: ElementHandle, target_element: ElementHandle, source_position: Optional[Position], - target_position: Optional[Position] + target_position: Optional[Position], ) -> Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]: """Get coordinates from elements with appropriate error handling.""" source_coords = None target_coords = None - + try: # Get source coordinates if source_position: @@ -691,10 +692,10 @@ class Controller(Generic[Context]): source_box = await source_element.bounding_box() if source_box: source_coords = ( - int(source_box["x"] + source_box["width"] / 2), - int(source_box["y"] + source_box["height"] / 2) + int(source_box['x'] + source_box['width'] / 2), + int(source_box['y'] + source_box['height'] / 2), ) - + # Get target coordinates if target_position: target_coords = (target_position.x, target_position.y) @@ -702,12 +703,12 @@ class Controller(Generic[Context]): target_box = await target_element.bounding_box() if target_box: target_coords = ( - int(target_box["x"] + target_box["width"] / 2), - int(target_box["y"] + target_box["height"] / 2) + int(target_box['x'] + target_box['width'] / 2), + int(target_box['y'] + target_box['height'] / 2), ) except Exception as e: - logger.error(f"Error getting element coordinates: {str(e)}") - + logger.error(f'Error getting element coordinates: {str(e)}') + return source_coords, target_coords async def execute_drag_operation( @@ -724,96 +725,96 @@ class 
Controller(Generic[Context]): # Try to move to source position try: await page.mouse.move(source_x, source_y) - logger.debug(f"Moved to source position ({source_x}, {source_y})") + logger.debug(f'Moved to source position ({source_x}, {source_y})') except Exception as e: - logger.error(f"Failed to move to source position: {str(e)}") - return False, f"Failed to move to source position: {str(e)}" - + logger.error(f'Failed to move to source position: {str(e)}') + return False, f'Failed to move to source position: {str(e)}' + # Press mouse button down await page.mouse.down() - + # Move to target position with intermediate steps for i in range(1, steps + 1): ratio = i / steps intermediate_x = int(source_x + (target_x - source_x) * ratio) intermediate_y = int(source_y + (target_y - source_y) * ratio) - + await page.mouse.move(intermediate_x, intermediate_y) - + if delay_ms > 0: await asyncio.sleep(delay_ms / 1000) - + # Move to final target position await page.mouse.move(target_x, target_y) - + # Move again to ensure dragover events are properly triggered await page.mouse.move(target_x, target_y) - + # Release mouse button await page.mouse.up() - - return True, "Drag operation completed successfully" - - except Exception as e: - return False, f"Error during drag operation: {str(e)}" - page = await browser.get_current_page() - + return True, 'Drag operation completed successfully' + + except Exception as e: + return False, f'Error during drag operation: {str(e)}' + + page = await browser.get_current_page() + try: # Initialize variables source_x: Optional[int] = None source_y: Optional[int] = None target_x: Optional[int] = None target_y: Optional[int] = None - + # Normalize parameters steps = max(1, params.steps or 10) delay_ms = max(0, params.delay_ms or 5) - + # Case 1: Element selectors provided if params.element_source and params.element_target: - logger.debug(f"Using element-based approach with selectors") - + logger.debug('Using element-based approach with selectors') 
+ source_element, target_element = await get_drag_elements( - page, - params.element_source, - params.element_target, + page, + params.element_source, + params.element_target, ) - + if not source_element or not target_element: - error_msg = f"Failed to find {'source' if not source_element else 'target'} element" + error_msg = f'Failed to find {"source" if not source_element else "target"} element' return ActionResult(error=error_msg, include_in_memory=True) - + source_coords, target_coords = await get_element_coordinates( - source_element, - target_element, - params.element_source_offset, - params.element_target_offset + source_element, target_element, params.element_source_offset, params.element_target_offset ) - + if not source_coords or not target_coords: - error_msg = f"Failed to determine {'source' if not source_coords else 'target'} coordinates" + error_msg = f'Failed to determine {"source" if not source_coords else "target"} coordinates' return ActionResult(error=error_msg, include_in_memory=True) - + source_x, source_y = source_coords target_x, target_y = target_coords - + # Case 2: Coordinates provided directly - elif all(coord is not None for coord in [params.coord_source_x, params.coord_source_y, params.coord_target_x, params.coord_target_y]): - logger.debug(f"Using coordinate-based approach") + elif all( + coord is not None + for coord in [params.coord_source_x, params.coord_source_y, params.coord_target_x, params.coord_target_y] + ): + logger.debug('Using coordinate-based approach') source_x = params.coord_source_x source_y = params.coord_source_y target_x = params.coord_target_x target_y = params.coord_target_y else: - error_msg = "Must provide either source/target selectors or source/target coordinates" + error_msg = 'Must provide either source/target selectors or source/target coordinates' return ActionResult(error=error_msg, include_in_memory=True) - + # Validate coordinates if any(coord is None for coord in [source_x, source_y, target_x, 
target_y]): - error_msg = "Failed to determine source or target coordinates" + error_msg = 'Failed to determine source or target coordinates' return ActionResult(error=error_msg, include_in_memory=True) - + # Perform the drag operation success, message = await execute_drag_operation( page, @@ -824,26 +825,25 @@ class Controller(Generic[Context]): steps, delay_ms, ) - + if not success: - logger.error(f"Drag operation failed: {message}") + logger.error(f'Drag operation failed: {message}') return ActionResult(error=message, include_in_memory=True) - + # Create descriptive message if params.element_source and params.element_target: msg = f"🖱️ Dragged element '{params.element_source}' to '{params.element_target}'" else: - msg = f"🖱️ Dragged from ({source_x}, {source_y}) to ({target_x}, {target_y})" - + msg = f'🖱️ Dragged from ({source_x}, {source_y}) to ({target_x}, {target_y})' + logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) - + except Exception as e: - error_msg = f"Failed to perform drag and drop: {str(e)}" + error_msg = f'Failed to perform drag and drop: {str(e)}' logger.error(error_msg) return ActionResult(error=error_msg, include_in_memory=True) - # Register --------------------------------------------------------------- def action(self, description: str, **kwargs): diff --git a/browser_use/controller/views.py b/browser_use/controller/views.py index 43a15959c..e1a4fa49c 100644 --- a/browser_use/controller/views.py +++ b/browser_use/controller/views.py @@ -97,23 +97,29 @@ class NoParamsAction(BaseModel): # No matter what the user sends, discard it and return empty. 
return {} + class Position(BaseModel): - x: int - y: int + x: int + y: int + class DragDropAction(BaseModel): - # Element-based approach - element_source: Optional[str] = Field(None, description="CSS selector or XPath of the element to drag from") - element_target: Optional[str] = Field(None, description="CSS selector or XPath of the element to drop onto") - element_source_offset: Optional[Position] = Field(None, description="Precise position within the source element to start drag (in pixels from top-left corner)") - element_target_offset: Optional[Position] = Field(None, description="Precise position within the target element to drop (in pixels from top-left corner)") - - # Coordinate-based approach (used if selectors not provided) - coord_source_x: Optional[int] = Field(None, description="Absolute X coordinate on page to start drag from (in pixels)") - coord_source_y: Optional[int] = Field(None, description="Absolute Y coordinate on page to start drag from (in pixels)") - coord_target_x: Optional[int] = Field(None, description="Absolute X coordinate on page to drop at (in pixels)") - coord_target_y: Optional[int] = Field(None, description="Absolute Y coordinate on page to drop at (in pixels)") - - # Common options - steps: Optional[int] = Field(10, description="Number of intermediate points for smoother movement (5-20 recommended)") - delay_ms: Optional[int] = Field(5, description="Delay in milliseconds between steps (0 for fastest, 10-20 for more natural)") + # Element-based approach + element_source: Optional[str] = Field(None, description='CSS selector or XPath of the element to drag from') + element_target: Optional[str] = Field(None, description='CSS selector or XPath of the element to drop onto') + element_source_offset: Optional[Position] = Field( + None, description='Precise position within the source element to start drag (in pixels from top-left corner)' + ) + element_target_offset: Optional[Position] = Field( + None, description='Precise position 
within the target element to drop (in pixels from top-left corner)' + ) + + # Coordinate-based approach (used if selectors not provided) + coord_source_x: Optional[int] = Field(None, description='Absolute X coordinate on page to start drag from (in pixels)') + coord_source_y: Optional[int] = Field(None, description='Absolute Y coordinate on page to start drag from (in pixels)') + coord_target_x: Optional[int] = Field(None, description='Absolute X coordinate on page to drop at (in pixels)') + coord_target_y: Optional[int] = Field(None, description='Absolute Y coordinate on page to drop at (in pixels)') + + # Common options + steps: Optional[int] = Field(10, description='Number of intermediate points for smoother movement (5-20 recommended)') + delay_ms: Optional[int] = Field(5, description='Delay in milliseconds between steps (0 for fastest, 10-20 for more natural)') diff --git a/examples/features/drag_drop.py b/examples/features/drag_drop.py index 074bbfa88..797664943 100644 --- a/examples/features/drag_drop.py +++ b/examples/features/drag_drop.py @@ -7,7 +7,6 @@ from pydantic import SecretStr from browser_use import Agent - load_dotenv() api_key = os.getenv('GEMINI_API_KEY') if not api_key: From 64030015a9c2cf1b907821e8d0fef7dc80179721 Mon Sep 17 00:00:00 2001 From: pppp606 Date: Tue, 1 Apr 2025 17:33:26 +0900 Subject: [PATCH 8/9] Add explanations for override_system_message and extend_system_message --- docs/customize/agent-settings.mdx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/customize/agent-settings.mdx b/docs/customize/agent-settings.mdx index 00bde30e2..501415c22 100644 --- a/docs/customize/agent-settings.mdx +++ b/docs/customize/agent-settings.mdx @@ -47,7 +47,8 @@ agent = Agent( - Disable to reduce costs or use models without vision support - For GPT-4o, image processing costs approximately 800-1000 tokens (~$0.002 USD) per image (but this depends on the defined screen size) - `save_conversation_path`: Path to save the
complete conversation history. Useful for debugging. -- `system_prompt_class`: Custom system prompt class. See System Prompt for customization options. +- `override_system_message`: Completely replace the default system prompt with a custom one. +- `extend_system_message`: Add additional instructions to the default system prompt. Vision capabilities are recommended for better web interaction understanding, @@ -183,7 +184,7 @@ agent = Agent( You can configure the agent and provide a separate message to help the LLM understand the task better. -```python +```python from langchain_openai import ChatOpenAI agent = Agent( From 3a7490d4f00ecc08f32cb7480ea216541a373068 Mon Sep 17 00:00:00 2001 From: Oswy <74738120+oswy-cpu@users.noreply.github.com> Date: Tue, 1 Apr 2025 19:58:16 +0300 Subject: [PATCH 9/9] Update custom-functions.mdx .get_current_page() is an async function which has to be awaited. --- docs/customize/custom-functions.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/customize/custom-functions.mdx b/docs/customize/custom-functions.mdx index 082cb2774..282020449 100644 --- a/docs/customize/custom-functions.mdx +++ b/docs/customize/custom-functions.mdx @@ -55,7 +55,7 @@ from browser_use import Browser, Controller, ActionResult controller = Controller() @controller.action('Open website') async def open_website(url: str, browser: Browser): - page = browser.get_current_page() + page = await browser.get_current_page() await page.goto(url) return ActionResult(extracted_content='Website opened') ```