diff --git a/.cursor/rules/browser-use-rules.mdc b/.cursor/rules/browser-use-rules.mdc new file mode 100644 index 000000000..a4f466525 --- /dev/null +++ b/.cursor/rules/browser-use-rules.mdc @@ -0,0 +1,83 @@ +--- +description: +globs: +alwaysApply: true +--- +## ๐Ÿง  General Guidelines for Contributing to `browser-use` + +**Browser-Use** is an AI agent that autonomously interacts with the web. It takes a user-defined task, navigates web pages using Chromium via Playwright, processes HTML, and repeatedly queries a language model (like `gpt-4o`) to decide the next actionโ€”until the task is completed. + +### ๐Ÿ—‚๏ธ File Documentation + +When you create a **new file**: + +* **For humans**: At the top of the file, include a docstring in natural language explaining: + + * What this file does. + * How it fits into the browser-use system. + * If it introduces a new abstraction or replaces an old one. +* **For LLMs/AI**: Include structured metadata using standardized comments such as: + + ```python + # @file purpose: Defines + ``` + +--- + +### ๐Ÿงฐ Development Rules + +* โœ… **Always use [`uv`](mdc:https:/github.com/astral-sh/uv) instead of `pip`** + For deterministic and fast dependency installs. + +```bash +uv venv --python 3.11 +source .venv/bin/activate +uv sync +``` + +* โœ… **Use real model names** + Do **not** replace `gpt-4o` with `gpt-4`. The model `gpt-4o` is a distinct release and supported. + +* โœ… **Type-safe coding** + Use **Pydantic models** for all internal action schemas, task inputs/outputs, and controller I/O. This ensures robust validation and LLM-call integrity. 
+ +--- + +## โš™๏ธ Adding New Actions + +To add a new action that your browser agent can execute: + +```python +from browser_use.core.controller import Controller, ActionResult + +controller = Controller() + +@controller.registry.action("Search the web for a specific query") +async def search_web(query: str): + # Implement your logic here, e.g., query a search engine and return results + result = ... + return ActionResult(extracted_content=result, include_in_memory=True) +``` + +### Notes: + +* Use descriptive names and docstrings for each action. +* Prefer returning `ActionResult` with structured content to help the agent reason better. + +--- + +## ๐Ÿง  Creating and Running an Agent + +To define a task and run a browser-use agent: + +```python +from browser_use.core.agent import Agent +from langchain.chat_models import ChatOpenAI + +task = "Find the CEO of OpenAI and return their name" +model = ChatOpenAI(model="gpt-4o") + +agent = Agent(task=task, llm=model, controller=controller) + +history = await agent.run() +``` diff --git a/.github/ISSUE_TEMPLATE/1_element_detection_bug.yml b/.github/ISSUE_TEMPLATE/1_element_detection_bug.yml index 891a867e7..8ddc019ba 100644 --- a/.github/ISSUE_TEMPLATE/1_element_detection_bug.yml +++ b/.github/ISSUE_TEMPLATE/1_element_detection_bug.yml @@ -1,6 +1,9 @@ -name: ๐ŸŽฏ Agent Page Interaction Issue +name: ๐ŸŽฏ AI Agent โœš Page Interaction Issue description: Agent fails to detect, click, scroll, input, or otherwise interact with some type of element on some page(s) labels: ["bug", "element-detection"] +title: "Interaction Issue: ..." +assignees: + - pirate body: - type: markdown attributes: @@ -11,7 +14,7 @@ body: id: version attributes: label: Browser Use Version - description: What version of the `browser-use` library are you using? (Run `uv pip show browser-use` or `git log -n 1` to find out) **DO NOT JUST WRITE `latest version` or `main`** + description: What version of `browser-use` are you using? 
(Run `uv pip show browser-use` or `git log -n 1`) **DO NOT JUST WRITE `latest release` or `main`** placeholder: "e.g. 0.4.45 or 62760baaefd" validations: required: true @@ -45,29 +48,32 @@ body: - type: textarea id: prompt attributes: - label: Screenshots, Description, and Task Prompt Given to Agent - description: The full task prompt you're giving the agent (redact any sensitive data) + a description of the issue and screenshots. + label: Screenshots, Description, and task prompt given to Agent + description: | + A description of the issue + screenshots, and the full task prompt you're giving the agent (redact sensitive data). + To help us fix it even faster, screenshot the Chome devtools [`Computed Styles` pane](https://developer.chrome.com/docs/devtools/css/reference#computed) for each failing element. placeholder: | - 1. go to https://example.com and click the xyz button... - 2. type "abc" in the dropdown search to find the "abc" option <- agent fails to click dropdown here - 3. Click the "Submit" button, then extract the result as JSON - ... - include relevant URLs and/or redacted screenshots of the relevant page(s) if possible + ๐ŸŽฏ High-level goal: Compare the prices of 3 items on a few different seller pages + ๐Ÿ’ฌ Agent(task=''' + 1. go to https://example.com and click the "xyz" dropdown + 2. type "abc" into search then select the "abc" option <- โŒ agent fails to select this option + 3. ... + โ˜๏ธ please include real URLs ๐Ÿ”— and screenshots ๐Ÿ“ธ when possible! validations: required: true - type: textarea id: html attributes: - label: HTML around where it's failing + label: "HTML around where it's failing" description: A snippet of the HTML from the failing page around where the Agent is failing to interact. render: html placeholder: | -
+
Click me
- + ...
validations: @@ -76,11 +82,11 @@ body: - type: input id: os attributes: - label: Operating System - description: What operating system are you using? - placeholder: "e.g., macOS 13.1, Windows 11, Ubuntu 22.04" + label: Operating System & Browser Versions + description: What operating system and browser are you using? + placeholder: "e.g. Ubuntu 24.04 + playwright chromium v136, Windows 11 + Chrome.exe v133, macOS ..." validations: - required: true + required: false - type: textarea id: code @@ -90,13 +96,15 @@ body: render: python placeholder: | from dotenv import load_dotenv - load_dotenv() + load_dotenv() # tip: always load_dotenv() before other imports from browser_use import Agent, BrowserSession, Controller from langchain_openai import ChatOpenAI - llm = ChatOpenAI(model="gpt-4o") - browser_session = BrowserSession(executable_path='/usr/bin/google-chrome') - agent = Agent(llm=llm, browser_session=browser_session) + agent = Agent( + task='...', + llm=ChatOpenAI(model="gpt-4o"), + browser_session=BrowserSession(headless=False), + ) ... - type: textarea @@ -114,3 +122,15 @@ body: DEBUG [langsmith.client] Sending multipart request with context: trace=91282a01-6667-48a1-8cd7-21aa9337a580,id=91282a01-6667-48a1-8cd7-21aa9337a580 DEBUG [agent] ๐Ÿชช LLM API keys OPENAI_API_KEY work, ChatOpenAI model is connected & responding correctly. ... + + - type: markdown + attributes: + value: | + --- + > [!IMPORTANT] + > ๐Ÿ™ Please **go check *right now before submitting* that that you are on the [โฌ†๏ธ LATEST VERSION](https://github.com/browser-use/browser-use/releases)**. + > ๐Ÿš€ We ship new agent and element detection improvements every day and we might've already fixed your issue! 
+ > + > If you are running an old version, the **first thing we will ask you to do is *try the latest `beta`***: + > - ๐Ÿ†• [`beta`](https://docs.browser-use.com/development/local-setup): `uv pip install --upgrade git+https://github.com/browser-use/browser-use.git@main` + > - ๐Ÿ“ฆ [`stable`](https://pypi.org/project/browser-use/#history): `uv pip install --upgrade browser-use` diff --git a/.github/ISSUE_TEMPLATE/2_bug_report.yml b/.github/ISSUE_TEMPLATE/2_bug_report.yml index 3488d4e1b..376cd7d36 100644 --- a/.github/ISSUE_TEMPLATE/2_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/2_bug_report.yml @@ -1,6 +1,9 @@ -name: ๐Ÿ› Library Bug Report +name: ๐Ÿ‘พ Library Bug Report description: Report a bug in the browser-use Python library labels: ["bug", "triage"] +title: "Bug: ..." +assignees: + - pirate body: # - type: markdown # attributes: @@ -11,7 +14,7 @@ body: id: version attributes: label: Browser Use Version - description: What version of the `browser-use` library are you using? (Run `uv pip show browser-use` or `git log -n 1` to find out) **DO NOT JUST WRITE `latest version` or `main`** + description: What version of `browser-use` are you using? (Run `uv pip show browser-use` or `git log -n 1`) **DO NOT JUST WRITE `latest` or `main`** placeholder: "e.g. 0.4.45 or 62760baaefd" validations: required: true @@ -37,13 +40,15 @@ body: render: python placeholder: | from dotenv import load_dotenv - load_dotenv() + load_dotenv() # tip: always load_dotenv() before other imports from browser_use import Agent, BrowserSession, Controller from langchain_openai import ChatOpenAI - llm = ChatOpenAI(model="gpt-4o") - browser_session = BrowserSession(executable_path='/usr/bin/google-chrome') - agent = Agent(llm=llm, browser_session=browser_session) + agent = Agent( + task='...', + llm=ChatOpenAI(model="gpt-4o"), + browser_session=BrowserSession(headless=False), + ) ... 
- type: dropdown @@ -75,9 +80,9 @@ body: - type: input id: os attributes: - label: Operating System - description: What operating system are you using? - placeholder: "e.g., macOS 13.1, Windows 11, Ubuntu 22.04" + label: Operating System & Browser Versions + description: What operating system and browser are you using? + placeholder: "e.g. Ubuntu 24.04 + playwright chromium v136, Windows 11 + Chrome.exe v133, macOS ..." validations: required: true @@ -96,3 +101,15 @@ body: DEBUG [langsmith.client] Sending multipart request with context: trace=91282a01-6667-48a1-8cd7-21aa9337a580,id=91282a01-6667-48a1-8cd7-21aa9337a580 DEBUG [agent] ๐Ÿชช LLM API keys OPENAI_API_KEY work, ChatOpenAI model is connected & responding correctly. ... + + - type: markdown + attributes: + value: | + --- + > [!IMPORTANT] + > ๐Ÿ™ Please **go check *right now before submitting* that that you are on the [โฌ†๏ธ LATEST VERSION](https://github.com/browser-use/browser-use/releases)**. + > ๐Ÿš€ We ship changes every day and we might've already fixed your issue yesterday! + > + > If you are running an old version, the **first thing we will ask you to do is *try the latest `beta`***: + > - ๐Ÿ†• [`beta`](https://docs.browser-use.com/development/local-setup): `uv pip install --upgrade git+https://github.com/browser-use/browser-use.git@main` + > - ๐Ÿ“ฆ [`stable`](https://pypi.org/project/browser-use/#history): `uv pip install --upgrade browser-use` diff --git a/.github/ISSUE_TEMPLATE/3_feature_request.yml b/.github/ISSUE_TEMPLATE/3_feature_request.yml index 77127b024..f26793237 100644 --- a/.github/ISSUE_TEMPLATE/3_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_feature_request.yml @@ -1,10 +1,10 @@ -name: ๐Ÿ’ก Feature or enhancement request +name: ๐Ÿ’ก New Feature or Enhancement Request description: Suggest an idea or improvement for the browser-use library or Agent capabilities title: "Feature Request: ..." 
assignees: - pirate type: 'Enhancement' -labels: 'enhancement' +labels: ['enhancement'] body: - type: textarea id: current_problem @@ -24,9 +24,9 @@ body: description: | Describe the ideal specific solution you'd want, *and whether it fits into any broader scope of changes*. placeholder: | - e.g. I want to add a default controller action that can hover/drag the mouse on a path when given a series - of x,y coordinates. More broadly it may be useful add a computer-use x,y-coordinate style automation - fallback method that can do complex mouse interaction tasks. + e.g. I want to add a default action that can hover/drag the mouse on a path when given a series + of x,y coordinates. More broadly it may be useful add a computer-use/x,y-coordinate-style automation + method fallback that can do complex mouse movements. validations: required: true @@ -48,10 +48,10 @@ body: attributes: label: What version of browser-use are you currently using? description: | - Run `pip show browser-use` or `git log -n 1` and share the exact number of git hash. DO NOT JUST ENTER "the latest release" OR "main". - We need to know what version of the browser-use library you're currently running in order to contextualize your feature request. - Sometimes we've already added your feature in a newer version, sometimes features already exist but may not be available in your specific environment. - placeholder: 0.1.48 + Run `pip show browser-use` or `git log -n 1` and share the exact number or git hash. DO NOT JUST ENTER `latest release` OR `main`. + We need to know what version of the browser-use library you're running in order to contextualize your feature request. + Sometimes features are already available and just need to be enabled with config on certain versions. + placeholder: "e.g. 
0.1.48 or 62760baaefd" validations: required: true @@ -59,8 +59,13 @@ body: attributes: value: | --- - > [!TIP] - > ๐Ÿš€ Please ***double-check you are on the [latest release](https://github.com/browser-use/browser-use/releases)***, we might've already shipped your feature! + > [!IMPORTANT] + > ๐Ÿ™ Please **go check *right now before submitting* that that you have tried the [โฌ†๏ธ LATEST VERSION](https://github.com/browser-use/browser-use/releases)**. + > ๐Ÿš€ We ship new features every day and we might've already added a solution to your need yesterday! + > + > If you are running an old version, the **first thing we will ask you to do is *try the latest `beta`***: + > - ๐Ÿ†• [`beta`](https://docs.browser-use.com/development/local-setup): `uv pip install --upgrade git+https://github.com/browser-use/browser-use.git@main` + > - ๐Ÿ“ฆ [`stable`](https://pypi.org/project/browser-use/#history): `uv pip install --upgrade browser-use` - type: checkboxes id: priority @@ -71,11 +76,11 @@ body: required: false - label: "It's important to add it in the near-mid term future" required: false - - label: "It would be nice to have eventually" + - label: "It would be nice to add it sometime in the next 2 years" required: false - - label: "I'm willing to [start a PR](https://docs.browser-use.com/development/contribution-guide) to develop this myself" + - label: "๐Ÿ’ช I'm willing to [start a PR](https://docs.browser-use.com/development/contribution-guide) to work on this myself" required: false - - label: "My company would spend >$5k/mo on Browser-Use Cloud if it solved this need completely for us" + - label: "๐Ÿ’ผ My company would spend >$5k on [Browser-Use Cloud](https://browser-use.com) if it solved this reliably for us" required: false - type: markdown @@ -83,8 +88,8 @@ body: value: | --- > [!TIP] - > Start discussions about your feature request in other places too, - > the more ๐Ÿ“ฃ hype we see around a request the more likely we are to add it! 
+ > Start conversations about your feature request in other places too, the more + > ๐Ÿ“ฃ hype we see around a request the more likely we are to add it! > - > - ๐Ÿ’ฌ Discord: [https://link.browser-use.com/discord](https://link.browser-use.com/discord) - > - ๐Ÿฆ‹ Twitter/X: [https://x.com/browser_use](https://x.com/browser_use) + > - ๐Ÿ‘พ Discord: [https://link.browser-use.com/discord](https://link.browser-use.com/discord) + > - ๐• Twitter: [https://x.com/browser_use](https://x.com/browser_use) diff --git a/.github/ISSUE_TEMPLATE/4_docs_issue.yml b/.github/ISSUE_TEMPLATE/4_docs_issue.yml index aa88e8071..bd9a9f43e 100644 --- a/.github/ISSUE_TEMPLATE/4_docs_issue.yml +++ b/.github/ISSUE_TEMPLATE/4_docs_issue.yml @@ -1,11 +1,12 @@ name: ๐Ÿ“š Documentation Issue description: Report an issue in the browser-use documentation labels: ["documentation"] +title: "Documentation: ..." body: - type: markdown attributes: value: | - Thanks for taking the time to improve our documentation! Please fill out the form below to help us understand the issue. + Thanks for taking the time to improve our documentation! Please fill out the form below to help us fix the issue quickly. - type: dropdown id: type @@ -26,7 +27,7 @@ body: attributes: label: Documentation Page description: Which page or section of the documentation is this about? - placeholder: "e.g., https://docs.browser-use.com/getting-started or Installation Guide" + placeholder: "e.g. https://docs.browser-use.com/customize/browser-settings > Context Configuration > headless" validations: required: true @@ -34,8 +35,8 @@ body: id: description attributes: label: Issue Description - description: Describe what's wrong or missing in the documentation - placeholder: The documentation should... + description: "Describe what's wrong or missing in the documentation" + placeholder: e.g. Docs should clarify whether BrowserSession(no_viewport=False) is supported when running in BrowserSession(headless=False) mode... 
validations: required: true @@ -45,11 +46,10 @@ body: label: Suggested Changes description: If you have specific suggestions for how to improve the documentation, please share them placeholder: | - The documentation could be improved by... - - Example: - ```python - # Your suggested code example or text here + e.g. The documentation could be improved by adding one more line here: + ```diff + Use `BrowserSession(headless=False)` to open the browser window (aka headful mode). + + Viewports are not supported when headful, if `headless=False` it will force `no_viewport=True`. ``` validations: - required: true + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 16019e944..cab5af86d 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,9 +1,9 @@ blank_issues_enabled: false # Set to true if you want to allow blank issues contact_links: - - name: ๐Ÿค” Quickstart Guide + - name: ๐Ÿ”ข Quickstart Guide url: https://docs.browser-use.com/quickstart about: Most common issues can be resolved by following our quickstart guide - - name: ๐Ÿค” Questions and Help + - name: ๐Ÿ’ฌ Questions and Help url: https://link.browser-use.com/discord about: Please ask questions in our Discord community - name: ๐Ÿ“– Documentation diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 74bcd8ac3..8b9d42393 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,4 +1,6 @@ name: test +permissions: + contents: read on: push: @@ -12,42 +14,30 @@ on: workflow_dispatch: jobs: + find_tests: + runs-on: ubuntu-latest + outputs: + filename_list: ${{ steps.list_test_files.outputs.filename_list }} # ./tests/ci/test_controller.py, ./tests/ci/test_browser.py, etc. 
+ steps: + - uses: actions/checkout@v4 + - id: list_test_files + run: echo "::set-output name=filename_list::$(ls tests/ci/*.py | jq -R -s -c 'split("\n")[:-1]')" + # https://code.dblock.org/2021/09/03/generating-task-matrix-by-looping-over-repo-files-with-github-actions.html + tests: - name: ${{matrix.test}} + name: ${{matrix.test_filename}} runs-on: ubuntu-latest env: IN_DOCKER: 'True' strategy: matrix: - test: - # TODO: - # - browser/patchright - # - browser/playwright - # - browser/user_binary - # - browser/remote_cdp - # - models/openai - # - models/google - # - models/anthropic - # - models/azure - # - models/deepseek - # - models/grok - # - functionality/click - # - functionality/tabs - # - functionality/input - # - functionality/scroll - # - functionality/upload - # - functionality/download - # - functionality/save - # - functionality/vision - # - functionality/memory - # - functionality/planner - # - functionality/hooks - - test_browser - - test_controller - - test_browser_session - - test_tab_management - - test_sensitive_data - - test_url_allowlist_security + test_filename: ${{ fromJson(needs.find_tests.outputs.filename_list) }} + # autodiscovers all the files in tests/ci/test_*.py + # - test_browser + # - test_controller + # - test_browser_session + # - test_tab_management + # ... 
and more steps: - uses: actions/checkout@v4 - uses: astral-sh/setup-uv@v6 @@ -57,7 +47,7 @@ jobs: - run: uv sync - - name: Detect installed Playwright or Patchright version + - name: Detect installed Playwright version run: echo "PLAYWRIGHT_VERSION=$(uv pip list --format json | jq -r '.[] | select(.name == "playwright") | .version')" >> $GITHUB_ENV - name: Cache playwright binaries @@ -70,4 +60,4 @@ jobs: - run: playwright install chrome - run: playwright install chromium - - run: pytest tests/${{ matrix.test }}.py + - run: pytest tests/ci/${{ matrix.test_filename }}.py diff --git a/.gitignore b/.gitignore index 2136e926e..c26532221 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,5 @@ private_example.py private_example uv.lock +temp +tmp diff --git a/bin/lint.sh b/bin/lint.sh new file mode 100755 index 000000000..8a6029dbb --- /dev/null +++ b/bin/lint.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# This script is used to run the formatter, linter, and type checker pre-commit hooks. +# Usage: +# $ ./bin/lint.sh + +IFS=$'\n' + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +cd "$SCRIPT_DIR/.." || exit 1 + +exec uv run pre-commit run --all-files diff --git a/bin/setup.sh b/bin/setup.sh new file mode 100755 index 000000000..83512bbe7 --- /dev/null +++ b/bin/setup.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# This script is used to setup a local development environment for the browser-use project. 
+# Usage: +# $ ./bin/setup.sh + +### Bash Environment Setup +# http://redsymbol.net/articles/unofficial-bash-strict-mode/ +# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html +# set -o xtrace +# set -x +# shopt -s nullglob +set -o errexit +set -o errtrace +set -o nounset +set -o pipefail +IFS=$'\n' + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd "$SCRIPT_DIR" + + +if [ -f "$SCRIPT_DIR/lint.sh" ]; then + echo "[โˆš] already inside a cloned browser-use repo" +else + echo "[+] Cloning browser-use repo into current directory: $SCRIPT_DIR" + git clone https://github.com/browser-use/browser-use + cd browser-use +fi + +echo "[+] Installing uv..." +curl -LsSf https://astral.sh/uv/install.sh | sh + +#git checkout main git pull +echo +echo "[+] Setting up venv" +uv venv +echo +echo "[+] Installing packages in venv" +uv sync --dev --all-extras +echo +echo "[i] Tip: make sure to set BROWSER_USE_LOGGING_LEVEL=debug and your LLM API keys in your .env file" +echo +uv pip show browser-use + +echo "Usage:" +echo " $ browser-use use the CLI" +echo " or" +echo " $ source .venv/bin/activate" +echo " $ ipython use the library" +echo " >>> from browser_use import BrowserSession, Agent" +echo " >>> await Agent(task='book me a flight to fiji', browser=BrowserSession(headless=False)).run()" +echo "" diff --git a/bin/test.sh b/bin/test.sh new file mode 100755 index 000000000..4d2c33c15 --- /dev/null +++ b/bin/test.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# This script is used to run all the main project tests that run on CI via .github/workflows/test.yaml. +# Usage: +# $ ./bin/test.sh + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd "$SCRIPT_DIR/.." 
|| exit 1 + +exec uv run pytest tests/ci diff --git a/browser_use/__init__.py b/browser_use/__init__.py index 4a9f3400e..46612163d 100644 --- a/browser_use/__init__.py +++ b/browser_use/__init__.py @@ -1,11 +1,3 @@ -import warnings - -# Suppress specific deprecation warnings from FAISS -warnings.filterwarnings('ignore', category=DeprecationWarning, module='faiss.loader') -warnings.filterwarnings('ignore', message='builtin type SwigPyPacked has no __module__ attribute') -warnings.filterwarnings('ignore', message='builtin type SwigPyObject has no __module__ attribute') -warnings.filterwarnings('ignore', message='builtin type swigvarlink has no __module__ attribute') - from browser_use.logging_config import setup_logging setup_logging() diff --git a/browser_use/agent/memory/service.py b/browser_use/agent/memory/service.py index e1913a123..082ec1202 100644 --- a/browser_use/agent/memory/service.py +++ b/browser_use/agent/memory/service.py @@ -89,7 +89,7 @@ class Memory: Args: current_step: The current step number of the agent """ - logger.info(f'Creating procedural memory at step {current_step}') + logger.debug(f'Creating procedural memory at step {current_step}') # Get all messages all_messages = self.message_manager.state.history.messages @@ -108,7 +108,7 @@ class Memory: # Need at least 2 messages to create a meaningful summary if len(messages_to_process) <= 1: - logger.info('Not enough non-memory messages to summarize') + logger.debug('Not enough non-memory messages to summarize') return # Create a procedural memory memory_content = self._create([m.message for m in messages_to_process], current_step) diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index 37f198c54..999e6ba6b 100644 --- a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -1,6 +1,8 @@ from __future__ import annotations import logging +import re +import textwrap from langchain_core.messages import 
( AIMessage, @@ -26,7 +28,8 @@ class MessageManagerSettings(BaseModel): image_tokens: int = 800 include_attributes: list[str] = [] message_context: str | None = None - sensitive_data: dict[str, str] | None = None + # Support both old format {key: value} and new format {domain: {key: value}} + sensitive_data: dict[str, str | dict[str, str]] | None = None available_file_paths: list[str] | None = None @@ -180,18 +183,134 @@ class MessageManager: msg = AIMessage(content=plan) self._add_message_with_tokens(msg, position) + def _get_message_emoji(self, message_type: str) -> str: + """Get emoji for a message type""" + emoji_map = { + 'HumanMessage': '๐Ÿ’ฌ', + 'AIMessage': '๐Ÿง ', + 'ToolMessage': '๐Ÿ”จ', + } + return emoji_map.get(message_type, '๐ŸŽฎ') + + def _clean_whitespace(self, text: str) -> str: + """Replace all repeated whitespace with single space and strip""" + return re.sub(r'\s+', ' ', text).strip() + + def _truncate_text(self, text: str, max_length: int) -> str: + """Truncate text to max_length and add ellipsis if needed""" + if len(text) <= max_length: + return text + return text[:max_length] + '...' 
+ + def _extract_text_from_list_content(self, content: list) -> str: + """Extract text from list content structure""" + text_content = '' + for item in content: + if isinstance(item, dict) and 'text' in item: + text_content += item['text'] + return text_content + + def _format_agent_output_content(self, tool_call: dict) -> str: + """Format AgentOutput tool call into readable content""" + args = tool_call.get('args', {}) + action_info = '' + + # Get action name + if 'action' in args and args['action']: + first_action = args['action'][0] if isinstance(args['action'], list) and args['action'] else args['action'] + if isinstance(first_action, dict): + action_name = next(iter(first_action.keys())) if first_action else 'unknown' + action_info = f'{action_name}()' + + # Get goal + goal_info = '' + if 'current_state' in args and isinstance(args['current_state'], dict): + next_goal = args['current_state'].get('next_goal', '').strip() + if next_goal: + goal_info = f': {self._truncate_text(next_goal, 40)}' + + # Combine action and goal info + if action_info and goal_info: + return f'{action_info}{goal_info}' + elif action_info: + return action_info + elif goal_info: + return goal_info[2:] # Remove ': ' prefix for goal-only + else: + return 'AgentOutput' + + def _generate_history_log(self) -> str: + """Generate a formatted log string of message history for debugging / printing to terminal""" + total_input_tokens = 0 + message_lines = [] + + for i, m in enumerate(self.state.history.messages): + total_input_tokens += m.metadata.tokens + is_last_message = i == len(self.state.history.messages) - 1 + + # Get emoji based on message type + message_type = m.message.__class__.__name__ + emoji = self._get_message_emoji(message_type) + + # Extract content based on message structure + if is_last_message and message_type == 'HumanMessage' and isinstance(m.message.content, list): + # Special handling for last message with list content + text_content = 
self._extract_text_from_list_content(m.message.content) + text_content = self._clean_whitespace(text_content) + + # Look for current state section + if '[Current state starts here]' in text_content: + start_idx = text_content.find('[Current state starts here]') + content = self._truncate_text(text_content[start_idx:], 150) + else: + content = self._truncate_text(text_content, 150) + else: + # Standard content extraction + content = self._clean_whitespace(str(m.message.content)[:80]) + + # Shorten "Action result:" to "Result:" for display + if content.startswith('Action result:'): + content = 'Result:' + content[14:] + + # Handle AIMessages with tool calls + if hasattr(m.message, 'tool_calls') and m.message.tool_calls and not content: + tool_call = m.message.tool_calls[0] + tool_name = tool_call.get('name', 'unknown') + + if tool_name == 'AgentOutput': + content = self._format_agent_output_content(tool_call) + else: + content = f'[TOOL: {tool_name}]' + elif len(str(m.message.content)) > 80: + content += '...' 
+ + # Format the message line + left_part = f' {emoji}[{m.metadata.tokens}]' + + # For last message, allow multiple lines if needed + if is_last_message and '\n' not in content: + wrapped = textwrap.wrap(content, width=80, subsequent_indent=' ' * 20) + if len(wrapped) > 2: + wrapped = wrapped[:2] + wrapped[-1] = self._truncate_text(wrapped[-1], 77) + message_lines.append(f'{left_part.ljust(16)}: {wrapped[0]}') + message_lines.extend(wrapped[1:]) + else: + message_lines.append(f'{left_part.ljust(16)}: {content}') + + # Build final log message + return ( + f'๐Ÿ“œ LLM Message history ({len(self.state.history.messages)} messages, {total_input_tokens} tokens):\n' + + '\n'.join(message_lines) + ) + @time_execution_sync('--get_messages') def get_messages(self) -> list[BaseMessage]: """Get current message list, potentially trimmed to max tokens""" - msg = [m.message for m in self.state.history.messages] - # debug which messages are in history with token count # log - total_input_tokens = 0 - logger.debug(f'Messages in history: {len(self.state.history.messages)}:') - for m in self.state.history.messages: - total_input_tokens += m.metadata.tokens - logger.debug(f'{m.message.__class__.__name__} - Token count: {m.metadata.tokens}') - logger.debug(f'Total input tokens: {total_input_tokens}') + + # Log message history for debugging + logger.debug(self._generate_history_log()) return msg @@ -218,16 +337,27 @@ class MessageManager: if not self.settings.sensitive_data: return value - # Create a dictionary with all key-value pairs from sensitive_data where value is not None or empty - valid_sensitive_data = {k: v for k, v in self.settings.sensitive_data.items() if v} + # Collect all sensitive values, immediately converting old format to new format + sensitive_values: dict[str, str] = {} + + # Process all sensitive data entries + for key_or_domain, content in self.settings.sensitive_data.items(): + if isinstance(content, dict): + # Already in new format: {domain: {key: value}} + for 
key, val in content.items(): + if val: # Skip empty values + sensitive_values[key] = val + elif content: # Old format: {key: value} - convert to new format internally + # We treat this as if it was {'http*://*': {key_or_domain: content}} + sensitive_values[key_or_domain] = content # If there are no valid sensitive data entries, just return the original value - if not valid_sensitive_data: + if not sensitive_values: logger.warning('No valid entries found in sensitive_data dictionary') return value # Replace all valid sensitive data values with their placeholder tags - for key, val in valid_sensitive_data.items(): + for key, val in sensitive_values.items(): value = value.replace(val, f'{key}') return value diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 55517bdaf..1ac928202 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -13,6 +13,8 @@ from typing import Any, Generic, TypeVar from dotenv import load_dotenv +from browser_use.browser.session import DEFAULT_BROWSER_PROFILE + load_dotenv() from langchain_core.language_models.chat_models import BaseChatModel @@ -70,21 +72,19 @@ logger = logging.getLogger(__name__) SKIP_LLM_API_KEY_VERIFICATION = os.environ.get('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[0] in 'ty1' -def log_response(response: AgentOutput) -> None: +def log_response(response: AgentOutput, registry=None) -> None: """Utility function to log the model's response.""" if 'Success' in response.current_state.evaluation_previous_goal: emoji = '๐Ÿ‘' elif 'Failed' in response.current_state.evaluation_previous_goal: - emoji = 'โš ' + emoji = 'โš ๏ธ' else: - emoji = '๐Ÿคท' + emoji = 'โ“' logger.info(f'{emoji} Eval: {response.current_state.evaluation_previous_goal}') logger.info(f'๐Ÿง  Memory: {response.current_state.memory}') logger.info(f'๐ŸŽฏ Next goal: {response.current_state.next_goal}') - for i, action in enumerate(response.action): - logger.info(f'๐Ÿ› ๏ธ Action {i + 
1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}') Context = TypeVar('Context') @@ -105,7 +105,7 @@ class Agent(Generic[Context]): browser_session: BrowserSession | None = None, controller: Controller[Context] = Controller(), # Initial agent run parameters - sensitive_data: dict[str, str] | None = None, + sensitive_data: dict[str, str | dict[str, str]] | None = None, initial_actions: list[dict[str, dict[str, Any]]] | None = None, # Cloud Callbacks register_new_step_callback: ( @@ -227,15 +227,15 @@ class Agent(Generic[Context]): self.settings.use_vision_for_planner = False logger.info( - f'๐Ÿง  Starting a v{self.version} agent with main_model={self.model_name}' + f'๐Ÿง  Starting a browser-use agent {self.version} with base_model={self.model_name}' f'{" +tools" if self.tool_calling_method == "function_calling" else ""}' f'{" +rawtools" if self.tool_calling_method == "raw" else ""}' f'{" +vision" if self.settings.use_vision else ""}' - f'{" +memory" if self.enable_memory else ""}, ' - f'planner_model={self.planner_model_name}' + f'{" +memory" if self.enable_memory else ""}' + f' extraction_model={getattr(self.settings.page_extraction_llm, "model_name", None)}' + f'{f" planner_model={self.planner_model_name}" if self.planner_model_name else ""}' f'{" +reasoning" if self.settings.is_planner_reasoning else ""}' - f'{" +vision" if self.settings.use_vision_for_planner else ""}, ' - f'extraction_model={getattr(self.settings.page_extraction_llm, "model_name", None)} ' + f'{" +vision" if self.settings.use_vision_for_planner else ""} ' ) # Verify we can connect to the LLM @@ -291,29 +291,71 @@ class Agent(Generic[Context]): assert not (browser_profile and browser_context), 'Cannot provide both browser_profile and browser_context' assert not (browser and browser_context), 'Cannot provide both browser and browser_context' assert not (browser_session and browser_context), 'Cannot provide both browser_session and browser_context' - + browser_profile = 
browser_profile or DEFAULT_BROWSER_PROFILE self.browser_session = browser_session or BrowserSession( profile=browser_profile, browser=browser, browser_context=browser_context ) - if self.sensitive_data and not self.browser_profile.allowed_domains: - logger.error( - 'โš ๏ธโš ๏ธโš ๏ธ Agent(sensitive_data=โ€ขโ€ขโ€ขโ€ขโ€ขโ€ขโ€ขโ€ข) was provided but BrowserSession(allowed_domains=[...]) is not locked down! โš ๏ธโš ๏ธโš ๏ธ\n' - ' โ˜ ๏ธ If the agent visits a malicious website and encounters a prompt-injection attack, your sensitive_data may be exposed!\n\n' - ' https://docs.browser-use.com/customize/browser-settings#restrict-urls\n' - 'Waiting 10 seconds before continuing... Press [Ctrl+C] to abort.' - ) - if sys.stdin.isatty(): - try: - time.sleep(10) - except KeyboardInterrupt: - print( - '\n\n ๐Ÿ›‘ Exiting now... set BrowserSession(allowed_domains=["example.com", "example.org"]) to only domains you trust to see your sensitive_data.' - ) - sys.exit(0) - else: - pass # no point waiting if we're not in an interactive shell - logger.warning('โ€ผ๏ธ Continuing with insecure settings for now... but this will become a hard error in the future!') + if self.sensitive_data: + # Check if sensitive_data has domain-specific credentials + has_domain_specific_credentials = any(isinstance(v, dict) for v in self.sensitive_data.values()) + + # If no allowed_domains are configured, show a security warning + if not self.browser_profile.allowed_domains: + logger.error( + 'โš ๏ธโš ๏ธโš ๏ธ Agent(sensitive_data=โ€ขโ€ขโ€ขโ€ขโ€ขโ€ขโ€ขโ€ข) was provided but BrowserSession(allowed_domains=[...]) is not locked down! โš ๏ธโš ๏ธโš ๏ธ\n' + ' โ˜ ๏ธ If the agent visits a malicious website and encounters a prompt-injection attack, your sensitive_data may be exposed!\n\n' + ' https://docs.browser-use.com/customize/browser-settings#restrict-urls\n' + 'Waiting 10 seconds before continuing... Press [Ctrl+C] to abort.' 
+ ) + if sys.stdin.isatty(): + try: + time.sleep(10) + except KeyboardInterrupt: + print( + '\n\n ๐Ÿ›‘ Exiting now... set BrowserSession(allowed_domains=["example.com", "example.org"]) to only domains you trust to see your sensitive_data.' + ) + sys.exit(0) + else: + pass # no point waiting if we're not in an interactive shell + logger.warning('โ€ผ๏ธ Continuing with insecure settings for now... but this will become a hard error in the future!') + + # If we're using domain-specific credentials, validate domain patterns + elif has_domain_specific_credentials: + # For domain-specific format, ensure all domain patterns are included in allowed_domains + domain_patterns = [k for k, v in self.sensitive_data.items() if isinstance(v, dict)] + + # Validate each domain pattern against allowed_domains + for domain_pattern in domain_patterns: + is_allowed = False + for allowed_domain in self.browser_profile.allowed_domains: + # Special cases that don't require URL matching + if domain_pattern == allowed_domain or allowed_domain == '*': + is_allowed = True + break + + # Need to create example URLs to compare the patterns + # Extract the domain parts, ignoring scheme + pattern_domain = domain_pattern.split('://')[-1] if '://' in domain_pattern else domain_pattern + allowed_domain_part = allowed_domain.split('://')[-1] if '://' in allowed_domain else allowed_domain + + # Check if pattern is covered by an allowed domain + # Example: "google.com" is covered by "*.google.com" + if pattern_domain == allowed_domain_part or ( + allowed_domain_part.startswith('*.') + and ( + pattern_domain == allowed_domain_part[2:] + or pattern_domain.endswith('.' + allowed_domain_part[2:]) + ) + ): + is_allowed = True + break + + if not is_allowed: + logger.warning( + f'โš ๏ธ Domain pattern "{domain_pattern}" in sensitive_data is not covered by any pattern in allowed_domains={self.browser_profile.allowed_domains}\n' + f' This may be a security risk as credentials could be used on unintended domains.' 
+ ) # Callbacks self.register_new_step_callback = register_new_step_callback @@ -427,7 +469,7 @@ class Agent(Generic[Context]): # Azure OpenAI API requires 'tools' parameter for GPT-4 # The error 'content must be either a string or an array' occurs when # the API expects a tools array but gets something else - if 'gpt-4' in self.model_name.lower(): + if 'gpt-4-' in self.model_name.lower(): return 'tools' else: return 'function_calling' @@ -454,8 +496,7 @@ class Agent(Generic[Context]): @time_execution_async('--step (agent)') async def step(self, step_info: AgentStepInfo | None = None) -> None: """Execute one step of the task""" - logger.info(f'๐Ÿ“ Step {self.state.n_steps}') - state = None + browser_state_summary = None model_output = None result: list[ActionResult] = [] step_start_time = time.time() @@ -465,6 +506,8 @@ class Agent(Generic[Context]): browser_state_summary = await self.browser_session.get_state_summary(cache_clickable_elements_hashes=True) current_page = await self.browser_session.get_current_page() + self._log_step_context(current_page, browser_state_summary) + # generate procedural memory if needed if self.enable_memory and self.memory and self.state.n_steps % self.memory.config.memory_interval == 0: self.memory.create_procedural_memory(self.state.n_steps) @@ -615,7 +658,7 @@ class Agent(Generic[Context]): if not result: return - if state: + if browser_state_summary: metadata = StepMetadata( step_number=self.state.n_steps, step_start_time=step_start_time, @@ -624,6 +667,9 @@ class Agent(Generic[Context]): ) self._make_history_item(model_output, browser_state_summary, result, metadata) + # Log step completion summary + self._log_step_completion_summary(step_start_time, result) + @time_execution_async('--handle_step_error (agent)') async def _handle_step_error(self, error: Exception) -> list[ActionResult]: """Handle all types of errors that can occur during a step""" @@ -719,7 +765,7 @@ class Agent(Generic[Context]): input_messages = 
self._convert_input_messages(input_messages) if self.tool_calling_method == 'raw': - logger.debug(f'Using {self.tool_calling_method} for {self.chat_model_library}') + self._log_llm_call_info(input_messages, self.tool_calling_method) try: output = self.llm.invoke(input_messages) response = {'raw': output, 'parsed': None} @@ -747,7 +793,7 @@ class Agent(Generic[Context]): raise LLMException(401, 'LLM API call failed') from e else: - logger.debug(f'Using {self.tool_calling_method} for {self.chat_model_library}') + self._log_llm_call_info(input_messages, self.tool_calling_method) structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True, method=self.tool_calling_method) response: dict[str, Any] = await structured_llm.ainvoke(input_messages) # type: ignore @@ -792,8 +838,9 @@ class Agent(Generic[Context]): parsed.action = parsed.action[: self.settings.max_actions_per_step] if not (hasattr(self.state, 'paused') and (self.state.paused or self.state.stopped)): - log_response(parsed) + log_response(parsed, self.controller.registry.registry) + self._log_next_action_summary(parsed) return parsed def _log_agent_run(self) -> None: @@ -802,6 +849,97 @@ class Agent(Generic[Context]): logger.debug(f'Version: {self.version}, Source: {self.source}') + def _log_step_context(self, current_page, browser_state_summary) -> None: + """Log step context information""" + url_short = current_page.url[:50] + '...' 
if len(current_page.url) > 50 else current_page.url + interactive_count = len(browser_state_summary.selector_map) if browser_state_summary else 0 + logger.info( + f'๐Ÿ“ Step {self.state.n_steps}: Evaluating page with {interactive_count} interactive elements on: {url_short}' + ) + + def _log_next_action_summary(self, parsed: 'AgentOutput') -> None: + """Log a comprehensive summary of the next action(s)""" + if not (logger.isEnabledFor(logging.DEBUG) and parsed.action): + return + + action_count = len(parsed.action) + + # Collect action details + action_details = [] + for i, action in enumerate(parsed.action): + action_data = action.model_dump(exclude_unset=True) + action_name = next(iter(action_data.keys())) if action_data else 'unknown' + action_params = action_data.get(action_name, {}) if action_data else {} + + # Format key parameters concisely + param_summary = [] + if isinstance(action_params, dict): + for key, value in action_params.items(): + if key == 'index': + param_summary.append(f'#{value}') + elif key == 'text' and isinstance(value, str): + text_preview = value[:30] + '...' if len(value) > 30 else value + param_summary.append(f'text="{text_preview}"') + elif key == 'url': + param_summary.append(f'url="{value}"') + elif key == 'success': + param_summary.append(f'success={value}') + elif isinstance(value, (str, int, bool)) and len(str(value)) < 20: + param_summary.append(f'{key}={value}') + + param_str = f'({", ".join(param_summary)})' if param_summary else '' + action_details.append(f'{action_name}{param_str}') + + # Create summary based on single vs multi-action + if action_count == 1: + logger.info(f'โšก๏ธ Decided next action: {action_details[0]}') + else: + summary_lines = [f'โšก๏ธ Decided next {action_count} multi-actions:'] + for i, detail in enumerate(action_details): + summary_lines.append(f' {i + 1}. 
{detail}') + logger.info('\n'.join(summary_lines)) + + def _log_step_completion_summary(self, step_start_time: float, result: list[ActionResult]) -> None: + """Log step completion summary with action count, timing, and success/failure stats""" + if not result: + return + + step_duration = time.time() - step_start_time + action_count = len(result) + + # Count success and failures + success_count = sum(1 for r in result if not r.error) + failure_count = action_count - success_count + + # Format success/failure indicators + success_indicator = f'โœ… {success_count}' if success_count > 0 else '' + failure_indicator = f'โŒ {failure_count}' if failure_count > 0 else '' + status_parts = [part for part in [success_indicator, failure_indicator] if part] + status_str = ' | '.join(status_parts) if status_parts else 'โœ… 0' + + logger.info(f'๐Ÿ“ Step {self.state.n_steps}: Ran {action_count} actions in {step_duration:.2f}s: {status_str}') + + def _log_llm_call_info(self, input_messages: list[BaseMessage], method: str) -> None: + """Log comprehensive information about the LLM call being made""" + # Count messages and check for images + message_count = len(input_messages) + total_chars = sum(len(str(msg.content)) for msg in input_messages) + has_images = any( + hasattr(msg, 'content') + and isinstance(msg.content, list) + and any(isinstance(item, dict) and item.get('type') == 'image_url' for item in msg.content) + for msg in input_messages + ) + current_tokens = getattr(self._message_manager.state.history, 'current_tokens', 0) + + # Determine output type + output_type = 'raw text output' if method == 'raw' else 'structured output + tools' + image_status = '๐Ÿ“ท images' if has_images else 'no images' + + logger.info( + f'๐Ÿง  LLM call: {self.chat_model_library} ({method}) | {message_count} msgs, ~{current_tokens} tokens, {total_chars} chars | {image_status} | {output_type}' + ) + def _log_agent_event(self, max_steps: int, agent_run_error: str | None = None) -> None: """Sent the 
agent event for this run to telemetry""" @@ -923,7 +1061,7 @@ class Agent(Generic[Context]): # Check control flags before each step if self.state.stopped: - logger.info('Agent stopped') + logger.info('๐Ÿ›‘ Agent stopped') agent_run_error = 'Agent stopped programmatically' break @@ -989,7 +1127,6 @@ class Agent(Generic[Context]): if not self._force_exit_telemetry_logged: # MODIFIED: Check the flag try: self._log_agent_event(max_steps=max_steps, agent_run_error=agent_run_error) - logger.info('Agent run telemetry logged.') except Exception as log_e: # Catch potential errors during logging itself logger.error(f'Failed to log telemetry event: {log_e}', exc_info=True) else: @@ -1075,7 +1212,10 @@ class Agent(Generic[Context]): results.append(result) - logger.debug(f'Executed action {i + 1} / {len(actions)}') + # Get action name from the action model + action_data = action.model_dump(exclude_unset=True) + action_name = next(iter(action_data.keys())) if action_data else 'unknown' + logger.info(f'โ˜‘๏ธ Executed action {i + 1}/{len(actions)}: {action_name}') if results[-1].is_done or results[-1].error or i == len(actions) - 1: break @@ -1140,14 +1280,13 @@ class Agent(Generic[Context]): async def log_completion(self) -> None: """Log the completion of the task""" - logger.info('โœ… Task completed') if self.state.history.is_successful(): - logger.info('โœ… Successfully') + logger.info('โœ… Task completed successfully') else: - logger.info('โŒ Unfinished') + logger.info('โŒ Task completed without success') total_tokens = self.state.history.total_input_tokens() - logger.info(f'๐Ÿ“ Total input tokens used (approximate): {total_tokens}') + logger.debug(f'๐Ÿ“ Total input tokens used (approximate): {total_tokens}') if self.register_done_callback: if inspect.iscoroutinefunction(self.register_done_callback): diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py index cf06c5348..c6791982b 100644 --- a/browser_use/browser/profile.py +++ 
b/browser_use/browser/profile.py @@ -337,7 +337,7 @@ class BrowserContextArgs(BaseModel): proxy: ProxySettings | None = None permissions: list[str] = Field( default_factory=lambda: ['clipboard-read', 'clipboard-write', 'notifications'], - description='Browser permissions to grant.', + description='Browser permissions to grant (see playwright docs for valid permissions).', # clipboard is for google sheets and pyperclip automations # notifications are to avoid browser fingerprinting ) @@ -552,7 +552,10 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro # custom options we provide that aren't native playwright kwargs disable_security: bool = Field(default=False, description='Disable browser security features.') deterministic_rendering: bool = Field(default=False, description='Enable deterministic rendering flags.') - allowed_domains: list[str] | None = Field(default=None, description='List of allowed domains for navigation.') + allowed_domains: list[str] | None = Field( + default=None, + description='List of allowed domains for navigation e.g. 
["*.google.com", "https://example.com", "chrome-extension://*"]', + ) keep_alive: bool | None = Field(default=None, description='Keep browser alive after agent run.') window_size: ViewportSize | None = Field( default=None, @@ -570,6 +573,8 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro ) # --- Page load/wait timings --- + default_navigation_timeout: float | None = Field(default=None, description='Default page navigation timeout.') + default_timeout: float | None = Field(default=None, description='Default playwright call timeout.') minimum_wait_page_load_time: float = Field(default=0.25, description='Minimum time to wait before capturing page state.') wait_for_network_idle_page_load_time: float = Field(default=0.5, description='Time to wait for network idle.') maximum_wait_page_load_time: float = Field(default=5.0, description='Maximum time to wait for page load.') diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index c2920e7fc..126ea26a0 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -11,6 +11,7 @@ from dataclasses import dataclass from functools import wraps from pathlib import Path from typing import Any, Self +from urllib.parse import urlparse import psutil from patchright.async_api import Playwright as PatchrightPlaywright @@ -29,7 +30,7 @@ from browser_use.browser.views import ( from browser_use.dom.clickable_element_processor.service import ClickableElementProcessor from browser_use.dom.service import DomService from browser_use.dom.views import DOMElementNode, SelectorMap -from browser_use.utils import time_execution_async, time_execution_sync +from browser_use.utils import match_url_with_domain_pattern, time_execution_async, time_execution_sync # Check if running in Docker IN_DOCKER = os.environ.get('IN_DOCKER', 'false').lower()[0] in 'ty1' @@ -40,7 +41,19 @@ logger = logging.getLogger('browser_use.browser.session') _GLOB_WARNING_SHOWN = False # used 
inside _is_url_allowed to avoid spamming the logs with the same warning multiple times -def truncate_url(s: str, max_len: int | None = None) -> str: +def _show_glob_warning(domain: str, glob: str): + global _GLOB_WARNING_SHOWN + if not _GLOB_WARNING_SHOWN: + logger.warning( + # glob patterns are very easy to mess up and match too many domains by accident + # e.g. if you only need to access gmail, don't use *.google.com because an attacker could convince the agent to visit a malicious doc + # on docs.google.com/s/some/evil/doc to set up a prompt injection attack + f"โš ๏ธ Allowing agent to visit {domain} based on allowed_domains=['{glob}', ...]. Set allowed_domains=['{domain}', ...] explicitly to avoid matching too many domains!" + ) + _GLOB_WARNING_SHOWN = True + + +def truncate_url(s: str, max_len: int | None = 22) -> str: """Truncate/pretty-print a URL with a maximum length, removing the protocol and www. prefix""" s = s.replace('https://', '').replace('http://', '').replace('www.', '') if max_len is not None and len(s) > max_len: @@ -48,6 +61,11 @@ def truncate_url(s: str, max_len: int | None = None) -> str: return s +def pretty_path(path: Path) -> str: + """Pretty-print a path, removing the drive letter on Windows""" + return str(path).replace(str(Path.home()), '~').replace(str(Path.cwd().resolve()), '.') + + def require_initialization(func): """decorator for BrowserSession methods to require the BrowserSession be already active""" @@ -106,7 +124,9 @@ class BrowserSession(BaseModel): browser_profile: InstanceOf[BrowserProfile] = Field( default=DEFAULT_BROWSER_PROFILE, description='BrowserProfile() instance containing config for the BrowserSession', - validation_alias=AliasChoices('profile', 'config', 'new_context_config'), # old names for this field, remove eventually + validation_alias=AliasChoices( + 'profile', 'config', 'new_context_config' + ), # abbreviations = 'profile', old deprecated names = 'config', 'new_context_config' ) # runtime props/state: these 
can be passed in as props at init, or get auto-setup by BrowserSession.start() @@ -118,8 +138,10 @@ class BrowserSession(BaseModel): default=None, description='CDP URL of the browser to connect to, e.g. http://localhost:9222 or ws://127.0.0.1:9222/devtools/browser/387adf4c-243f-4051-a181-46798f4a46f4', ) - chrome_pid: int | None = Field( - default=None, description='pid of the running chrome process to connect to on localhost (optional)' + browser_pid: int | None = Field( + default=None, + description='pid of a running chromium-based browser process to connect to on localhost', + validation_alias=AliasChoices('chrome_pid'), # old deprecated name = chrome_pid ) playwright: Playwright | PatchrightPlaywright | Playwright | None = Field( default=None, @@ -138,20 +160,20 @@ class BrowserSession(BaseModel): validation_alias=AliasChoices('playwright_browser_context', 'context'), exclude=True, ) + + # runtime state: state that changes during the lifecycle of a BrowserSession(), updated by the methods below initialized: bool = Field( default=False, - description='Skip BrowserSession launch/connection setup entirely if True (not recommended)', - validation_alias=AliasChoices('initialized', 'is_initialized'), + description='Mark BrowserSession launch/connection as already ready and skip setup (not recommended)', + validation_alias=AliasChoices('is_initialized'), ) - - # runtime state: internally tracked attrs updated by BrowserSession class methods agent_current_page: InstanceOf[Page] | None = Field( # mutated by self.create_new_tab(url) default=None, description='Foreground Page that the agent is focused on', - validation_alias=AliasChoices('current_page', 'page'), + validation_alias=AliasChoices('current_page', 'page'), # alias page= allows passing in a playwright Page object easily exclude=True, ) - human_current_page: InstanceOf[Page] | None = Field( # mutated by self.setup_foreground_tab_detection() + human_current_page: InstanceOf[Page] | None = Field( # mutated by 
self._setup_current_page_change_listeners() default=None, description='Foreground Page that the human is focused on', exclude=True, @@ -165,14 +187,15 @@ class BrowserSession(BaseModel): """Apply any extra **kwargs passed to BrowserSession(...) as config overrides on top of browser_profile""" session_own_fields = type(self).model_fields.keys() - # get all the extra BrowserProfile kwarg overrides passed to BrowserSession(...) that are not Fields on self - overrides = self.model_dump(exclude=session_own_fields) + # get all the extra kwarg overrides passed to BrowserSession(...) that are actually + # config Fields tracked by BrowserProfile, instead of BrowserSession's own args + profile_overrides = self.model_dump(exclude=set(session_own_fields)) # FOR REPL DEBUGGING ONLY, NEVER ALLOW CIRCULAR REFERENCES IN REAL CODE: # self.browser_profile._in_use_by_session = self # replace browser_profile with patched version - self.browser_profile = self.browser_profile.model_copy(update=overrides) + self.browser_profile = self.browser_profile.model_copy(update=profile_overrides) # FOR REPL DEBUGGING ONLY, NEVER ALLOW CIRCULAR REFERENCES IN REAL CODE: # self.browser_profile._in_use_by_session = self @@ -188,7 +211,19 @@ class BrowserSession(BaseModel): # return getattr(self.browser_profile, key) async def start(self) -> Self: - # finish initializing/validate the browser_profile: + """ + Starts the browser session by either connecting to an existing browser or launching a new one. + Precedence order for launching/connecting: + 1. page=Page playwright object, will use its page.context as browser_context + 2. browser_context=PlaywrightBrowserContext object, will use its browser + 3. browser=PlaywrightBrowser object, will use its first available context + 4. browser_pid=int, will connect to a local chromium-based browser via pid + 5. wss_url=str, will connect to a remote playwright browser server via WSS + 6. cdp_url=str, will connect to a remote chromium-based browser via CDP + 7. 
playwright=Playwright object, will use its chromium instance to launch a new browser + """ + + # apply last-minute runtime-computed options to the browser_profile, validate profile, set up folders on disk assert isinstance(self.browser_profile, BrowserProfile) self.browser_profile.prepare_user_data_dir() # create/unlock the /SingletonLock self.browser_profile.detect_display_configuration() # adjusts config values, must come before launch/connect @@ -196,47 +231,54 @@ # launch/connect to the browser: # setup playwright library client, Browser, and BrowserContext objects await self.setup_playwright() - await self.setup_browser_connection() # connects to existing browser if available - await self.setup_browser_context() # creates a new context in existing browser or launches a new persistent context - assert self.browser_context + await self.setup_browser_via_passed_objects() + await self.setup_browser_via_browser_pid() + await self.setup_browser_via_wss_url() + await self.setup_browser_via_cdp_url() + await self.setup_new_browser_context() # creates a new context in existing browser or launches a new persistent context + assert self.browser_context, f'Failed to connect to or create a new BrowserContext for browser={self.browser}' # resize the existing pages and set up foreground tab detection - await self.setup_viewport_sizing() - await self.setup_foreground_tab_detection() + await self._setup_viewports() + await self._setup_current_page_change_listeners() self.initialized = True return self async def stop(self) -> None: - if not self.browser_profile.keep_alive: - logger.info('๐Ÿ›‘ Shutting down browser...') - if self.browser_context: - try: - await self.browser_context.close() - except Exception as e: - logger.debug(f'โŒ Error closing playwright BrowserContext {self.browser_context}: {type(e).__name__}: {e}') + """Shuts down the BrowserSession, killing the browser process if keep_alive=False""" - if self.browser: - try: - await 
self.browser.close() - except Exception as e: - logger.debug(f'โŒ Error closing playwright Browser {self.browser}: {type(e).__name__}: {e}') + if self.browser_profile.keep_alive: + return # nothing to do if keep_alive=True, leave the browser running - # kill the chrome subprocess if we were the ones that started it - if self.chrome_pid: - try: - psutil.Process(pid=self.chrome_pid).terminate() - except Exception as e: - if 'NoSuchProcess' not in type(e).__name__: - logger.debug(f'โŒ Error terminating chrome subprocess pid={self.chrome_pid}: {type(e).__name__}: {e}') + logger.info('๐Ÿ›‘ Shutting down browser...') + if self.browser_context: + try: + await self.browser_context.close() + except Exception as e: + logger.debug(f'โŒ Error closing playwright BrowserContext {self.browser_context}: {type(e).__name__}: {e}') + + if self.browser: + try: + await self.browser.close() + except Exception as e: + logger.debug(f'โŒ Error closing playwright Browser {self.browser}: {type(e).__name__}: {e}') + + # kill the chrome subprocess if we were the ones that started it + if self.browser_pid: + try: + psutil.Process(pid=self.browser_pid).terminate() + except Exception as e: + if 'NoSuchProcess' not in type(e).__name__: + logger.debug(f'โŒ Error terminating chrome subprocess pid={self.browser_pid}: {type(e).__name__}: {e}') async def close(self) -> None: - """Shortcut for self.stop()""" + """Deprecated: Provides backwards-compatibility with old class method Browser().close()""" await self.stop() async def new_context(self, **kwargs): - """Create a new browser context with the given kwargs""" + """Deprecated: Provides backwards-compatibility with old class method Browser().new_context()""" return self async def __aenter__(self) -> BrowserSession: @@ -247,65 +289,87 @@ class BrowserSession(BaseModel): await self.stop() async def setup_playwright(self) -> None: - """Override to customize the set up of the playwright or patchright library object""" - self.playwright = 
self.playwright or await async_playwright().start() + """ + Set up playwright library client object: usually the result of (await async_playwright().start()) + Override to customize the set up of the playwright or patchright library object + """ + self.playwright = self.playwright or (await async_playwright().start()) # if isinstance(self.playwright, PatchrightPlaywright): # # patchright handles all its own default args, dont mess with them # self.browser_profile.ignore_default_args = True - return self.playwright + # return self.playwright - async def setup_browser_connection(self) -> None: + async def setup_browser_via_passed_objects(self) -> None: """Override to customize the set up of the connection to an existing browser""" - # if process is provided, calcuclate its CDP URL by looking for --remote-debugging-port=... in the launch args - if self.chrome_pid: - chrome_process = psutil.Process(pid=self.chrome_pid) - assert chrome_process.is_running(), 'Chrome process is not running' - args = chrome_process.cmdline() - debug_port = next((arg for arg in args if arg.startswith('--remote-debugging-port=')), '').split('=')[-1].strip() - assert debug_port, ( - f'Could not connect because could not find --remote-debugging-port=... in chrome launch args: pid={self.chrome_pid} {args}' - ) - # we could automatically relaunch the browser process with that arg added here, but they may have tabs open they dont want to lose - self.cdp_url = self.cdp_url or f'http://localhost:{debug_port}/' - logger.info(f'๐ŸŒŽ Connecting to existing chromium process: pid={self.chrome_pid} on {self.cdp_url}') + # 1. 
check for a passed Page object, if present, it always takes priority, set browser_context = page.context + self.browser_context = (self.agent_current_page and self.agent_current_page.context) or self.browser_context or None - if self.wss_url: - logger.info(f'๐ŸŒŽ Connecting to remote chromium playwright node.js server over WSS: {self.wss_url}') - self.browser = self.browser or await self.playwright.chromium.connect( - self.wss_url, - **self.browser_profile.kwargs_for_connect().model_dump(), - ) - # dont default to closing the browser when the BrowserSession is over if we connect by WSS - if self.browser_profile.keep_alive is None: - self.browser_profile.keep_alive = True - elif self.cdp_url: - logger.info(f'๐ŸŒŽ Connecting to remote chromium browser over CDP: {self.cdp_url}') - self.browser = self.browser or await self.playwright.chromium.connect_over_cdp( - self.cdp_url, - **self.browser_profile.kwargs_for_connect().model_dump(), - ) - # dont default to closing the browser when the BrowserSession is over if we connect by CDP - if self.browser_profile.keep_alive is None: - self.browser_profile.keep_alive = True + # 2. 
if we have a context now, it always takes precedence, set browser = context.browser, otherwise use the passed browser + self.browser = (self.browser_context and self.browser_context.browser) or self.browser or None - # self.browser may still be None at this point if we have no config implying we should connect to an existing browser - # self.setup_browser_context() will be called next and if it finds self.browser is None, it will - # launch a new browser+context all in one go using launch_persistent_context() + if self.browser or self.browser_context: + logger.info(f'๐ŸŒŽ Connected to existing user-provided browser_context: {self.browser_context}') + self._set_browser_keep_alive(True) # we connected to an existing browser, dont kill it at the end - return self.browser + async def setup_browser_via_browser_pid(self) -> None: + """if browser_pid is provided, calculate its CDP URL by looking for --remote-debugging-port=... in its CLI args, then connect to it""" - async def setup_browser_context(self) -> None: - # if we have a browser_context but no browser, use the browser from the context - if self.browser_context: - logger.info(f'๐ŸŒŽ Using existing user-provided browser_context and browser: {self.browser_context}') - self.browser = self.browser or self.browser_context.browser - # dont default to closing the browser when the BrowserSession is over if we are passed an external browser - if self.browser_profile.keep_alive is None: - self.browser_profile.keep_alive = True + if self.browser or self.browser_context: + return # already connected to a browser + if not self.browser_pid: + return # no browser_pid provided, nothing to do + chrome_process = psutil.Process(pid=self.browser_pid) + assert chrome_process.is_running(), 'Chrome process is not running' + args = chrome_process.cmdline() + debug_port = next((arg for arg in args if arg.startswith('--remote-debugging-port=')), '').split('=')[-1].strip() + assert debug_port, ( + f'Could not find 
--remote-debugging-port=... to connect to in browser launch args: browser_pid={self.browser_pid} {args}' + ) + # we could automatically relaunch the browser process with that arg added here, but they may have tabs open they dont want to lose + self.cdp_url = self.cdp_url or f'http://localhost:{debug_port}/' + logger.info(f'๐ŸŒŽ Connecting to existing local browser process: browser_pid={self.browser_pid} on {self.cdp_url}') + self.browser = self.browser or await self.playwright.chromium.connect_over_cdp( + self.cdp_url, + **self.browser_profile.kwargs_for_connect().model_dump(), + ) + self._set_browser_keep_alive(True) # we connected to an existing browser, dont kill it at the end + + async def setup_browser_via_wss_url(self) -> None: + """check for a passed wss_url, connect to a remote playwright browser server via WSS""" + + if self.browser or self.browser_context: + return # already connected to a browser + if not self.wss_url: + return # no wss_url provided, nothing to do + + logger.info(f'๐ŸŒŽ Connecting to existing remote chromium playwright node.js server over WSS: {self.wss_url}') + self.browser = self.browser or await self.playwright.chromium.connect( + self.wss_url, + **self.browser_profile.kwargs_for_connect().model_dump(), + ) + self._set_browser_keep_alive(True) # we connected to an existing browser, dont kill it at the end + + async def setup_browser_via_cdp_url(self) -> None: + """check for a passed cdp_url, connect to a remote chromium-based browser via CDP""" + + if self.browser or self.browser_context: + return # already connected to a browser + if not self.cdp_url: + return # no cdp_url provided, nothing to do + + logger.info(f'๐ŸŒŽ Connecting to existing remote chromium-based browser over CDP: {self.cdp_url}') + self.browser = self.browser or await self.playwright.chromium.connect_over_cdp( + self.cdp_url, + **self.browser_profile.kwargs_for_connect().model_dump(), + ) + self._set_browser_keep_alive(True) # we connected to an existing browser, 
dont kill it at the end + + async def setup_new_browser_context(self) -> None: + """Launch a new browser and browser_context""" current_process = psutil.Process(os.getpid()) child_pids_before_launch = {child.pid for child in current_process.children(recursive=True)} @@ -313,7 +377,7 @@ class BrowserSession(BaseModel): if self.browser and not self.browser_context: if self.browser.contexts: self.browser_context = self.browser.contexts[0] - logger.info(f'๐ŸŒŽ Using first browser_context available in user-provided browser: {self.browser_context}') + logger.info(f'๐ŸŒŽ Using first browser_context available in existing browser: {self.browser_context}') else: self.browser_context = await self.browser.new_context( **self.browser_profile.kwargs_for_new_context().model_dump() @@ -328,7 +392,9 @@ class BrowserSession(BaseModel): # if we still have no browser_context by now, launch a new local one using launch_persistent_context() if not self.browser_context: logger.info( - f'๐ŸŒŽ Launching local {str(type(self.playwright).__module__).split(".")[0]} {self.browser_profile.channel.name.lower()} browser with user_data_dir={self.browser_profile.user_data_dir or "None (incognito)"}' + f'๐ŸŒŽ Launching local browser ' + f'driver={str(type(self.playwright).__module__).split(".")[0]} channel={self.browser_profile.channel.name.lower()} ' + f'user_data_dir={pretty_path(self.browser_profile.user_data_dir) if self.browser_profile.user_data_dir else "None (incognito)"}' ) if not self.browser_profile.user_data_dir: # if no user_data_dir is provided, launch an incognito context with no persistent user_data_dir @@ -339,70 +405,79 @@ class BrowserSession(BaseModel): **self.browser_profile.kwargs_for_new_context().model_dump() ) else: + # user data dir was provided, prepare it for use self.browser_profile.prepare_user_data_dir() # search for potentially conflicting local processes running on the same user_data_dir for proc in psutil.process_iter(['pid', 'cmdline']): if 
f'--user-data-dir={self.browser_profile.user_data_dir}' in (proc.info['cmdline'] or []): - # suffix_num = str(self.browser_profile.user_data_dir).rsplit('.', 1)[-1] or '1' - # suffix_num = int(suffix_num) if suffix_num.isdigit() else 1 - - # dir_name = self.browser_profile.user_data_dir.name - # incremented_name = dir_name.replace(f'.{suffix_num}', f'.{suffix_num + 1}') - # fork_path = self.browser_profile.user_data_dir.parent / incremented_name - - # # keep incrementing the suffix_num until we find a path that doesn't exist - # while fork_path.exists(): - # suffix_num += 1 - # fork_path = self.browser_profile.user_data_dir.parent / ( - # dir_name.rsplit('.', 1)[0] + f'.{suffix_num}' - # ) - logger.warning( - f'๐Ÿšจ Found potentially conflicting Chrome process pid={proc.info["pid"]} already running with the same user_data_dir={self.browser_profile.user_data_dir}' + f'๐Ÿšจ Found potentially conflicting browser process browser_pid={proc.info["pid"]} ' + f'already running with the same user_data_dir={pretty_path(self.browser_profile.user_data_dir)}' ) - # use shutil to recursively copy the user_data_dir to a new location - # shutil.copytree( - # str(self.browser_profile.user_data_dir), - # str(fork_path), - # symlinks=True, - # ignore_dangling_symlinks=True, - # dirs_exist_ok=False, - # ) - # self.browser_profile.user_data_dir = fork_path - # self.browser_profile.prepare_user_data_dir() + # self._fork_locked_user_data_dir() break # if a user_data_dir is provided, launch a persistent context with that user_data_dir self.browser_context = await self.playwright.chromium.launch_persistent_context( **self.browser_profile.kwargs_for_launch_persistent_context().model_dump() ) - self.browser = self.browser_context.browser or self.browser - # ^ this can unfortunately be None ^ playwright does not give us a browser object when we use launch_persistent_context() + + self.browser = (self.browser_context and self.browser_context.browser) or self.browser + # ^ this can 
unfortunately still be None at the end ^ + # playwright does not give us a browser object at all when we use launch_persistent_context()! # Detect any new child chrome processes that we might have launched above child_pids_after_launch = {child.pid for child in current_process.children(recursive=True)} new_child_pids = child_pids_after_launch - child_pids_before_launch new_child_procs = [psutil.Process(pid) for pid in new_child_pids] new_chrome_procs = [proc for proc in new_child_procs if 'Helper' not in proc.name() and proc.status() == 'running'] - if new_chrome_procs and not self.chrome_pid: - self.chrome_pid = new_chrome_procs[0].pid - logger.debug(f' โ†ณ Spawned chrome subprocess: pid={self.chrome_pid} {" ".join(new_chrome_procs[0].cmdline())}') - # default to closing the browser ourselves when the BrowserSession is over if we launched it ourselves - if self.browser_profile.keep_alive is None: - self.browser_profile.keep_alive = False + if new_chrome_procs and not self.browser_pid: + self.browser_pid = new_chrome_procs[0].pid + logger.debug( + f' โ†ณ Spawned browser subprocess: browser_pid={self.browser_pid} {" ".join(new_chrome_procs[0].cmdline())}' + ) + self._set_browser_keep_alive(False) # close the browser at the end because we launched it if self.browser: - connection_method = 'CDP' if self.cdp_url else 'WSS' if self.wss_url else 'Local' + connection_method = 'WSS' if self.wss_url else 'CDP' if (self.cdp_url and not self.browser_pid) else 'Local' assert self.browser.is_connected(), ( f'Browser is not connected, did the browser process crash or get killed? 
(connection method: {connection_method})' ) - logger.debug(f'๐ŸŒŽ {connection_method} Browser connected: v{self.browser.version}') - assert self.browser_context, f'BrowserContext {self.browser_context} is not set up' + logger.debug( + f'๐ŸŒŽ {connection_method} browser connected: v{self.browser.version} {self.cdp_url or self.wss_url or self.browser_profile.executable_path or "(playwright)"}' + ) - return self.browser_context + assert self.browser_context, ( + f'Failed to create a playwright BrowserContext {self.browser_context} for browser={self.browser}' + ) - async def setup_foreground_tab_detection(self) -> None: + # async def _fork_locked_user_data_dir(self) -> None: + # """Fork an in-use user_data_dir by cloning it to a new location to allow a second browser to use it""" + # # TODO: implement copy-on-write using overlayfs or zfs or something + # suffix_num = str(self.browser_profile.user_data_dir).rsplit('.', 1)[-1] or '1' + # suffix_num = int(suffix_num) if suffix_num.isdigit() else 1 + # dir_name = self.browser_profile.user_data_dir.name + # incremented_name = dir_name.replace(f'.{suffix_num}', f'.{suffix_num + 1}') + # fork_path = self.browser_profile.user_data_dir.parent / incremented_name + + # # keep incrementing the suffix_num until we find a path that doesn't exist + # while fork_path.exists(): + # suffix_num += 1 + # fork_path = self.browser_profile.user_data_dir.parent / (dir_name.rsplit('.', 1)[0] + f'.{suffix_num}') + + # # use shutil to recursively copy the user_data_dir to a new location + # shutil.copytree( + # str(self.browser_profile.user_data_dir), + # str(fork_path), + # symlinks=True, + # ignore_dangling_symlinks=True, + # dirs_exist_ok=False, + # ) + # self.browser_profile.user_data_dir = fork_path + # self.browser_profile.prepare_user_data_dir() + + async def _setup_current_page_change_listeners(self) -> None: # Uses a combination of: # - visibilitychange events # - window focus/blur events @@ -419,38 +494,45 @@ class 
BrowserSession(BaseModel): # - https://github.com/microsoft/playwright/issues/13989 # set up / detect foreground page + assert self.browser_context is not None, 'BrowserContext object is not set' pages = self.browser_context.pages foreground_page = None if pages: foreground_page = pages[0] logger.debug( - f'๐Ÿ“œ Found {len(pages)} existing pages in browser, agent will start focused on Tab [{pages.index(foreground_page)}]: {foreground_page.url}' + f'๐Ÿ“œ Found {len(pages)} existing tabs in browser, agent will start focused on Tab [{pages.index(foreground_page)}]: {foreground_page.url}' ) else: foreground_page = await self.browser_context.new_page() pages = [foreground_page] - logger.debug('๐Ÿ“„ Opened new page in empty fresh browser context...') + logger.debug('โž• Opened new tab in empty browser context...') self.agent_current_page = self.agent_current_page or foreground_page self.human_current_page = self.human_current_page or foreground_page - def _BrowserUseonTabVisibilityChange(source): + def _BrowserUseonTabVisibilityChange(source: dict[str, str]): + """hook callback fired when init script injected into a page detects a focus event""" new_page = source['page'] # Update human foreground tab state old_foreground = self.human_current_page + assert self.browser_context is not None, 'BrowserContext object is not set' + assert old_foreground is not None, 'Old foreground page is not set' old_tab_idx = self.browser_context.pages.index(old_foreground) self.human_current_page = new_page new_tab_idx = self.browser_context.pages.index(new_page) # Log before and after for debugging - if old_foreground.url != new_page.url: + old_url = old_foreground and old_foreground.url or 'about:blank' + new_url = new_page and new_page.url or 'about:blank' + agent_url = self.agent_current_page and self.agent_current_page.url or 'about:blank' + agent_tab_idx = self.browser_context.pages.index(self.agent_current_page) + if old_url != new_url: logger.info( - f'๐Ÿ‘๏ธ Foregound tab changed 
by human from [{old_tab_idx}]{truncate_url(old_foreground.url, 22) if old_foreground else "about:blank"} ' - f'โžก๏ธ [{new_tab_idx}]{truncate_url(new_page.url, 22)} ' - f'(agent will stay on [{self.browser_context.pages.index(self.agent_current_page)}]{truncate_url(self.agent_current_page.url, 22)})' + f'๐Ÿ‘๏ธ Foregound tab changed by human from [{old_tab_idx}]{truncate_url(old_url)} ' + f'โžก๏ธ [{new_tab_idx}]{truncate_url(new_url)} ' + f'(agent will stay on [{agent_tab_idx}]{truncate_url(agent_url)})' ) - return new_page.url await self.browser_context.expose_binding('_BrowserUseonTabVisibilityChange', _BrowserUseonTabVisibilityChange) update_tab_focus_script = """ @@ -489,18 +571,14 @@ class BrowserSession(BaseModel): await page.evaluate(update_tab_focus_script) # logger.debug(f'๐Ÿ‘๏ธ Added visibility listener to existing tab: {page.url}') - async def setup_viewport_sizing(self) -> None: + async def _setup_viewports(self) -> None: """Resize any existing page viewports to match the configured size""" - if not self.browser_context.pages: - return - - # First, set the viewport size on any existing pages + # log the viewport settings to terminal viewport = self.browser_profile.viewport logger.debug( - '๐Ÿ“ Setting up viewport options: ' + '๐Ÿ“ Setting up viewport: ' + f'headless={self.browser_profile.headless} ' - + (f'viewport={viewport["width"]}x{viewport["height"]}px ' if viewport else '(no viewport) ') + ( f'window={self.browser_profile.window_size["width"]}x{self.browser_profile.window_size["height"]}px ' if self.browser_profile.window_size @@ -511,17 +589,64 @@ class BrowserSession(BaseModel): if self.browser_profile.screen else '' ) - + f'is_mobile={self.browser_profile.is_mobile} ' + + (f'viewport={viewport["width"]}x{viewport["height"]}px ' if viewport else '(no viewport) ') + f'device_scale_factor={self.browser_profile.device_scale_factor or 1.0} ' + + f'is_mobile={self.browser_profile.is_mobile} ' + 
(f'color_scheme={self.browser_profile.color_scheme.value} ' if self.browser_profile.color_scheme else '') + (f'locale={self.browser_profile.locale} ' if self.browser_profile.locale else '') + (f'timezone_id={self.browser_profile.timezone_id} ' if self.browser_profile.timezone_id else '') + (f'geolocation={self.browser_profile.geolocation} ' if self.browser_profile.geolocation else '') ) - if viewport: - for page in self.browser_context.pages: + + # if we have any viewport settings in the profile, make sure to apply them to the entire browser_context as defaults + if self.browser_profile.permissions: + try: + await self.browser_context.grant_permissions(self.browser_profile.permissions) + except Exception as e: + logger.warning( + f'โš ๏ธ Failed to grant browser permissions {self.browser_profile.permissions}: {type(e).__name__}: {e}' + ) + try: + if self.browser_profile.default_timeout: + await self.browser_context.set_default_timeout(self.browser_profile.default_timeout) + if self.browser_profile.default_navigation_timeout: + await self.browser_context.set_default_navigation_timeout(self.browser_profile.default_navigation_timeout) + except Exception as e: + logger.warning( + f'โš ๏ธ Failed to set playwright timeout settings ' + f'calls={self.browser_profile.default_timeout} ' + f'nav={self.browser_profile.default_navigation_timeout}: {type(e).__name__}: {e}' + ) + try: + if self.browser_profile.extra_http_headers: + await self.browser_context.set_extra_http_headers(self.browser_profile.extra_http_headers) + except Exception as e: + logger.warning(f'โš ๏ธ Failed to setup playwright extra_http_headers: {type(e).__name__}: {e}') + + try: + if self.browser_profile.geolocation: + await self.browser_context.set_geolocation(self.browser_profile.geolocation) + except Exception as e: + logger.warning(f'โš ๏ธ Failed to update browser geolocation {self.browser_profile.geolocation}: {type(e).__name__}: {e}') + + # if self.storage_state: + # TODO: implement applying 
self.stroage_state to an existing browser_context + # await self.browser_context.set_storage_state(self.storage_state) + + for page in self.browser_context.pages: + # apply viewport size settings to any existing pages + if viewport: await page.set_viewport_size(viewport) + # show browser-use dvd screensaver-style bouncing loading animation on any about:blank pages + if page.url == 'about:blank': + await self._show_dvd_screensaver_loading_animation(page) + + def _set_browser_keep_alive(self, keep_alive: bool | None) -> None: + """set the keep_alive flag on the browser_profile, defaulting to True if keep_alive is None""" + if self.browser_profile.keep_alive is None: + self.browser_profile.keep_alive = keep_alive + # --- Tab management --- async def get_current_page(self) -> Page: """Get the current page + ensure it's not None / closed""" @@ -541,8 +666,8 @@ class BrowserSession(BaseModel): self.agent_current_page = self.agent_current_page or self.human_current_page or None self.human_current_page = self.human_current_page or self.agent_current_page or None + # if both are still None, fallback to using the first open tab we can find if self.agent_current_page is None: - # if both are still None, fallback to using the first open tab we can find if self.browser_context.pages: first_available_tab = self.browser_context.pages[0] self.agent_current_page = first_available_tab @@ -553,8 +678,8 @@ class BrowserSession(BaseModel): self.agent_current_page = new_tab self.human_current_page = new_tab - assert self.agent_current_page is not None - assert self.human_current_page is not None + assert self.agent_current_page is not None, 'Failed to find or create a new page for the agent' + assert self.human_current_page is not None, 'Failed to find or create a new page for the human' return self.agent_current_page @@ -621,61 +746,6 @@ class BrowserSession(BaseModel): selector_map = await self.get_selector_map() return selector_map.get(index) - 
@time_execution_async('--input_text_element_node') - async def _input_text_element_node(self, element_node: DOMElementNode, text: str): - """ - Input text into an element with proper error handling and state management. - Handles different types of input fields and ensures proper element state before input. - """ - try: - # Highlight before typing - # if element_node.highlight_index is not None: - # await self._update_state(focus_element=element_node.highlight_index) - - element_handle = await self.get_locate_element(element_node) - - if element_handle is None: - raise BrowserError(f'Element: {repr(element_node)} not found') - - # Ensure element is ready for input - try: - await element_handle.wait_for_element_state('stable', timeout=1000) - is_visible = await self._is_visible(element_handle) - if is_visible: - await element_handle.scroll_into_view_if_needed(timeout=1000) - except Exception: - pass - - # Get element properties to determine input method - tag_handle = await element_handle.get_property('tagName') - tag_name = (await tag_handle.json_value()).lower() - is_contenteditable = await element_handle.get_property('isContentEditable') - readonly_handle = await element_handle.get_property('readOnly') - disabled_handle = await element_handle.get_property('disabled') - - readonly = await readonly_handle.json_value() if readonly_handle else False - disabled = await disabled_handle.json_value() if disabled_handle else False - - # always click the element first to make sure it's in the focus - await element_handle.click() - await asyncio.sleep(0.1) - - try: - if (await is_contenteditable.json_value() or tag_name == 'input') and not (readonly or disabled): - await element_handle.evaluate('el => {el.textContent = ""; el.value = "";}') - await element_handle.type(text, delay=5) - else: - await element_handle.fill(text) - except Exception: - # last resort fallback, assume it's already focused after we clicked on it, - # just simulate keypresses on the entire page - page 
= await self.get_current_page() - await page.keyboard.type(text) - - except Exception as e: - logger.debug(f'โŒ Failed to input text into element: {repr(element_node)}. Error: {str(e)}') - raise BrowserError(f'Failed to input text into index {element_node.highlight_index}') - @time_execution_async('--click_element_node') async def _click_element_node(self, element_node: DOMElementNode) -> str | None: """ @@ -973,6 +1043,7 @@ class BrowserSession(BaseModel): page.on('request', on_request) page.on('response', on_response) + now = asyncio.get_event_loop().time() try: # Wait for idle time start_time = asyncio.get_event_loop().time() @@ -996,7 +1067,9 @@ class BrowserSession(BaseModel): page.remove_listener('request', on_request) page.remove_listener('response', on_response) - logger.debug(f'โš–๏ธ Network stabilized for {self.browser_profile.wait_for_network_idle_page_load_time} seconds') + elapsed = now - start_time + if elapsed > 1: + logger.debug(f'๐Ÿ’ค Page network traffic calmed down after {now - start_time:.2f} seconds') async def _wait_for_page_and_frames_load(self, timeout_overwrite: float | None = None): """ @@ -1024,7 +1097,32 @@ class BrowserSession(BaseModel): elapsed = time.time() - start_time remaining = max((timeout_overwrite or self.browser_profile.minimum_wait_page_load_time) - elapsed, 0) - logger.debug(f'--Page loaded in {elapsed:.2f} seconds, waiting for additional {remaining:.2f} seconds') + # just for logging, calculate how much data was downloaded + try: + bytes_used = await page.evaluate(""" + () => { + let total = 0; + for (const entry of performance.getEntriesByType('resource')) { + total += entry.transferSize || 0; + } + for (const nav of performance.getEntriesByType('navigation')) { + total += nav.transferSize || 0; + } + return total; + } + """) + except Exception: + bytes_used = None + + tab_idx = self.tabs.index(page) + if bytes_used is not None: + logger.debug( + f'โžก๏ธ Page navigation [{tab_idx}]{truncate_url(page.url, 40)} used 
{bytes_used / 1024:.1f} KB in {elapsed:.2f}s, waiting +{remaining:.2f}s for all frames to finish' + ) + else: + logger.debug( + f'โžก๏ธ Page navigation [{tab_idx}]{truncate_url(page.url, 40)} took {elapsed:.2f}s, waiting +{remaining:.2f}s for all frames to finish' + ) # Sleep remaining time if needed if remaining > 0: @@ -1032,70 +1130,37 @@ class BrowserSession(BaseModel): def _is_url_allowed(self, url: str) -> bool: """ - Check if a URL is allowed based on the whitelist configuration. + Check if a URL is allowed based on the whitelist configuration. SECURITY CRITICAL. - Supports glob patterns in allowed_domains: + Supports optional glob patterns and schemes in allowed_domains: - *.example.com will match sub.example.com and example.com - *google.com will match google.com, agoogle.com, and www.google.com + - http*://example.com will match http://example.com, https://example.com + - chrome-extension://* will match chrome-extension://aaaaaaaaaaaa and chrome-extension://bbbbbbbbbbbbb """ if not self.browser_profile.allowed_domains: + return True # allowed_domains are not configured, allow everything by default + + # Special case: Always allow 'about:blank' new tab page + if url == 'about:blank': return True - def _show_glob_warning(domain: str, glob: str): - global _GLOB_WARNING_SHOWN - if not _GLOB_WARNING_SHOWN: - logger.warning( - # glob patterns are very easy to mess up and match too many domains by accident - # e.g. if you only need to access gmail, don't use *.google.com because an attacker could convince the agent to visit a malicious doc - # on docs.google.com/s/some/evil/doc to set up a prompt injection attack - f"โš ๏ธ Allowing agent to visit {domain} based on allowed_domains=['{glob}', ...]. Set allowed_domains=['{domain}', ...] explicitly to avoid matching too many domains!" 
- ) - _GLOB_WARNING_SHOWN = True + for allowed_domain in self.browser_profile.allowed_domains: + try: + if match_url_with_domain_pattern(url, allowed_domain, log_warnings=True): + # If it's a pattern with wildcards, show a warning + if '*' in allowed_domain: + parsed_url = urlparse(url) + domain = parsed_url.hostname.lower() if parsed_url.hostname else '' + _show_glob_warning(domain, allowed_domain) + return True + except AssertionError: + # This would only happen if about:blank is passed to match_url_with_domain_pattern, + # which shouldn't occur since we check for it above + continue - try: - import fnmatch - from urllib.parse import urlparse - - parsed_url = urlparse(url) - - # Special case: Allow 'about:blank' explicitly - if url == 'about:blank' or parsed_url.scheme.lower() in ('chrome', 'brave', 'edge', 'chrome-extension'): - return True - - # Extract only the hostname component (without auth credentials or port) - # Hostname returns only the domain portion, ignoring username:password and port - domain = parsed_url.hostname.lower() if parsed_url.hostname else '' - - if not domain: - return False - - for allowed_domain in self.browser_profile.allowed_domains: - allowed_domain = allowed_domain.lower() - - # Handle glob patterns - if '*' in allowed_domain: - # Special handling for *.domain.tld pattern to also match the bare domain - if allowed_domain.startswith('*.'): - # If pattern is *.example.com, also allow example.com (without subdomain) - parent_domain = allowed_domain[2:] # Remove the '*.' 
prefix - if domain == parent_domain or fnmatch.fnmatch(domain, allowed_domain): - _show_glob_warning(domain, allowed_domain) - return True - else: - # For other glob patterns like *google.com - if fnmatch.fnmatch(domain, allowed_domain): - _show_glob_warning(domain, allowed_domain) - return True - else: - # Standard matching (exact or subdomain) - if domain == allowed_domain: - return True - - return False - except Exception as e: - logger.error(f'โ›”๏ธ Error checking URL allowlist: {type(e).__name__}: {e}') - return False + return False async def _check_and_handle_navigation(self, page: Page) -> None: """Check if current page URL is allowed and handle if not.""" @@ -1151,6 +1216,8 @@ class BrowserSession(BaseModel): not necessarily the tab that is visible to the user (human_current_page). If they are the same tab, both references will be updated. """ + assert self.browser_context is not None, 'Browser context is not set' + assert self.agent_current_page is not None, 'Agent current page is not set' # Check if this is the foreground tab as well is_foreground = self.agent_current_page == self.human_current_page @@ -1369,6 +1436,7 @@ class BrowserSession(BaseModel): """ Returns a base64 encoded screenshot of the current page. 
""" + assert self.agent_current_page is not None, 'Agent current page is not set' # We no longer force tabs to the foreground as it disrupts user focus # await self.agent_current_page.bring_to_front() @@ -1391,8 +1459,19 @@ class BrowserSession(BaseModel): # region - User Actions - @classmethod - def _convert_simple_xpath_to_css_selector(cls, xpath: str) -> str: + @staticmethod + async def _get_unique_filename(directory: str, filename: str) -> str: + """Generate a unique filename for downloads by appending (1), (2), etc., if a file already exists.""" + base, ext = os.path.splitext(filename) + counter = 1 + new_filename = filename + while os.path.exists(os.path.join(directory, new_filename)): + new_filename = f'{base} ({counter}){ext}' + counter += 1 + return new_filename + + @staticmethod + def _convert_simple_xpath_to_css_selector(xpath: str) -> str: """Converts simple XPath expressions to CSS selectors.""" if not xpath: return '' @@ -1558,6 +1637,7 @@ class BrowserSession(BaseModel): tag_name = element.tag_name or '*' return f"{tag_name}[highlight_index='{element.highlight_index}']" + @require_initialization @time_execution_async('--is_visible') async def _is_visible(self, element: ElementHandle) -> bool: """ @@ -1573,6 +1653,7 @@ class BrowserSession(BaseModel): return not is_hidden and bbox is not None and bbox['width'] > 0 and bbox['height'] > 0 + @require_initialization @time_execution_async('--get_locate_element') async def get_locate_element(self, element: DOMElementNode) -> ElementHandle | None: page = await self.get_current_page() @@ -1619,6 +1700,7 @@ class BrowserSession(BaseModel): logger.error(f'โŒ Failed to locate element: {str(e)}') return None + @require_initialization @time_execution_async('--get_locate_element_by_xpath') async def get_locate_element_by_xpath(self, xpath: str) -> ElementHandle | None: """ @@ -1639,6 +1721,7 @@ class BrowserSession(BaseModel): logger.error(f'โŒ Failed to locate element by XPath {xpath}: {str(e)}') return None + 
@require_initialization @time_execution_async('--get_locate_element_by_css_selector') async def get_locate_element_by_css_selector(self, css_selector: str) -> ElementHandle | None: """ @@ -1659,6 +1742,7 @@ class BrowserSession(BaseModel): logger.error(f'โŒ Failed to locate element by CSS selector {css_selector}: {str(e)}') return None + @require_initialization @time_execution_async('--get_locate_element_by_text') async def get_locate_element_by_text( self, text: str, nth: int | None = 0, element_type: str | None = None @@ -1757,6 +1841,7 @@ class BrowserSession(BaseModel): @time_execution_async('--switch_to_tab') async def switch_to_tab(self, page_id: int) -> Page: """Switch to a specific tab by its page_id (aka tab index exposed to LLM)""" + assert self.browser_context is not None, 'Browser context is not set' pages = self.browser_context.pages if page_id >= len(pages): @@ -1810,10 +1895,18 @@ class BrowserSession(BaseModel): assert self.human_current_page is not None assert self.agent_current_page is not None - if url: - assert self.agent_current_page.url == url - else: - assert self.agent_current_page.url == 'about:blank' + # if url: # sometimes this does not pass because JS or HTTP redirects the page really fast + # assert self.agent_current_page.url == url + # else: + # assert self.agent_current_page.url == 'about:blank' + + # if there are any unused about:blank tabs after we open a new tab, close them to clean up unused tabs + for page in self.browser_context.pages: + if page.url == 'about:blank' and page != self.agent_current_page: + await page.close() + self.human_current_page = ( # in case we just closed the human's tab, fix the refs + self.human_current_page if not self.human_current_page.is_closed() else self.agent_current_page + ) return new_page @@ -1859,6 +1952,7 @@ class BrowserSession(BaseModel): return False + @require_initialization async def get_scroll_info(self, page: Page) -> tuple[int, int]: """Get scroll position information for the current 
page.""" scroll_y = await page.evaluate('window.scrollY') @@ -1868,6 +1962,7 @@ class BrowserSession(BaseModel): pixels_below = total_height - (scroll_y + viewport_height) return pixels_above, pixels_below + @require_initialization async def _scroll_container(self, pixels: int) -> None: """Scroll the element that truly owns vertical scroll.Starts at the focused node โžœ climbs to the first big, scroll-enabled ancestor otherwise picks the first scrollable element or the root, then calls `element.scrollBy` (or `window.scrollBy` for the root) by the supplied pixel value.""" @@ -1900,13 +1995,95 @@ class BrowserSession(BaseModel): page = await self.get_current_page() await page.evaluate(SMART_SCROLL_JS, pixels) - @staticmethod - async def _get_unique_filename(directory, filename): - """Generate a unique filename by appending (1), (2), etc., if a file already exists.""" - base, ext = os.path.splitext(filename) - counter = 1 - new_filename = filename - while os.path.exists(os.path.join(directory, new_filename)): - new_filename = f'{base} ({counter}){ext}' - counter += 1 - return new_filename + # --- DVD Screensaver Loading Animation Helper --- + async def _show_dvd_screensaver_loading_animation(self, page: Page) -> None: + """ + Injects a DVD screensaver-style bouncing logo loading animation overlay into the given Playwright Page. + This is used to visually indicate that the browser is setting up or waiting. 
+ """ + await page.evaluate("""() => { + document.title = 'Setting up...'; + + // Create the main overlay + const loadingOverlay = document.createElement('div'); + loadingOverlay.id = 'pretty-loading-animation'; + loadingOverlay.style.position = 'fixed'; + loadingOverlay.style.top = '0'; + loadingOverlay.style.left = '0'; + loadingOverlay.style.width = '100vw'; + loadingOverlay.style.height = '100vh'; + loadingOverlay.style.background = '#000'; + loadingOverlay.style.zIndex = '99999'; + loadingOverlay.style.overflow = 'hidden'; + + // Create the image element + const img = document.createElement('img'); + img.src = 'https://github.com/browser-use.png'; + img.alt = 'Browser-Use'; + img.style.width = '200px'; + img.style.height = 'auto'; + img.style.position = 'absolute'; + img.style.left = '0px'; + img.style.top = '0px'; + img.style.zIndex = '2'; + img.style.opacity = '0.8'; + + loadingOverlay.appendChild(img); + document.body.appendChild(loadingOverlay); + + // DVD screensaver bounce logic + let x = Math.random() * (window.innerWidth - 300); + let y = Math.random() * (window.innerHeight - 300); + let dx = 1.2 + Math.random() * 0.4; // px per frame + let dy = 1.2 + Math.random() * 0.4; + // Randomize direction + if (Math.random() > 0.5) dx = -dx; + if (Math.random() > 0.5) dy = -dy; + + function animate() { + const imgWidth = img.offsetWidth || 300; + const imgHeight = img.offsetHeight || 300; + x += dx; + y += dy; + + if (x <= 0) { + x = 0; + dx = Math.abs(dx); + } else if (x + imgWidth >= window.innerWidth) { + x = window.innerWidth - imgWidth; + dx = -Math.abs(dx); + } + if (y <= 0) { + y = 0; + dy = Math.abs(dy); + } else if (y + imgHeight >= window.innerHeight) { + y = window.innerHeight - imgHeight; + dy = -Math.abs(dy); + } + + img.style.left = `${x}px`; + img.style.top = `${y}px`; + + requestAnimationFrame(animate); + } + animate(); + + // Responsive: update bounds on resize + window.addEventListener('resize', () => { + x = Math.min(x, window.innerWidth - 
img.offsetWidth); + y = Math.min(y, window.innerHeight - img.offsetHeight); + }); + + // Add a little CSS for smoothness + const style = document.createElement('style'); + style.innerHTML = ` + #pretty-loading-animation { + /*backdrop-filter: blur(2px) brightness(0.9);*/ + } + #pretty-loading-animation img { + user-select: none; + pointer-events: none; + } + `; + document.head.appendChild(style); + }""") diff --git a/browser_use/browser/views.py b/browser_use/browser/views.py index c171c6b54..de2a65245 100644 --- a/browser_use/browser/views.py +++ b/browser_use/browser/views.py @@ -28,7 +28,7 @@ class BrowserStateSummary(DOMState): url: str title: str tabs: list[TabInfo] - screenshot: str | None = None + screenshot: str | None = field(default=None, repr=False) pixels_above: int = 0 pixels_below: int = 0 browser_errors: list[str] = field(default_factory=list) diff --git a/browser_use/controller/registry/service.py b/browser_use/controller/registry/service.py index 58a0a49ef..c58198145 100644 --- a/browser_use/controller/registry/service.py +++ b/browser_use/controller/registry/service.py @@ -1,10 +1,12 @@ import asyncio import logging +import re from collections.abc import Callable from inspect import iscoroutinefunction, signature from typing import Any, Generic, Optional, TypeVar from langchain_core.language_models.chat_models import BaseChatModel +from playwright.async_api import Page from pydantic import BaseModel, Field, create_model from browser_use.browser import BrowserSession @@ -18,13 +20,33 @@ from browser_use.telemetry.views import ( ControllerRegisteredFunctionsTelemetryEvent, RegisteredFunction, ) -from browser_use.utils import time_execution_async +from browser_use.utils import match_url_with_domain_pattern, time_execution_async Context = TypeVar('Context') logger = logging.getLogger(__name__) +class SpecialActionParameters(BaseModel): + """Model defining all special parameters that can be injected into actions""" + + model_config = 
{'arbitrary_types_allowed': True} + + context: Context | None = None + browser_session: BrowserSession | None = None + browser: BrowserSession | None = None # legacy support + browser_context: BrowserSession | None = None # legacy support + page: Page | None = None + page_extraction_llm: BaseChatModel | None = None + available_file_paths: list[str] | None = None + has_sensitive_data: bool = False + + @classmethod + def get_browser_requiring_params(cls) -> set[str]: + """Get parameter names that require browser_session""" + return {'browser_session', 'browser', 'browser_context', 'page'} + + class Registry(Generic[Context]): """Service for registering and managing actions""" @@ -37,14 +59,11 @@ class Registry(Generic[Context]): def _create_param_model(self, function: Callable) -> type[BaseModel]: """Creates a Pydantic model from function signature""" sig = signature(function) + special_param_names = set(SpecialActionParameters.model_fields.keys()) params = { name: (param.annotation, ... if param.default == param.empty else param.default) for name, param in sig.parameters.items() - if name != 'browser' - and name != 'page_extraction_llm' - and name != 'available_file_paths' - and name != 'browser_session' - and name != 'browser_context' + if name not in special_param_names } # TODO: make the types here work return create_model( @@ -58,9 +77,15 @@ class Registry(Generic[Context]): description: str, param_model: type[BaseModel] | None = None, domains: list[str] | None = None, + allowed_domains: list[str] | None = None, page_filter: Callable[[Any], bool] | None = None, ): """Decorator for registering actions""" + # Handle aliases: domains and allowed_domains are the same parameter + if allowed_domains is not None and domains is not None: + raise ValueError("Cannot specify both 'domains' and 'allowed_domains' - they are aliases for the same parameter") + + final_domains = allowed_domains if allowed_domains is not None else domains def decorator(func: Callable): # Skip 
registration if action is in exclude_actions @@ -89,7 +114,7 @@ class Registry(Generic[Context]): description=description, function=wrapped_func, param_model=actual_param_model, - domains=domains, + domains=final_domains, page_filter=page_filter, ) self.registry.actions[func.__name__] = action @@ -104,12 +129,12 @@ class Registry(Generic[Context]): params: dict, browser_session: BrowserSession | None = None, page_extraction_llm: BaseChatModel | None = None, - sensitive_data: dict[str, str] | None = None, + sensitive_data: dict[str, str | dict[str, str]] | None = None, available_file_paths: list[str] | None = None, # context: Context | None = None, ) -> Any: - """Execute a registered action""" + """Execute a registered action with enhanced parameter handling for backward compatibility""" if action_name not in self.registry.actions: raise ValueError(f'Action {action_name} not found') @@ -121,77 +146,184 @@ class Registry(Generic[Context]): except Exception as e: raise ValueError(f'Invalid parameters {params} for action {action_name}: {type(e)}: {e}') from e - # Check if the first parameter is a Pydantic model + # Analyze function signature for smart parameter injection sig = signature(action.function) parameters = list(sig.parameters.values()) - is_pydantic = parameters and issubclass(parameters[0].annotation, BaseModel) parameter_names = [param.name for param in parameters] - if sensitive_data: - validated_params = self._replace_sensitive_data(validated_params, sensitive_data) + # Check if the first parameter is a Pydantic model (using original safe logic) + # Only consider it pydantic if: + # 1. There are parameters + # 2. First parameter has a BaseModel annotation + # 3. 
AND the function signature actually takes a BaseModel as first param (not auto-generated) + try: + is_pydantic = ( + parameters + and len(parameters) > 0 + and hasattr(parameters[0], 'annotation') + and parameters[0].annotation != parameters[0].empty + and issubclass(parameters[0].annotation, BaseModel) + and + # Additional check: make sure the first parameter name suggests it's actually a pydantic model + parameters[0].name in ['params', 'param', 'model'] + or parameters[0].name.endswith('_model') + ) + except (TypeError, AttributeError): + is_pydantic = False - # Check if the action requires browser + if sensitive_data: + validated_params = self._replace_sensitive_data(validated_params, sensitive_data, browser_session) + + # Check if the action requires special parameters and validate they're provided if ( - 'browser_session' in parameter_names or 'browser' in parameter_names or 'browser_context' in parameter_names + 'browser_session' in parameter_names + or 'browser' in parameter_names + or 'browser_context' in parameter_names + or 'page' in parameter_names ) and not browser_session: raise ValueError(f'Action {action_name} requires browser_session but none provided.') if 'page_extraction_llm' in parameter_names and not page_extraction_llm: raise ValueError(f'Action {action_name} requires page_extraction_llm but none provided.') if 'available_file_paths' in parameter_names and not available_file_paths: raise ValueError(f'Action {action_name} requires available_file_paths but none provided.') - if 'context' in parameter_names and not context: raise ValueError(f'Action {action_name} requires context but none provided.') - # Prepare arguments based on parameter type - extra_args = {} - if 'context' in parameter_names: - extra_args['context'] = context - if 'browser_session' in parameter_names: - extra_args['browser_session'] = browser_session - if 'browser' in parameter_names: # support legacy browser: BrowserContext arg + # Create special parameters model with all 
available values + special_params_data = { + 'context': context, + 'browser_session': browser_session, + 'browser': browser_session, # legacy support + 'browser_context': browser_session, # legacy support + 'page_extraction_llm': page_extraction_llm, + 'available_file_paths': available_file_paths, + 'has_sensitive_data': action_name == 'input_text' and bool(sensitive_data), + } + + # Handle async page parameter if needed + if 'page' in parameter_names and browser_session: + special_params_data['page'] = await browser_session.get_current_page() + + # Create special parameters object without validation to preserve BrowserSession state + # We bypass model_validate to avoid copying BrowserSession and losing private attributes + special_params = SpecialActionParameters.model_construct(**special_params_data) + + # Log legacy usage + if 'browser' in parameter_names: logger.debug( - f'You should update this action {action_name}(browser: BrowserContext) -> to take {action_name}(browser_session: BrowserSession) instead' + f'You should update this action {action_name}(browser: BrowserContext) -> to take {action_name}(browser_session: BrowserSession) instead' ) - extra_args['browser'] = browser_session - if 'browser_context' in parameter_names: # support legacy browser: BrowserContext arg + if 'browser_context' in parameter_names: logger.debug( - f'You should update this action {action_name}(browser_context: BrowserContext) -> to take {action_name}(browser_session: BrowserSession) instead' + f'You should update this action {action_name}(browser_context: BrowserContext) -> to take {action_name}(browser_session: BrowserSession) instead' ) - extra_args['browser_context'] = browser_session - if 'page_extraction_llm' in parameter_names: - extra_args['page_extraction_llm'] = page_extraction_llm - if 'available_file_paths' in parameter_names: - extra_args['available_file_paths'] = available_file_paths - if action_name == 'input_text' and sensitive_data: - 
extra_args['has_sensitive_data'] = True + + # Enhanced parameter injection logic using Pydantic if is_pydantic: - return await action.function(validated_params, **extra_args) - return await action.function(**validated_params.model_dump(), **extra_args) + # For pydantic functions: function(pydantic_model, **special_params) + # Extract special parameters needed by this function (keep objects, don't serialize) + needed_special_params = set(parameter_names[1:]) & set(SpecialActionParameters.model_fields.keys()) + injection_params = {} + for param_name in needed_special_params: + value = getattr(special_params, param_name, None) + if value is not None: + injection_params[param_name] = value + + return await action.function(validated_params, **injection_params) + else: + # For individual parameter functions: function(**all_params) + # Merge user params with needed special params, avoiding conflicts + param_dict = validated_params.model_dump() + + # Extract special parameters needed by this function (keep objects, don't serialize) + needed_special_params = set(parameter_names) & set(SpecialActionParameters.model_fields.keys()) + injection_params = {} + for param_name in needed_special_params: + value = getattr(special_params, param_name, None) + if value is not None: + injection_params[param_name] = value + + # Remove any special params from user params to avoid conflicts (special params take precedence) + for param_name in injection_params: + if param_name in param_dict: + logger.debug(f'Removing {param_name} from param_dict to avoid conflict') + param_dict.pop(param_name) + + # Combine all parameters + final_params = {**param_dict, **injection_params} + return await action.function(**final_params) except Exception as e: raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e - def _replace_sensitive_data(self, params: BaseModel, sensitive_data: dict[str, str]) -> BaseModel: - """Replaces the sensitive data in the params""" - # if there are any str 
with placeholder in the params, replace them with the actual value from sensitive_data + def _log_sensitive_data_usage(self, placeholders_used: set[str], current_url: str | None) -> None: + """Log when sensitive data is being used on a page""" + if placeholders_used: + url_info = f' on {current_url}' if current_url and current_url != 'about:blank' else '' + logger.info(f'๐Ÿ”’ Using sensitive data placeholders: {", ".join(sorted(placeholders_used))}{url_info}') - import logging - import re + def _replace_sensitive_data( + self, params: BaseModel, sensitive_data: dict[str, Any], browser_session: BrowserSession = None + ) -> BaseModel: + """ + Replaces sensitive data placeholders in params with actual values. - logger = logging.getLogger(__name__) + Args: + params: The parameter object containing placeholder tags + sensitive_data: Dictionary of sensitive data, either in old format {key: value} + or new format {domain_pattern: {key: value}} + browser_session: Optional browser session to get the current URL for domain matching + + Returns: + BaseModel: The parameter object with placeholders replaced by actual values + """ secret_pattern = re.compile(r'(.*?)') # Set to track all missing placeholders across the full object all_missing_placeholders = set() + # Set to track successfully replaced placeholders + replaced_placeholders = set() + + # Determine current URL if browser_session is provided + current_url = None + if browser_session: + try: + # Get current URL from browser session - do this synchronously to avoid complications + loop = asyncio.get_event_loop() + current_page = loop.run_until_complete(browser_session.get_current_page()) + current_url = current_page.url if current_page else None + except Exception as e: + logger.debug(f'Failed to get current URL from browser session: {e}') + + # Process sensitive data based on format and current URL + applicable_secrets = {} + + for domain_or_key, content in sensitive_data.items(): + if isinstance(content, dict): + # 
New format: {domain_pattern: {key: value}} + # Only include secrets for domains that match the current URL + if current_url is None: + # No URL available, include all secrets for all domains + applicable_secrets.update(content) + elif current_url != 'about:blank': + # Don't expose domain-specific secrets on about:blank + if match_url_with_domain_pattern(current_url, domain_or_key): + applicable_secrets.update(content) + else: + # Old format: {key: value} + applicable_secrets[domain_or_key] = content + + # Filter out empty values + applicable_secrets = {k: v for k, v in applicable_secrets.items() if v} def replace_secrets(value): if isinstance(value, str): matches = secret_pattern.findall(value) for placeholder in matches: - if placeholder in sensitive_data and sensitive_data[placeholder]: - value = value.replace(f'{placeholder}', sensitive_data[placeholder]) + if placeholder in applicable_secrets: + value = value.replace(f'{placeholder}', applicable_secrets[placeholder]) + replaced_placeholders.add(placeholder) else: # Keep track of missing placeholders all_missing_placeholders.add(placeholder) @@ -207,6 +339,9 @@ class Registry(Generic[Context]): params_dump = params.model_dump() processed_params = replace_secrets(params_dump) + # Log sensitive data usage + self._log_sensitive_data_usage(replaced_placeholders, current_url) + # Log a warning if any placeholders are missing if all_missing_placeholders: logger.warning(f'Missing or empty keys in sensitive_data dictionary: {", ".join(all_missing_placeholders)}') diff --git a/browser_use/controller/registry/views.py b/browser_use/controller/registry/views.py index 8f41cb948..cd13f1cf4 100644 --- a/browser_use/controller/registry/views.py +++ b/browser_use/controller/registry/views.py @@ -76,7 +76,7 @@ class ActionRegistry(BaseModel): Match a list of domain glob patterns against a URL. 
Args: - domain_patterns: A list of domain patterns that can include glob patterns (* wildcard) + domains: A list of domain patterns that can include glob patterns (* wildcard) url: The URL to match against Returns: @@ -86,26 +86,13 @@ class ActionRegistry(BaseModel): if domains is None or not url: return True - import fnmatch - from urllib.parse import urlparse + # Use the centralized URL matching logic from utils + from browser_use.utils import match_url_with_domain_pattern - # Parse the URL to get the domain - try: - parsed_url = urlparse(url) - if not parsed_url.netloc: - return False - - domain = parsed_url.netloc - # Remove port if present - if ':' in domain: - domain = domain.split(':')[0] - - for domain_pattern in domains: - if fnmatch.fnmatch(domain, domain_pattern): # Perform glob *.matching.* - return True - return False - except Exception: - return False + for domain_pattern in domains: + if match_url_with_domain_pattern(url, domain_pattern): + return True + return False @staticmethod def _match_page_filter(page_filter: Callable[[Page], bool] | None, page: Page) -> bool: diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py index 08e2a6d4f..16a8f59c6 100644 --- a/browser_use/controller/service.py +++ b/browser_use/controller/service.py @@ -79,18 +79,19 @@ class Controller(Generic[Context]): # Basic Navigation Actions @self.registry.action( - 'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. 
', + 'Search the query in Google, the query should be a search query like humans search in Google, concrete and not vague or super long.', param_model=SearchGoogleAction, ) async def search_google(params: SearchGoogleAction, browser_session: BrowserSession): search_url = f'https://www.google.com/search?q={params.query}&udm=14' page = await browser_session.get_current_page() - if page: + if page.url in ('about:blank', 'https://www.google.com'): await page.goto(search_url) await page.wait_for_load_state() else: page = await browser_session.create_new_tab(search_url) + msg = f'๐Ÿ” Searched for "{params.query}" in Google' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) @@ -108,7 +109,7 @@ class Controller(Generic[Context]): return ActionResult(extracted_content=msg, include_in_memory=True) @self.registry.action('Go back', param_model=NoParamsAction) - async def go_back(_: NoParamsAction, browser_session: BrowserSession): + async def go_back(params: NoParamsAction, browser_session: BrowserSession): await browser_session.go_back() msg = '๐Ÿ”™ Navigated back' logger.info(msg) @@ -179,9 +180,7 @@ class Controller(Generic[Context]): return ActionResult(extracted_content=msg, include_in_memory=True) # Save PDF - @self.registry.action( - 'Save the current page as a PDF file', - ) + @self.registry.action('Save the current page as a PDF file') async def save_pdf(browser_session: BrowserSession): page = await browser_session.get_current_page() short_url = re.sub(r'^https?://(?:www\.)?|/$', '', page.url) @@ -205,7 +204,7 @@ class Controller(Generic[Context]): logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) - @self.registry.action('Open url in new tab', param_model=OpenTabAction) + @self.registry.action('Open a specific url in new tab', param_model=OpenTabAction) async def open_tab(params: OpenTabAction, browser_session: BrowserSession): await browser_session.create_new_tab(params.url) msg = f'๐Ÿ”— Opened new 
tab with {params.url}' @@ -218,22 +217,27 @@ class Controller(Generic[Context]): page = await browser_session.get_current_page() url = page.url await page.close() - msg = f'โŒ Closed tab #{params.page_id} with url {url}' + new_page = await browser_session.get_current_page() + new_page_idx = browser_session.tabs.index(new_page) + msg = f'โŒ Closed tab #{params.page_id} with {url}, now focused on tab #{new_page_idx} with url {new_page.url}' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) # Content Actions @self.registry.action( - 'Extract page content to retrieve specific information from the page, e.g. all company names, a specific description, all information about, links with companies in structured format or simply links', + 'Extract page content to retrieve specific information from the page, e.g. all company names, a specific description, all information about xyc, 4 links with companies in structured format. Use include_links true if the goal requires links', ) async def extract_content( - goal: str, should_strip_link_urls: bool, browser_session: BrowserSession, page_extraction_llm: BaseChatModel + goal: str, + browser_session: BrowserSession, + page_extraction_llm: BaseChatModel, + include_links: bool = False, ): page = await browser_session.get_current_page() import markdownify strip = [] - if should_strip_link_urls: + if not include_links: strip = ['a', 'img'] content = markdownify.markdownify(await page.content(), strip=strip) @@ -257,6 +261,28 @@ class Controller(Generic[Context]): logger.info(msg) return ActionResult(extracted_content=msg) + @self.registry.action( + 'Get the accessibility tree of the page in the format "role name" with the number_of_elements to return', + ) + async def get_ax_tree(number_of_elements: int, browser_session: BrowserSession): + page = await browser_session.get_current_page() + node = await page.accessibility.snapshot(interesting_only=True) + + def flatten_ax_tree(node, lines): + if not 
node: + return + role = node.get('role', '') + name = node.get('name', '') + lines.append(f'{role} {name}') + for child in node.get('children', []): + flatten_ax_tree(child, lines) + + lines = [] + flatten_ax_tree(node, lines) + msg = '\n'.join(lines) + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=False) + @self.registry.action( 'Scroll down the page by pixel amount - if none is given, scroll one page', param_model=ScrollAction, @@ -343,7 +369,7 @@ class Controller(Generic[Context]): if await locator.count() == 0: continue - element = await locator.first + element = locator.first is_visible = await element.is_visible() bbox = await element.bounding_box() @@ -747,8 +773,8 @@ class Controller(Generic[Context]): logger.error(error_msg) return ActionResult(error=error_msg, include_in_memory=True) - @self.registry.action('Google Sheets: Get the contents of the entire sheet', domains=['sheets.google.com']) - async def get_sheet_contents(browser_session: BrowserSession): + @self.registry.action('Google Sheets: Get the contents of the entire sheet', domains=['https://docs.google.com']) + async def read_sheet_contents(browser_session: BrowserSession): page = await browser_session.get_current_page() # select all cells @@ -760,7 +786,44 @@ class Controller(Generic[Context]): extracted_tsv = await page.evaluate('() => navigator.clipboard.readText()') return ActionResult(extracted_content=extracted_tsv, include_in_memory=True) - @self.registry.action('Google Sheets: Select a specific cell or range of cells', domains=['sheets.google.com']) + @self.registry.action('Google Sheets: Get the contents of a cell or range of cells', domains=['https://docs.google.com']) + async def read_cell_contents(browser_session: BrowserSession, cell_or_range: str): + page = await browser_session.get_current_page() + + await select_cell_or_range(browser_session, cell_or_range) + + await page.keyboard.press('ControlOrMeta+C') + await asyncio.sleep(0.1) + 
extracted_tsv = await page.evaluate('() => navigator.clipboard.readText()') + return ActionResult(extracted_content=extracted_tsv, include_in_memory=True) + + @self.registry.action( + 'Google Sheets: Update the content of a cell or range of cells', domains=['https://docs.google.com'] + ) + async def update_cell_contents(browser_session: BrowserSession, cell_or_range: str, new_contents_tsv: str): + page = await browser_session.get_current_page() + + await select_cell_or_range(browser_session, cell_or_range) + + # simulate paste event from clipboard with TSV content + await page.evaluate(f""" + const clipboardData = new DataTransfer(); + clipboardData.setData('text/plain', `{new_contents_tsv}`); + document.activeElement.dispatchEvent(new ClipboardEvent('paste', {{clipboardData}})); + """) + + return ActionResult(extracted_content=f'Updated cells: {cell_or_range} = {new_contents_tsv}', include_in_memory=False) + + @self.registry.action('Google Sheets: Clear whatever cells are currently selected', domains=['https://docs.google.com']) + async def clear_cell_contents(browser_session: BrowserSession, cell_or_range: str): + page = await browser_session.get_current_page() + + await select_cell_or_range(browser_session, cell_or_range) + + await page.keyboard.press('Backspace') + return ActionResult(extracted_content=f'Cleared cells: {cell_or_range}', include_in_memory=False) + + @self.registry.action('Google Sheets: Select a specific cell or range of cells', domains=['https://docs.google.com']) async def select_cell_or_range(browser_session: BrowserSession, cell_or_range: str): page = await browser_session.get_current_page() @@ -777,30 +840,13 @@ class Controller(Generic[Context]): await page.keyboard.press('Enter') await asyncio.sleep(0.2) await page.keyboard.press('Escape') # to make sure the popup still closes in the case where the jump failed - return ActionResult(extracted_content=f'Selected cell {cell_or_range}', include_in_memory=False) + return 
ActionResult(extracted_content=f'Selected cells: {cell_or_range}', include_in_memory=False) @self.registry.action( - 'Google Sheets: Get the contents of a specific cell or range of cells', domains=['sheets.google.com'] + 'Google Sheets: Fallback method to type text into (only one) currently selected cell', + domains=['https://docs.google.com'], ) - async def get_range_contents(browser_session: BrowserSession, cell_or_range: str): - page = await browser_session.get_current_page() - - await select_cell_or_range(browser_session, cell_or_range) - - await page.keyboard.press('ControlOrMeta+C') - await asyncio.sleep(0.1) - extracted_tsv = await page.evaluate('() => navigator.clipboard.readText()') - return ActionResult(extracted_content=extracted_tsv, include_in_memory=True) - - @self.registry.action('Google Sheets: Clear the currently selected cells', domains=['sheets.google.com']) - async def clear_selected_range(browser_session: BrowserSession): - page = await browser_session.get_current_page() - - await page.keyboard.press('Backspace') - return ActionResult(extracted_content='Cleared selected range', include_in_memory=False) - - @self.registry.action('Google Sheets: Input text into the currently selected cell', domains=['sheets.google.com']) - async def input_selected_cell_text(browser_session: BrowserSession, text: str): + async def fallback_input_into_single_selected_cell(browser_session: BrowserSession, text: str): page = await browser_session.get_current_page() await page.keyboard.type(text, delay=0.1) @@ -808,21 +854,6 @@ class Controller(Generic[Context]): await page.keyboard.press('ArrowUp') return ActionResult(extracted_content=f'Inputted text {text}', include_in_memory=False) - @self.registry.action('Google Sheets: Batch update a range of cells', domains=['sheets.google.com']) - async def update_range_contents(browser_session: BrowserSession, range: str, new_contents_tsv: str): - page = await browser_session.get_current_page() - - await 
select_cell_or_range(browser_session, range) - - # simulate paste event from clipboard with TSV content - await page.evaluate(f""" - const clipboardData = new DataTransfer(); - clipboardData.setData('text/plain', `{new_contents_tsv}`); - document.activeElement.dispatchEvent(new ClipboardEvent('paste', {{clipboardData}})); - """) - - return ActionResult(extracted_content=f'Updated cell {range} with {new_contents_tsv}', include_in_memory=False) - # Register --------------------------------------------------------------- def action(self, description: str, **kwargs): diff --git a/browser_use/dom/buildDomTree.js b/browser_use/dom/buildDomTree.js index 93c4fcafd..fe8d18f58 100644 --- a/browser_use/dom/buildDomTree.js +++ b/browser_use/dom/buildDomTree.js @@ -824,30 +824,12 @@ if (hasInteractiveRole) return true; - // check whether element has event listeners - try { - if (typeof getEventListeners === 'function') { - const listeners = getEventListeners(element); - const mouseEvents = ['click', 'mousedown', 'mouseup', 'dblclick']; - for (const eventType of mouseEvents) { - for (const listener of listeners) { - if (listener.type === eventType) { - return true; // Found a mouse interaction listener - } - } - } - } else { - // Fallback: Check common event attributes if getEventListeners is not available - const commonMouseAttrs = ['onclick', 'onmousedown', 'onmouseup', 'ondblclick']; - for (const attr of commonMouseAttrs) { - if (element.hasAttribute(attr) || typeof element[attr] === 'function') { - return true; - } - } + // Check common event attributes (getEventListeners doesn't work in page.evaluate context) + const commonMouseAttrs = ['onclick', 'onmousedown', 'onmouseup', 'ondblclick']; + for (const attr of commonMouseAttrs) { + if (element.hasAttribute(attr) || typeof element[attr] === 'function') { + return true; } - } catch (e) { - // console.warn(`Could not check event listeners for ${element.tagName}:`, e); - // If checking listeners fails, rely on other checks } 
return false @@ -1116,29 +1098,11 @@ if (element.hasAttribute('onclick') || typeof element.onclick === 'function') { return true; } - // Check for other common interaction event listeners - try { - const getEventListeners = window.getEventListenersForNode; - if (typeof getEventListeners === 'function') { - const listeners = getEventListeners(element); - const interactionEvents = ['click', 'mousedown', 'mouseup', 'keydown', 'keyup', 'submit', 'change', 'input', 'focus', 'blur']; - for (const eventType of interactionEvents) { - for (const listener of listeners) { - if (listener.type === eventType) { - return true; // Found a common interaction listener - } - } - } - } else { - // Fallback: Check common event attributes if getEventListeners is not available - const commonEventAttrs = ['onmousedown', 'onmouseup', 'onkeydown', 'onkeyup', 'onsubmit', 'onchange', 'oninput', 'onfocus', 'onblur']; - if (commonEventAttrs.some(attr => element.hasAttribute(attr))) { - return true; - } - } - } catch (e) { - // console.warn(`Could not check event listeners for ${element.tagName}:`, e); - // If checking listeners fails, rely on other checks + + // Check common event attributes (getEventListenersForNode doesn't work in page.evaluate context) + const commonEventAttrs = ['onmousedown', 'onmouseup', 'onkeydown', 'onkeyup', 'onsubmit', 'onchange', 'oninput', 'onfocus', 'onblur']; + if (commonEventAttrs.some(attr => element.hasAttribute(attr))) { + return true; } // if the element is not strictly interactive but appears clickable based on heuristic signals diff --git a/browser_use/dom/service.py b/browser_use/dom/service.py index e75bc62b3..402d8e16e 100644 --- a/browser_use/dom/service.py +++ b/browser_use/dom/service.py @@ -1,4 +1,3 @@ -import json import logging from dataclasses import dataclass from importlib import resources @@ -105,10 +104,27 @@ class DomService: # Only log performance metrics in debug mode if debug_mode and 'perfMetrics' in eval_page: + perf = 
eval_page['perfMetrics'] + + # Get key metrics for summary + total_nodes = perf.get('nodeMetrics', {}).get('totalNodes', 0) + # processed_nodes = perf.get('nodeMetrics', {}).get('processedNodes', 0) + + # Count interactive elements from the DOM map + interactive_count = 0 + if 'map' in eval_page: + for node_data in eval_page['map'].values(): + if isinstance(node_data, dict) and node_data.get('isInteractive'): + interactive_count += 1 + + # Create concise summary + url_short = self.page.url[:50] + '...' if len(self.page.url) > 50 else self.page.url logger.debug( - 'DOM Tree Building Performance Metrics for: %s\n%s', - self.page.url, - json.dumps(eval_page['perfMetrics'], indent=2), + '๐Ÿ”Ž Ran buildDOMTree.js interactive element detection on: %s interactive=%d/%d', + url_short, + interactive_count, + total_nodes, + # processed_nodes, ) return await self._construct_dom_tree(eval_page) diff --git a/browser_use/dom/tests/test_accessibility_playground.py b/browser_use/dom/tests/test_accessibility_playground.py new file mode 100644 index 000000000..7ca3b2996 --- /dev/null +++ b/browser_use/dom/tests/test_accessibility_playground.py @@ -0,0 +1,95 @@ +""" +Accessibility Tree Playground for browser-use + +- Launches a browser and navigates to a target URL (default: amazon.com) +- Extracts both the full and interesting-only accessibility trees using Playwright +- Prints and saves both trees to JSON files +- Recursively prints relevant info for each node (role, name, value, description, focusable, focused, checked, selected, disabled, children count) +- Explains the difference between the accessibility tree and the DOM tree +- Notes on React/Vue/SPA apps +- Easy to modify for your own experiments + +Run with: python browser_use/dom/tests/test_accessibility_playground.py +""" + +import asyncio + +from playwright.async_api import async_playwright + +# Change this to any site you want to test + + +# Helper to recursively print relevant info from the accessibility tree +def 
print_ax_tree(node, depth=0): + if not node: + return + indent = ' ' * depth + info = [ + f'role={node.get("role")!r}', + f'name={node.get("name")!r}' if node.get('name') else None, + f'value={node.get("value")!r}' if node.get('value') else None, + f'desc={node.get("description")!r}' if node.get('description') else None, + f'focusable={node.get("focusable")!r}' if 'focusable' in node else None, + f'focused={node.get("focused")!r}' if 'focused' in node else None, + f'checked={node.get("checked")!r}' if 'checked' in node else None, + f'selected={node.get("selected")!r}' if 'selected' in node else None, + f'disabled={node.get("disabled")!r}' if 'disabled' in node else None, + f'children={len(node.get("children", []))}' if node.get('children') else None, + ] + print('--------------------------------') + print(indent + ', '.join([x for x in info if x])) + for child in node.get('children', []): + print_ax_tree(child, depth + 1) + + +# Helper to print all available accessibility node attributes +# Prints all key-value pairs for each node (except 'children'), then recurses into children +def print_all_fields(node, depth=0): + if not node: + return + indent = ' ' * depth + for k, v in node.items(): + if k != 'children': + print(f'{indent}{k}: {v!r}') + if 'children' in node: + print(f'{indent}children: {len(node["children"])}') + for child in node['children']: + print_all_fields(child, depth + 1) + + +def flatten_ax_tree(node, lines): + if not node: + return + role = node.get('role', '') + name = node.get('name', '') + lines.append(f'{role} {name}') + for child in node.get('children', []): + flatten_ax_tree(child, lines) + + +async def get_ax_tree(TARGET_URL): + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + print(f'Navigating to {TARGET_URL}') + await page.goto(TARGET_URL, wait_until='domcontentloaded') + + ax_tree_interesting = await page.accessibility.snapshot(interesting_only=True) + lines = [] 
+ flatten_ax_tree(ax_tree_interesting, lines) + print(lines) + print(f'length of ax_tree_interesting: {len(lines)}') + + await browser.close() + + +if __name__ == '__main__': + TARGET_URL = [ + # 'https://amazon.com/', + # 'https://www.google.com/', + # 'https://www.facebook.com/', + # 'https://platform.openai.com/tokenizer', + 'https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/input/checkbox', + ] + for url in TARGET_URL: + asyncio.run(get_ax_tree(url)) diff --git a/browser_use/logging_config.py b/browser_use/logging_config.py index bdbea35c1..dc1f86085 100644 --- a/browser_use/logging_config.py +++ b/browser_use/logging_config.py @@ -1,6 +1,7 @@ import logging import os import sys +import warnings from dotenv import load_dotenv @@ -59,6 +60,12 @@ def addLoggingLevel(levelName, levelNum, methodName=None): def setup_logging(): + # Suppress specific deprecation warnings from FAISS + warnings.filterwarnings('ignore', category=DeprecationWarning, module='faiss.loader') + warnings.filterwarnings('ignore', message='builtin type SwigPyPacked has no __module__ attribute') + warnings.filterwarnings('ignore', message='builtin type SwigPyObject has no __module__ attribute') + warnings.filterwarnings('ignore', message='builtin type swigvarlink has no __module__ attribute') + # Try to add RESULT level, but ignore if it already exists try: addLoggingLevel('RESULT', 35) # This allows ERROR, FATAL and CRITICAL @@ -110,8 +117,8 @@ def setup_logging(): logger = logging.getLogger('browser_use') # logger.info('BrowserUse logging setup complete with level %s', log_type) - # Silence third-party loggers - for logger in [ + # Silence or adjust third-party loggers + third_party_loggers = [ 'WDM', 'httpx', 'selenium', @@ -119,6 +126,8 @@ def setup_logging(): 'urllib3', 'asyncio', 'langchain', + 'langsmith', + 'langsmith.client', 'openai', 'httpcore', 'charset_normalizer', @@ -126,7 +135,12 @@ def setup_logging(): 'PIL.PngImagePlugin', 'trafilatura.htmlprocessing', 
'trafilatura', - ]: - third_party = logging.getLogger(logger) + 'mem0', + 'mem0.vector_stores.faiss', + 'mem0.vector_stores', + 'mem0.memory', + ] + for logger_name in third_party_loggers: + third_party = logging.getLogger(logger_name) third_party.setLevel(logging.ERROR) third_party.propagate = False diff --git a/browser_use/utils.py b/browser_use/utils.py index d62ef31c5..865709b05 100644 --- a/browser_use/utils.py +++ b/browser_use/utils.py @@ -5,9 +5,11 @@ import platform import signal import time from collections.abc import Callable, Coroutine +from fnmatch import fnmatch from functools import wraps from sys import stderr from typing import Any, ParamSpec, TypeVar +from urllib.parse import urlparse logger = logging.getLogger(__name__) @@ -304,7 +306,9 @@ def time_execution_sync(additional_text: str = '') -> Callable[[Callable[P, R]], start_time = time.time() result = func(*args, **kwargs) execution_time = time.time() - start_time - logger.debug(f'{additional_text} Execution time: {execution_time:.2f} seconds') + # Only log if execution takes more than 0.25 seconds + if execution_time > 0.25: + logger.debug(f'โณ {additional_text.strip("-")}() took {execution_time:.2f}s') return result return wrapper @@ -321,7 +325,10 @@ def time_execution_async( start_time = time.time() result = await func(*args, **kwargs) execution_time = time.time() - start_time - logger.debug(f'{additional_text} Execution time: {execution_time:.2f} seconds') + # Only log if execution takes more than 0.25 seconds to avoid spamming the logs + # you can lower this threshold locally when you're doing dev work to performance optimize stuff + if execution_time > 0.25: + logger.debug(f'โณ {additional_text.strip("-")}() took {execution_time:.2f}s') return result return wrapper @@ -343,3 +350,126 @@ def singleton(cls): def check_env_variables(keys: list[str], any_or_all=all) -> bool: """Check if all required environment variables are set""" return any_or_all(os.getenv(key, '').strip() for key in 
keys) + + +def is_unsafe_pattern(pattern: str) -> bool: + """ + Check if a domain pattern has complex wildcards that could match too many domains. + + Args: + pattern: The domain pattern to check + + Returns: + bool: True if the pattern has unsafe wildcards, False otherwise + """ + # Extract domain part if there's a scheme + if '://' in pattern: + _, pattern = pattern.split('://', 1) + + # Remove safe patterns (*.domain and domain.*) + bare_domain = pattern.replace('.*', '').replace('*.', '') + + # If there are still wildcards, it's potentially unsafe + return '*' in bare_domain + + +def match_url_with_domain_pattern(url: str, domain_pattern: str, log_warnings: bool = False) -> bool: + """ + Check if a URL matches a domain pattern. SECURITY CRITICAL. + + Supports optional glob patterns and schemes: + - *.example.com will match sub.example.com and example.com + - *google.com will match google.com, agoogle.com, and www.google.com + - http*://example.com will match http://example.com, https://example.com + - chrome-extension://* will match chrome-extension://aaaaaaaaaaaa and chrome-extension://bbbbbbbbbbbbb + + When no scheme is specified, https is used by default for security. + For example, 'example.com' will match 'https://example.com' but not 'http://example.com'. + + Note: about:blank must be handled at the callsite, not inside this function. 
+ + Args: + url: The URL to check + domain_pattern: Domain pattern to match against + log_warnings: Whether to log warnings about unsafe patterns + + Returns: + bool: True if the URL matches the pattern, False otherwise + """ + try: + # Note: about:blank should be handled at the callsite, not here + if url == 'about:blank': + return False + + parsed_url = urlparse(url) + + # Extract only the hostname and scheme components + scheme = parsed_url.scheme.lower() if parsed_url.scheme else '' + domain = parsed_url.hostname.lower() if parsed_url.hostname else '' + + if not scheme or not domain: + return False + + # Normalize the domain pattern + domain_pattern = domain_pattern.lower() + + # Handle pattern with scheme + if '://' in domain_pattern: + pattern_scheme, pattern_domain = domain_pattern.split('://', 1) + else: + pattern_scheme = 'https' # Default to matching only https for security + pattern_domain = domain_pattern + + # Handle port in pattern (we strip ports from patterns since we already + # extracted only the hostname from the URL) + if ':' in pattern_domain and not pattern_domain.startswith(':'): + pattern_domain = pattern_domain.split(':', 1)[0] + + # If scheme doesn't match, return False + if not fnmatch(scheme, pattern_scheme): + return False + + # Check for exact match + if pattern_domain == '*' or domain == pattern_domain: + return True + + # Handle glob patterns + if '*' in pattern_domain: + # Check for unsafe glob patterns + # First, check for patterns like *.*.domain which are unsafe + if pattern_domain.count('*.') > 1 or pattern_domain.count('.*') > 1: + if log_warnings: + logger = logging.getLogger(__name__) + logger.error(f'โ›”๏ธ Multiple wildcards in pattern=[{domain_pattern}] are not supported') + return False # Don't match unsafe patterns + + # Check for wildcards in TLD part (example.*) + if pattern_domain.endswith('.*'): + if log_warnings: + logger = logging.getLogger(__name__) + logger.error(f'โ›”๏ธ Wildcard TLDs like in 
pattern=[{domain_pattern}] are not supported for security') + return False # Don't match unsafe patterns + + # Then check for embedded wildcards + bare_domain = pattern_domain.replace('*.', '') + if '*' in bare_domain: + if log_warnings: + logger = logging.getLogger(__name__) + logger.error(f'โ›”๏ธ Only *.domain style patterns are supported, ignoring pattern=[{domain_pattern}]') + return False # Don't match unsafe patterns + + # Special handling so that *.google.com also matches bare google.com + if pattern_domain.startswith('*.'): + parent_domain = pattern_domain[2:] + if domain == parent_domain or fnmatch(domain, parent_domain): + return True + + # Normal case: match domain against pattern + if fnmatch(domain, pattern_domain): + return True + + return False + except Exception as e: + logger = logging.getLogger(__name__) + logger.error(f'โ›”๏ธ Error matching URL {url} with pattern {domain_pattern}: {type(e).__name__}: {e}') + return False diff --git a/debug_pydantic.py b/debug_pydantic.py new file mode 100644 index 000000000..5be286300 --- /dev/null +++ b/debug_pydantic.py @@ -0,0 +1,34 @@ +import inspect + +from pydantic import BaseModel + +from browser_use.controller.views import ClickElementAction + + +# Check the pydantic detection logic +def click_element_by_index(params: ClickElementAction, browser_session): + pass + + +sig = inspect.signature(click_element_by_index) +parameters = list(sig.parameters.values()) +parameter_names = [param.name for param in parameters] + +print('Parameters:', parameter_names) +print('First param name:', parameters[0].name) +print('First param annotation:', parameters[0].annotation) +print('Is BaseModel:', issubclass(parameters[0].annotation, BaseModel)) + +# Check the name detection logic +name_check = parameters[0].name in ['params', 'param', 'model'] or parameters[0].name.endswith('_model') +print('Name check passed:', name_check) + +is_pydantic = ( + parameters + and len(parameters) > 0 + and hasattr(parameters[0], 
'annotation') + and parameters[0].annotation != parameters[0].empty + and issubclass(parameters[0].annotation, BaseModel) + and name_check +) +print('Is pydantic:', is_pydantic) diff --git a/docs/cloud/webhooks.mdx b/docs/cloud/webhooks.mdx new file mode 100644 index 000000000..1c77bbded --- /dev/null +++ b/docs/cloud/webhooks.mdx @@ -0,0 +1,111 @@ +--- +title: "Webhooks" +description: "Learn how to integrate webhooks with Browser Use Cloud API" +icon: "code" +--- + +Webhooks allow you to receive real-time notifications about events in your Browser Use tasks. This guide will show you how to set up and verify webhook endpoints. + +## Prerequisites + + + You need an active subscription to create webhooks. See your billing page + [cloud.browser-use.com/billing](https://cloud.browser-use.com/billing) + + +## Setting Up Webhooks + +To receive webhook notifications, you need to: + +1. Create an endpoint that can receive HTTPS POST requests +2. Configure your webhook URL in the Browser Use dashboard +3. Implement signature verification to ensure webhook authenticity + + + When adding a webhook URL in the dashboard, it must be a valid HTTPS URL that can receive POST requests. + On creation, we will send a test payload `{"test": "ok"}` to verify the endpoint is working correctly before creating the actual webhook! 
+ + +## Webhook Events + +Browser Use currently only sends status updates for your running tasks: + +| Status | Description | +| -------------- | -------------------------------------- | +| `initializing` | A task is initializing | +| `started` | A Task has started (browser available) | +| `paused` | A task has been paused mid execution | +| `stopped` | A task has been stopped mid execution | +| `finished` | A task has finished | + +## Webhook Payload + +Each webhook call includes: + +- A JSON payload with event details +- `X-Browser-Use-Timestamp` header with the current timestamp +- `X-Browser-Use-Signature` header for verification + +Example payload: + +```json +{ + "session_id": "602c8809-61ee-461d-acfd-3e8783f23326", + "task_id": "b9792a06-0411-4838-96de-c720f34206a2", + "status": "initializing" +} +``` + +## Implementing Webhook Verification + +To ensure webhook authenticity, you must verify the signature. Here's an example implementation in Python using FastAPI: + +```python +import uvicorn +import hmac +import hashlib +import json +import os + +from fastapi import FastAPI, Request, HTTPException + +app = FastAPI() + +SECRET_KEY = os.environ['SECRET_KEY'] + +def verify_signature(payload: dict, timestamp: str, received_signature: str) -> bool: + message = f'{timestamp}.{json.dumps(payload, separators=(",", ":"), sort_keys=True)}' + expected_signature = hmac.new(SECRET_KEY.encode(), message.encode(), hashlib.sha256).hexdigest() + return hmac.compare_digest(expected_signature, received_signature) + +@app.post('/webhook') +async def webhook(request: Request): + body = await request.json() + + timestamp = request.headers.get('X-Browser-Use-Timestamp') + signature = request.headers.get('X-Browser-Use-Signature') + if not timestamp or not signature: + raise HTTPException(status_code=400, detail='Missing timestamp or signature') + + if not verify_signature(body, timestamp, signature): + raise HTTPException(status_code=403, detail='Invalid signature') + + 
print('Valid webhook call received:', body) + return {'status': 'success', 'message': 'Webhook received'} + +if __name__ == '__main__': + uvicorn.run(app, host='0.0.0.0', port=8080) +``` + +## Best Practices + +1. **Always verify signatures**: Never process webhook payloads without verifying the signature +2. **Handle retries**: Browser Use will retry failed webhook deliveries up to 5 times +3. **Respond quickly**: Return a 200 response as soon as you've verified the signature +4. **Process asynchronously**: Handle the webhook payload processing in a background task +5. **Monitor failures**: Set up monitoring for webhook delivery failures + + + Need help? Contact our support team at support@browser-use.com or join our + [Discord community](https://link.browser-use.com/discord) + diff --git a/docs/customize/sensitive-data.mdx b/docs/customize/sensitive-data.mdx index 371b0e90e..d642fde6f 100644 --- a/docs/customize/sensitive-data.mdx +++ b/docs/customize/sensitive-data.mdx @@ -10,13 +10,15 @@ When working with sensitive information like passwords, you can use the `sensiti Make sure to always set [`allowed_domains`](https://docs.browser-use.com/customize/browser-settings#restrict-urls) to restrict the domains the Agent is allowed to visit when working with sensitive data or logins. 
-Here's an example of how to use sensitive data: +### Basic Usage + +Here's a basic example of how to use sensitive data: ```python from dotenv import load_dotenv from langchain_openai import ChatOpenAI -from browser_use import Agent, Browser, BrowserConfig -from browser_use.browser.context import BrowserContextConfig +from browser_use import Agent +from browser_use.browser.session import BrowserSession load_dotenv() @@ -33,9 +35,9 @@ sensitive_data = {'x_name': 'magnus', 'x_password': '12345678'} # Use the placeholder names in your task description task = 'go to x.com and login with x_name and x_password then write a post about the meaning of life' -# Configure allowed_domains that the agent should be restricted to in BrowserContextConfig -context_config = BrowserContextConfig( - allowed_domains=['example.com'], +# Configure browser session with allowed domains +browser_session = BrowserSession( + allowed_domains=['example.com'] ) # Pass the sensitive data to the agent @@ -43,11 +45,7 @@ agent = Agent( task=task, llm=llm, sensitive_data=sensitive_data, - browser=Browser( - config=BrowserConfig( - new_context_config=context_config - ) - ) + browser_session=browser_session ) async def main(): @@ -63,6 +61,79 @@ In this example: 3. When your password is visible on the current page, we replace it in the LLM input - so that the model never has it in its state. 4. The agent will be prevented from going to any site not on `example.com` to protect from prompt injection attacks and jailbreaks +### Domain-Specific Sensitive Data + +For enhanced security, you can associate sensitive data with specific domains. 
This ensures credentials are only used on the domains they're intended for: + +```python +from dotenv import load_dotenv +from langchain_openai import ChatOpenAI +from browser_use import Agent +from browser_use.browser.session import BrowserSession + +load_dotenv() + +# Initialize the model +llm = ChatOpenAI( + model='gpt-4o', + temperature=0.0, +) + +# Domain-specific sensitive data +sensitive_data = { + 'https://*.google.com': {'x_email': '...', 'x_pass': '...'}, + 'chrome-extension://abcd': {'x_api_key': '...'}, + 'http*://example.com': {'x_authcode': '123123'} +} + +# Set browser session with allowed domains that match all domain patterns in sensitive_data +browser_session = BrowserSession( + allowed_domains=[ + 'https://*.google.com', + 'chrome-extension://abcd', + 'http://example.com', # Explicitly include http:// if needed + 'https://example.com' # By default, only https:// is matched + ] +) + +# Pass the sensitive data to the agent +agent = Agent( + task="Log into Google, then check my account information", + llm=llm, + sensitive_data=sensitive_data, + browser_session=browser_session +) + +async def main(): + await agent.run() + +if __name__ == '__main__': + asyncio.run(main()) +``` + +With this approach: +1. The Google credentials (`x_email` and `x_pass`) will only be used on Google domains (any subdomain) +2. The API key (`x_api_key`) will only be used in the specific Chrome extension +3. 
The auth code (`x_authcode`) will only be used on example.com via http or https + +### Domain Pattern Format + +Domain patterns in sensitive_data follow the same format as `allowed_domains`: + +- `example.com` - Matches only example.com +- `*.example.com` - Matches any subdomain of example.com +- `http*://example.com` - Matches both http and https protocols for example.com +- `chrome-extension://*` - Matches any Chrome extension + +> **Security Warning**: For security reasons, certain patterns are explicitly rejected: +> - Wildcards in TLD part (e.g., `example.*`) are not allowed as they could match any TLD +> - Embedded wildcards (e.g., `g*e.com`) are rejected to prevent overly broad matches +> - Multiple wildcards like `*.*.domain` are not supported to avoid security issues + +The default protocol when no scheme is specified is now `https` for enhanced security. + +The system will validate that all domain patterns used in `sensitive_data` are covered by the patterns in `allowed_domains`. + ### Missing or Empty Values When working with sensitive data, keep these details in mind: diff --git a/docs/development/contribution-guide.mdx b/docs/development/contribution-guide.mdx index adc1a3510..ccec248eb 100644 --- a/docs/development/contribution-guide.mdx +++ b/docs/development/contribution-guide.mdx @@ -4,9 +4,68 @@ description: "Learn how to contribute to Browser Use" icon: "github" --- +# Join the Browser Use Community! -- check out our most active issues or ask in [Discord](https://discord.gg/zXJJHtJf3k) for ideas of what to work on -- get inspiration / share what you build in the [`#showcase-your-work`](https://discord.com/channels/1303749220842340412/1305549200678850642) channel and on [`awesome-browser-use-prompts`](https://github.com/browser-use/awesome-prompts)! 
-- no typo/style-only nit PRs, you can submit nit fixes but only if part of larger bugfix or new feature PRs -- include a demo screenshot/gif, tests, and ideally an example script demonstrating any changes in your PR -- bump your issues/PRs with comments periodically if you want them to be merged faster +We're thrilled you're interested in contributing to Browser Use! This guide will help you get started with contributing to our project. Your contributions are what make the open-source community such an amazing place to learn, inspire, and create. + +## Quick Setup + +Get started with Browser Use development in minutes: + +```bash +git clone https://github.com/browser-use/browser-use +cd browser-use +uv sync --all-extras --dev +# or pip install -U git+https://github.com/browser-use/browser-use.git@main + +echo "BROWSER_USE_LOGGING_LEVEL=debug" >> .env +``` + +For more detailed setup instructions, see our [Local Setup Guide](/development/local-setup). + +## How to Contribute + +### Find Something to Work On + +- Browse our [GitHub Issues](https://github.com/browser-use/browser-use/issues) for beginner-friendly issues labeled `good-first-issue` +- Check out our most active issues or ask in [Discord](https://discord.gg/zXJJHtJf3k) for ideas of what to work on +- Get inspiration and share what you build in the [`#showcase-your-work`](https://discord.com/channels/1303749220842340412/1305549200678850642) channel +- Explore or contribute to [`awesome-browser-use-prompts`](https://github.com/browser-use/awesome-prompts)! + +### Making a Great Pull Request + +When submitting a pull request, please: + +- Include a clear description of what the PR does and why it's needed +- Add tests that cover your changes +- Include a demo screenshot/gif or an example script demonstrating your changes +- Make sure the PR passes all CI checks and tests +- Keep your PR focused on a single issue or feature to make it easier to review + +Note: We appreciate quality over quantity. 
Instead of submitting small typo/style-only PRs, consider including those fixes as part of larger bugfix or feature PRs. + +### Contribution Process + +1. Fork the repository +2. Create a new branch for your feature or bugfix +3. Make your changes +4. Run tests to ensure everything works +5. Submit a pull request +6. Respond to any feedback from maintainers +7. Celebrate your contribution! + +Feel free to bump your issues/PRs with comments periodically if you need faster feedback. + +## Code of Conduct + +We're committed to providing a welcoming and inclusive environment for all contributors. Please be respectful and constructive in all interactions. + +## Getting Help + +If you need help at any point: + +- Join our [Discord community](https://link.browser-use.com/discord) +- Ask questions in the appropriate GitHub issue +- Check our [documentation](/introduction) + +We're here to help you succeed in contributing to Browser Use! diff --git a/docs/development/local-setup.mdx b/docs/development/local-setup.mdx index b65cc7ad7..559060ee7 100644 --- a/docs/development/local-setup.mdx +++ b/docs/development/local-setup.mdx @@ -4,11 +4,45 @@ description: "Set up Browser Use development environment locally" icon: "laptop-code" --- +# Welcome to Browser Use Development! + +We're excited to have you join our community of contributors. This guide will help you set up your local development environment quickly and easily. 
+ +## Quick Setup + +If you're familiar with Python development, here's the quick way to get started: + +```bash +git clone https://github.com/browser-use/browser-use +cd browser-use +uv sync --all-extras --dev +# or pip install -U git+https://github.com/browser-use/browser-use.git@main + +echo "BROWSER_USE_LOGGING_LEVEL=debug" >> .env +``` + +## Helper Scripts + +We provide several convenient shell scripts in the `bin/` directory to help with common development tasks: + +```bash +# Complete setup script - installs uv, creates a venv, and installs dependencies +./bin/setup.sh + +# Run all pre-commit hooks (formatting, linting, type checking) +./bin/lint.sh + +# Run the core test suite that's executed in CI +./bin/test.sh +``` + ## Prerequisites Browser Use requires Python 3.11 or higher. We recommend using [uv](https://docs.astral.sh/uv/) for Python environment management. -## Clone the Repository +## Detailed Setup Instructions + +### Clone the Repository First, clone the Browser Use repository: @@ -17,7 +51,7 @@ git clone https://github.com/browser-use/browser-use cd browser-use ``` -## Environment Setup +### Environment Setup 1. 
Create and activate a virtual environment: @@ -56,6 +90,7 @@ GOOGLE_API_KEY= DEEPSEEK_API_KEY= GROK_API_KEY= NOVITA_API_KEY= +BROWSER_USE_LOGGING_LEVEL=debug # Helpful for development ``` @@ -78,6 +113,8 @@ After setup, you can: ```bash # Run the linter on the whole project (must pass for PR to be allowed to merge) uv run pre-commit run --all-files +# or use our convenience script +./bin/lint.sh # Install the linter & formatter pre-commit hooks to run automatically pre-commit install --install-hooks @@ -89,7 +126,10 @@ uv run type ### Tests ```bash -# Run tests +# Run all tests that run in CI +./bin/test.sh + +# Run specific tests uv run pytest # run everything uv run pytest tests/test_controller.py # run a specific test file uv run pytest tests/test_sensitive_data.py tests/test_tab_management.py # run two test files @@ -102,7 +142,7 @@ uv run pytest tests/test_tab_management.py::TestTabManagement::test_user_changes uv build uv pip install dist/*.whl -# bush build to PyPI (automatically run by Github Actions CI) +# push build to PyPI (automatically run by Github Actions CI) uv publish ``` diff --git a/docs/mint.json b/docs/mint.json index 4f159b697..21427c903 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -77,7 +77,7 @@ }, { "group": "Cloud API", - "pages": ["cloud/quickstart", "cloud/implementation"] + "pages": ["cloud/quickstart", "cloud/implementation", "cloud/webhooks"] } ], "footerSocials": { diff --git a/examples/custom-functions/action_filters.py b/examples/custom-functions/action_filters.py index 625e9d7de..216336594 100644 --- a/examples/custom-functions/action_filters.py +++ b/examples/custom-functions/action_filters.py @@ -71,7 +71,7 @@ async def main(): """Main function to run the example""" browser_session = BrowserSession() await browser_session.start() - llm = ChatOpenAI(model_name='gpt-4o') + llm = ChatOpenAI(model='gpt-4o') # Create the agent agent = Agent( # disco mode will not be triggered on apple.com because the LLM won't be able to see 
that action available, it should work on Google.com though. diff --git a/examples/custom-functions/advanced_search.py b/examples/custom-functions/advanced_search.py index df4dd3c2a..9011d3463 100644 --- a/examples/custom-functions/advanced_search.py +++ b/examples/custom-functions/advanced_search.py @@ -57,7 +57,7 @@ async def search_web(query: str): # to string serp_data_str = json.dumps(serp_data) - return ActionResult(extracted_content=serp_data_str, include_in_memory=True) + return ActionResult(extracted_content=serp_data_str, include_in_memory=False) names = [ @@ -85,7 +85,7 @@ names = [ async def main(): - task = 'use search_web with "find email address of the following ETH professor:" for each of the following persons in a list of actions. Finally return the list with name and email if provided' + task = 'use search_web with "find email address of the following ETH professor:" for each of the following persons in a list of actions. Finally return the list with name and email if provided - do always 5 at once' task += '\n' + '\n'.join(names) model = ChatOpenAI(model='gpt-4o') browser_profile = BrowserProfile() diff --git a/examples/features/sensitive_data.py b/examples/features/sensitive_data.py index fa2d87a9c..3924a612b 100644 --- a/examples/features/sensitive_data.py +++ b/examples/features/sensitive_data.py @@ -17,11 +17,34 @@ llm = ChatOpenAI( model='gpt-4o', temperature=0.0, ) -# the model will see x_name and x_password, but never the actual values. -sensitive_data = {'x_name': 'my_x_name', 'x_password': 'my_x_password'} -task = 'go to x.com and login with x_name and x_password then find interesting posts and like them' +# Simple case: the model will see x_name and x_password, but never the actual values. 
+# sensitive_data = {'x_name': 'my_x_name', 'x_password': 'my_x_password'} -agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data) +# Advanced case: domain-specific credentials with reusable data +# Define a single credential set that can be reused +company_credentials = {'company_username': 'user@example.com', 'company_password': 'securePassword123'} + +# Map the same credentials to multiple domains for secure access control +sensitive_data = { + 'https://example.com': company_credentials, + 'https://admin.example.com': company_credentials, + 'https://*.example-staging.com': company_credentials, + 'http*://test.example.com': company_credentials, + # You can also add domain-specific credentials + 'https://*.google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'}, +} +# Update task to use one of the credentials above +task = 'Go to example.com and login with company_username and company_password' + +# Always set allowed_domains when using sensitive_data for security +from browser_use.browser.session import BrowserSession + +browser_session = BrowserSession( + allowed_domains=list(sensitive_data.keys()) + + ['https://*.trusted-partner.com'] # Domain patterns from sensitive_data + additional allowed domains +) + +agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data, browser_session=browser_session) async def main(): diff --git a/examples/integrations/browserbase_stagehand.py b/examples/integrations/browserbase_stagehand.py new file mode 100644 index 000000000..e05a12636 --- /dev/null +++ b/examples/integrations/browserbase_stagehand.py @@ -0,0 +1,58 @@ +import asyncio +import os + +from dotenv import load_dotenv + +load_dotenv() + +from stagehand import Stagehand, StagehandConfig + +from browser_use.agent.service import Agent + + +async def main(): + # Configure Stagehand + # https://pypi.org/project/stagehand-py/ + # https://github.com/browserbase/stagehand-python-examples/blob/main/agent_example.py + config = StagehandConfig( + 
env='BROWSERBASE', + api_key=os.getenv('BROWSERBASE_API_KEY'), + project_id=os.getenv('BROWSERBASE_PROJECT_ID'), + headless=False, + dom_settle_timeout_ms=3000, + model_name='gpt-4o', + self_heal=True, + wait_for_captcha_solves=True, + system_prompt='You are a browser automation assistant that helps users navigate websites effectively.', + model_client_options={'model_api_key': os.getenv('OPENAI_API_KEY')}, + verbose=2, + ) + + # Create a Stagehand client using the configuration object. + stagehand = Stagehand( + config=config, + model_api_key=os.getenv('OPENAI_API_KEY'), + # server_url=os.getenv('STAGEHAND_SERVER_URL'), + ) + + # Initialize - this creates a new session automatically. + await stagehand.init() + print(f'\nCreated new session: {stagehand.session_id}') + print(f'๐ŸŒ View your live browser: https://www.browserbase.com/sessions/{stagehand.session_id}') + + await stagehand.page.goto('https://google.com/') + + await stagehand.page.act('search for openai') + + # Combine with Browser Use + agent = Agent(task='click the first result', page=stagehand.page) + await agent.run() + + # go back and forth + await stagehand.page.act('open the 3 first links on the page in new tabs') + + await Agent(task='click the first result', page=stagehand.page).run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/models/azure_openai.py b/examples/models/azure_openai.py index fafc55e10..1e8da90b9 100644 --- a/examples/models/azure_openai.py +++ b/examples/models/azure_openai.py @@ -27,10 +27,9 @@ if not azure_openai_api_key or not azure_openai_endpoint: # Initialize the Azure OpenAI client llm = AzureChatOpenAI( - model_name='gpt-4o', - openai_api_key=azure_openai_api_key, + model='gpt-4o', + api_key=azure_openai_api_key, azure_endpoint=azure_openai_endpoint, # Corrected to use azure_endpoint instead of openai_api_base - deployment_name='gpt-4o', # Use deployment_name for Azure models api_version='2024-08-01-preview', # Explicitly set the API version 
here ) diff --git a/examples/simple.py b/examples/simple.py index de29003ba..c42bd1dda 100644 --- a/examples/simple.py +++ b/examples/simple.py @@ -17,8 +17,7 @@ llm = ChatOpenAI( model='gpt-4o', temperature=0.0, ) -task = 'Go to kayak.com and find the cheapest flight from Zurich to San Francisco on 2025-05-01' - +task = 'Go to kayak.com and find the cheapest one-way flight from Zurich to San Francisco in 3 weeks.' agent = Agent(task=task, llm=llm) diff --git a/examples/use-cases/google_sheets.py b/examples/use-cases/google_sheets.py index f29663a4d..fda5d1673 100644 --- a/examples/use-cases/google_sheets.py +++ b/examples/use-cases/google_sheets.py @@ -4,11 +4,10 @@ import sys sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import asyncio -import pyperclip from dotenv import load_dotenv from langchain_openai import ChatOpenAI -from browser_use import ActionResult, Agent, Controller +from browser_use import Agent, Controller from browser_use.browser import BrowserProfile, BrowserSession # Load environment variables @@ -16,106 +15,17 @@ load_dotenv() if not os.getenv('OPENAI_API_KEY'): raise ValueError('OPENAI_API_KEY is not set. 
Please add it to your environment variables.') - +# Use the default controller with built-in Google Sheets actions +# The controller already includes all the necessary Google Sheets actions: +# - select_cell_or_range: Select specific cells or ranges (Ctrl+G navigation) +# - get_range_contents: Get contents of cells using clipboard +# - get_sheet_contents: Get entire sheet contents +# - clear_selected_range: Clear selected cells +# - input_selected_cell_text: Input text into selected cells +# - update_range_contents: Batch update ranges with TSV data controller = Controller() - -def is_google_sheet(page) -> bool: - return page.url.startswith('https://docs.google.com/spreadsheets/') - - -@controller.registry.action('Google Sheets: Open a specific Google Sheet') -async def open_google_sheet(browser_session: BrowserSession, google_sheet_url: str): - page = await browser_session.get_current_page() - if page.url != google_sheet_url: - await page.goto(google_sheet_url) - await page.wait_for_load_state() - if not is_google_sheet(page): - return ActionResult(error='Failed to open Google Sheet, are you sure you have permissions to access this sheet?') - return ActionResult(extracted_content=f'Opened Google Sheet {google_sheet_url}', include_in_memory=False) - - -@controller.registry.action('Google Sheets: Get the contents of the entire sheet', page_filter=is_google_sheet) -async def get_sheet_contents(browser_session: BrowserSession): - page = await browser_session.get_current_page() - - # select all cells - await page.keyboard.press('Enter') - await page.keyboard.press('Escape') - await page.keyboard.press('ControlOrMeta+A') - await page.keyboard.press('ControlOrMeta+C') - - extracted_tsv = pyperclip.paste() - return ActionResult(extracted_content=extracted_tsv, include_in_memory=True) - - -@controller.registry.action('Google Sheets: Select a specific cell or range of cells', page_filter=is_google_sheet) -async def select_cell_or_range(browser_session: BrowserSession, 
cell_or_range: str): - page = await browser_session.get_current_page() - - await page.keyboard.press('Enter') # make sure we dont delete current cell contents if we were last editing - await page.keyboard.press('Escape') # to clear current focus (otherwise select range popup is additive) - await asyncio.sleep(0.1) - await page.keyboard.press('Home') # move cursor to the top left of the sheet first - await page.keyboard.press('ArrowUp') - await asyncio.sleep(0.1) - await page.keyboard.press('Control+G') # open the goto range popup - await asyncio.sleep(0.2) - await page.keyboard.type(cell_or_range, delay=0.05) - await asyncio.sleep(0.2) - await page.keyboard.press('Enter') - await asyncio.sleep(0.2) - await page.keyboard.press('Escape') # to make sure the popup still closes in the case where the jump failed - return ActionResult(extracted_content=f'Selected cell {cell_or_range}', include_in_memory=False) - - -@controller.registry.action('Google Sheets: Get the contents of a specific cell or range of cells', page_filter=is_google_sheet) -async def get_range_contents(browser_session: BrowserSession, cell_or_range: str): - page = await browser_session.get_current_page() - - await select_cell_or_range(browser_session, cell_or_range) - - await page.keyboard.press('ControlOrMeta+C') - await asyncio.sleep(0.1) - extracted_tsv = pyperclip.paste() - return ActionResult(extracted_content=extracted_tsv, include_in_memory=True) - - -@controller.registry.action('Google Sheets: Clear the currently selected cells', page_filter=is_google_sheet) -async def clear_selected_range(browser_session: BrowserSession): - page = await browser_session.get_current_page() - - await page.keyboard.press('Backspace') - return ActionResult(extracted_content='Cleared selected range', include_in_memory=False) - - -@controller.registry.action('Google Sheets: Input text into the currently selected cell', page_filter=is_google_sheet) -async def input_selected_cell_text(browser_session: BrowserSession, 
text: str): - page = await browser_session.get_current_page() - - await page.keyboard.type(text, delay=0.1) - await page.keyboard.press('Enter') # make sure to commit the input so it doesn't get overwritten by the next action - await page.keyboard.press('ArrowUp') - return ActionResult(extracted_content=f'Inputted text {text}', include_in_memory=False) - - -@controller.registry.action('Google Sheets: Batch update a range of cells', page_filter=is_google_sheet) -async def update_range_contents(browser_session: BrowserSession, range: str, new_contents_tsv: str): - page = await browser_session.get_current_page() - - await select_cell_or_range(browser_session, range) - - # simulate paste event from clipboard with TSV content - await page.evaluate(f""" - const clipboardData = new DataTransfer(); - clipboardData.setData('text/plain', `{new_contents_tsv}`); - document.activeElement.dispatchEvent(new ClipboardEvent('paste', {{clipboardData}})); - """) - - return ActionResult(extracted_content=f'Updated cell {range} with {new_contents_tsv}', include_in_memory=False) - - -# many more snippets for keyboard-shortcut based Google Sheets automation can be found here, see: +# For more Google Sheets keyboard shortcuts and automation ideas, see: # - https://github.com/philc/sheetkeys/blob/master/content_scripts/sheet_actions.js # - https://github.com/philc/sheetkeys/blob/master/content_scripts/commands.js # - https://support.google.com/docs/answer/181110?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cmac-shortcuts @@ -129,7 +39,8 @@ async def main(): browser_profile=BrowserProfile( executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', user_data_dir='~/.config/browseruse/profiles/default', - ) + ), + keep_alive=True, ) async with browser_session: @@ -137,7 +48,7 @@ async def main(): eraser = Agent( task=""" - Clear all the existing values in columns A through F in this Google Sheet: + Clear all the existing values in columns A through M in this Google Sheet: 
https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit """, llm=model, @@ -148,15 +59,16 @@ async def main(): researcher = Agent( task=""" - Google to find the full name, nationality, and date of birth of the CEO of the top 10 Fortune 100 companies. - For each company, append a row to this existing Google Sheet: https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit + Open this Google Sheet and read it to understand the structure: https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit Make sure column headers are present and all existing values in the sheet are formatted correctly. Columns: A: Company Name B: CEO Full Name C: CEO Country of Birth - D: CEO Date of Birth (YYYY-MM-DD) - E: Source URL where the information was found + D: Source URL where the information was found + Then Google to find the full name and nationality of the CEO of the top 10 Fortune 100 companies. + For each company, append a row to this existing Google Sheet. + At the end, double check the formatting and structure and fix any issues by updating/overwriting cells. """, llm=model, browser_session=browser_session, @@ -175,17 +87,17 @@ async def main(): ) await improvised_continuer.run() - final_fact_checker = Agent( - task=""" - Read the Google Sheet https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit - Fact-check every entry, add a new column F with your findings for each row. - Make sure to check the source URL for each row, and make sure the information is correct. - """, - llm=model, - browser_session=browser_session, - controller=controller, - ) - await final_fact_checker.run() + # final_fact_checker = Agent( + # task=""" + # Read the Google Sheet https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit + # Fact-check every entry, add a new column F with your findings for each row. 
+ # Make sure to check the source URL for each row, and make sure the information is correct. + # """, + # llm=model, + # browser_session=browser_session, + # controller=controller, + # ) + # await final_fact_checker.run() if __name__ == '__main__': diff --git a/pyproject.toml b/pyproject.toml index 0b07511fd..65b98c1a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,8 @@ examples = [ # botocore: only needed for Bedrock Claude boto3 examples/models/bedrock_claude.py "botocore>=1.37.23", "imgcat>=0.6.0", + "stagehand-py>=0.3.6", + "browserbase>=0.4.0", ] all = [ "browser-use[memory,cli,examples]", diff --git a/tests/ci/test_action_registry.py b/tests/ci/test_action_registry.py new file mode 100644 index 000000000..537db48c5 --- /dev/null +++ b/tests/ci/test_action_registry.py @@ -0,0 +1,625 @@ +""" +Comprehensive tests for the action registry system to ensure backward compatibility +and proper parameter handling for all existing patterns. + +Tests cover: +1. Existing parameter patterns (individual params, pydantic models) +2. Special parameter injection (browser_session, page_extraction_llm, etc.) +3. Action-to-action calling scenarios +4. Mixed parameter patterns +5. 
Registry execution edge cases
+"""

+import asyncio
+import logging

+import pytest
+from playwright.async_api import Page
+from pydantic import Field
+from pytest_httpserver import HTTPServer

+from browser_use.agent.views import ActionResult
+from browser_use.browser import BrowserSession
+from browser_use.controller.registry.service import Registry
+from browser_use.controller.registry.views import ActionModel as BaseActionModel
+from browser_use.controller.views import (
+	ClickElementAction,
+	InputTextAction,
+	NoParamsAction,
+	SearchGoogleAction,
+)

+# Configure logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)


+class MockLLM:
+	"""Mock LLM for testing"""

+	async def ainvoke(self, prompt):
+		class MockResponse:
+			content = 'Mocked LLM response'

+		return MockResponse()


+class TestContext:
+	"""Simple context for testing"""

+	pass


+# Test parameter models
+class SimpleParams(BaseActionModel):
+	"""Simple parameter model"""

+	value: str = Field(description='Test value')


+class ComplexParams(BaseActionModel):
+	"""Complex parameter model with multiple fields"""

+	text: str = Field(description='Text input')
+	number: int = Field(description='Number input', default=42)
+	optional_flag: bool = Field(description='Optional boolean', default=False)


+# Test fixtures
+@pytest.fixture(scope='module')
+def event_loop():
+	"""Create and provide an event loop for async tests."""
+	loop = asyncio.get_event_loop_policy().new_event_loop()
+	yield loop
+	loop.close()


+@pytest.fixture(scope='module')
+def http_server():
+	"""Create and provide a test HTTP server that serves static content."""
+	server = HTTPServer()
+	server.start()

+	# Add a simple test page
+	server.expect_request('/test').respond_with_data(
+		'<html><head><title>Test Page</title></head><body><h1>Test Page</h1><p>Hello from test page</p></body></html>',
+		content_type='text/html',
+	)

+	yield server
+	server.stop()


+@pytest.fixture
+def base_url(http_server):
+	"""Return the base URL for the test HTTP server."""
+	return f'http://{http_server.host}:{http_server.port}'


+@pytest.fixture(scope='module')
+async def browser_session(event_loop):
+	"""Create and provide a real BrowserSession instance."""
+	browser_session = BrowserSession(
+		headless=True,
+		user_data_dir=None,
+	)
+	await browser_session.start()
+	yield browser_session
+	await browser_session.stop()


+@pytest.fixture
+def mock_llm():
+	"""Create a mock LLM"""
+	return MockLLM()


+@pytest.fixture
+def registry():
+	"""Create a fresh registry for each test"""
+	return Registry[TestContext]()


+@pytest.fixture
+async def test_browser(base_url):
+	"""Create a real BrowserSession for testing"""
+	browser_session = BrowserSession(
+		headless=True,
+		user_data_dir=None,
+	)
+	await browser_session.start()
+	# Navigate to test page
+	await browser_session.create_new_tab(f'{base_url}/test')
+	yield browser_session
+	await browser_session.stop()


+class TestActionRegistryParameterPatterns:
+	"""Test different parameter patterns that should all continue to work"""

+	@pytest.mark.asyncio
+	async def test_individual_parameters_no_browser(self, registry):
+		"""Test action with individual parameters, no special injection"""

+		@registry.action('Simple action with individual params')
+		async def simple_action(text: str, number: int = 10):
+			return ActionResult(extracted_content=f'Text: {text}, Number: {number}')

+		# Test execution
+		result = await registry.execute_action('simple_action', {'text': 'hello', 'number': 42})

+		assert isinstance(result, ActionResult)
+		assert 'Text: hello, Number: 42' in result.extracted_content

+	@pytest.mark.asyncio
+	async def test_individual_parameters_with_browser(self, registry, browser_session, base_url):
+		"""Test action with individual parameters plus browser_session injection"""

@registry.action('Action with individual params and browser') + async def action_with_browser(text: str, browser_session: BrowserSession): + page = await browser_session.get_current_page() + return ActionResult(extracted_content=f'Text: {text}, URL: {page.url}') + + # Navigate to test page first + await browser_session.create_new_tab(f'{base_url}/test') + + # Test execution + result = await registry.execute_action('action_with_browser', {'text': 'hello'}, browser_session=browser_session) + + assert isinstance(result, ActionResult) + assert 'Text: hello, URL:' in result.extracted_content + assert base_url in result.extracted_content + + @pytest.mark.asyncio + async def test_page_parameter_injection(self, registry, browser_session, base_url): + """Test action with direct Page parameter injection""" + + @registry.action('Action with page parameter') + async def action_with_page(text: str, page: Page): + title = await page.title() + return ActionResult(extracted_content=f'Text: {text}, Page Title: {title}') + + # Navigate to test page first + await browser_session.create_new_tab(f'{base_url}/test') + + # Test execution + result = await registry.execute_action('action_with_page', {'text': 'hello'}, browser_session=browser_session) + + assert isinstance(result, ActionResult) + assert 'Text: hello, Page Title: Test Page' in result.extracted_content + + @pytest.mark.asyncio + async def test_pydantic_model_with_page_parameter(self, registry, browser_session, base_url): + """Test pydantic model action with page parameter injection""" + + @registry.action('Pydantic action with page', param_model=ComplexParams) + async def pydantic_action_with_page(params: ComplexParams, page: Page): + title = await page.title() + return ActionResult(extracted_content=f'Text: {params.text}, Number: {params.number}, Page Title: {title}') + + # Navigate to test page first + await browser_session.create_new_tab(f'{base_url}/test') + + # Test execution + result = await registry.execute_action( + 
'pydantic_action_with_page', {'text': 'test', 'number': 100}, browser_session=browser_session + ) + + assert isinstance(result, ActionResult) + assert 'Text: test, Number: 100, Page Title: Test Page' in result.extracted_content + + @pytest.mark.asyncio + async def test_pydantic_model_parameters(self, registry, browser_session, base_url): + """Test action that takes a pydantic model as first parameter""" + + @registry.action('Action with pydantic model', param_model=ComplexParams) + async def pydantic_action(params: ComplexParams, browser_session: BrowserSession): + page = await browser_session.get_current_page() + return ActionResult( + extracted_content=f'Text: {params.text}, Number: {params.number}, Flag: {params.optional_flag}, URL: {page.url}' + ) + + # Navigate to test page first + await browser_session.create_new_tab(f'{base_url}/test') + + # Test execution + result = await registry.execute_action( + 'pydantic_action', {'text': 'test', 'number': 100, 'optional_flag': True}, browser_session=browser_session + ) + + assert isinstance(result, ActionResult) + assert 'Text: test, Number: 100, Flag: True' in result.extracted_content + assert base_url in result.extracted_content + + @pytest.mark.asyncio + async def test_mixed_special_parameters(self, registry, browser_session, base_url, mock_llm): + """Test action with multiple special injected parameters""" + + @registry.action('Action with multiple special params') + async def multi_special_action( + text: str, + browser_session: BrowserSession, + page_extraction_llm: MockLLM, + available_file_paths: list[str] | None = None, + ): + page = await browser_session.get_current_page() + llm_response = await page_extraction_llm.ainvoke('test') + files = available_file_paths or [] + + return ActionResult( + extracted_content=f'Text: {text}, URL: {page.url}, LLM: {llm_response.content}, Files: {len(files)}' + ) + + # Navigate to test page first + await browser_session.create_new_tab(f'{base_url}/test') + + # Test execution 
+ result = await registry.execute_action( + 'multi_special_action', + {'text': 'hello'}, + browser_session=browser_session, + page_extraction_llm=mock_llm, + available_file_paths=['file1.txt', 'file2.txt'], + ) + + assert isinstance(result, ActionResult) + assert 'Text: hello' in result.extracted_content + assert base_url in result.extracted_content + assert 'LLM: Mocked LLM response' in result.extracted_content + assert 'Files: 2' in result.extracted_content + + @pytest.mark.asyncio + async def test_no_params_action(self, registry, test_browser): + """Test action with NoParamsAction model""" + + @registry.action('No params action', param_model=NoParamsAction) + async def no_params_action(params: NoParamsAction, browser_session: BrowserSession): + page = await browser_session.get_current_page() + return ActionResult(extracted_content=f'No params action executed on {page.url}') + + # Test execution with any parameters (should be ignored) + result = await registry.execute_action( + 'no_params_action', {'random': 'data', 'should': 'be', 'ignored': True}, browser_session=test_browser + ) + + assert isinstance(result, ActionResult) + assert 'No params action executed on' in result.extracted_content + assert '/test' in result.extracted_content + + @pytest.mark.asyncio + async def test_legacy_browser_parameter_names(self, registry, test_browser): + """Test that legacy browser parameter names still work""" + + @registry.action('Action with legacy browser param') + async def legacy_browser_action(text: str, browser: BrowserSession): + page = await browser.get_current_page() + return ActionResult(extracted_content=f'Legacy browser: {text}, URL: {page.url}') + + @registry.action('Action with legacy browser_context param') + async def legacy_context_action(text: str, browser_context: BrowserSession): + page = await browser_context.get_current_page() + return ActionResult(extracted_content=f'Legacy context: {text}, URL: {page.url}') + + # Test legacy browser parameter + result1 
= await registry.execute_action('legacy_browser_action', {'text': 'test1'}, browser_session=test_browser) + assert 'Legacy browser: test1, URL:' in result1.extracted_content + assert '/test' in result1.extracted_content + + # Test legacy browser_context parameter + result2 = await registry.execute_action('legacy_context_action', {'text': 'test2'}, browser_session=test_browser) + assert 'Legacy context: test2, URL:' in result2.extracted_content + assert '/test' in result2.extracted_content + + +class TestActionToActionCalling: + """Test scenarios where actions call other actions""" + + @pytest.mark.asyncio + async def test_action_calling_action_with_kwargs(self, registry, test_browser): + """Test action calling another action using kwargs (current problematic pattern)""" + + # Helper function that actions can call + async def helper_function(browser_session: BrowserSession, data: str): + page = await browser_session.get_current_page() + return f'Helper processed: {data} on {page.url}' + + @registry.action('First action') + async def first_action(text: str, browser_session: BrowserSession): + # This should work without parameter conflicts + result = await helper_function(browser_session=browser_session, data=text) + return ActionResult(extracted_content=f'First: {result}') + + @registry.action('Calling action') + async def calling_action(message: str, browser_session: BrowserSession): + # Call the first action through the registry (simulates action-to-action calling) + intermediate_result = await registry.execute_action( + 'first_action', {'text': message}, browser_session=browser_session + ) + return ActionResult(extracted_content=f'Called result: {intermediate_result.extracted_content}') + + # Test the calling chain + result = await registry.execute_action('calling_action', {'message': 'test'}, browser_session=test_browser) + + assert isinstance(result, ActionResult) + assert 'Called result: First: Helper processed: test on' in result.extracted_content + assert 
'/test' in result.extracted_content + + @pytest.mark.asyncio + async def test_google_sheets_style_calling_pattern(self, registry, test_browser): + """Test the specific pattern from Google Sheets actions that causes the error""" + + # Simulate the _select_cell_or_range helper function + async def _select_cell_or_range(browser_session: BrowserSession, cell_or_range: str): + page = await browser_session.get_current_page() + return ActionResult(extracted_content=f'Selected cell {cell_or_range} on {page.url}') + + @registry.action('Select cell or range') + async def select_cell_or_range(browser_session: BrowserSession, cell_or_range: str): + # This is the PROBLEMATIC pattern that currently fails + # Passing browser_session by name causes "multiple values for argument" error + return await _select_cell_or_range(browser_session=browser_session, cell_or_range=cell_or_range) + + @registry.action('Select cell or range (fixed)') + async def select_cell_or_range_fixed(browser_session: BrowserSession, cell_or_range: str): + # This is the WORKING pattern using positional args + return await _select_cell_or_range(browser_session, cell_or_range) + + @registry.action('Update range contents') + async def update_range_contents(browser_session: BrowserSession, range_name: str, new_contents: str): + # This action calls select_cell_or_range, simulating the real Google Sheets pattern + await select_cell_or_range_fixed(browser_session, range_name) # Should use positional args + return ActionResult(extracted_content=f'Updated range {range_name} with {new_contents}') + + # Test the fixed version (should work) + result_fixed = await registry.execute_action( + 'select_cell_or_range_fixed', {'cell_or_range': 'A1:F100'}, browser_session=test_browser + ) + assert 'Selected cell A1:F100 on' in result_fixed.extracted_content + assert '/test' in result_fixed.extracted_content + + # Test the chained calling pattern + result_chain = await registry.execute_action( + 'update_range_contents', 
{'range_name': 'B2:D4', 'new_contents': 'test data'}, browser_session=test_browser + ) + assert 'Updated range B2:D4 with test data' in result_chain.extracted_content + + # Test the problematic version (may fail with current registry, should work with enhanced registry) + try: + result_problematic = await registry.execute_action( + 'select_cell_or_range', {'cell_or_range': 'A1:F100'}, browser_session=test_browser + ) + # If this succeeds, great! The enhanced registry is working + assert 'Selected cell A1:F100 on' in result_problematic.extracted_content + assert '/test' in result_problematic.extracted_content + except TypeError as e: + # This is the expected error with the current registry + assert 'multiple values for argument' in str(e) or 'got multiple values' in str(e) + logger.info(f'Expected error with current registry: {e}') + + @pytest.mark.asyncio + async def test_complex_action_chain(self, registry, test_browser): + """Test a complex chain of actions calling other actions""" + + @registry.action('Base action') + async def base_action(value: str, browser_session: BrowserSession): + page = await browser_session.get_current_page() + return ActionResult(extracted_content=f'Base: {value} on {page.url}') + + @registry.action('Middle action') + async def middle_action(input_val: str, browser_session: BrowserSession): + # Call base action + base_result = await registry.execute_action( + 'base_action', {'value': f'processed-{input_val}'}, browser_session=browser_session + ) + return ActionResult(extracted_content=f'Middle: {base_result.extracted_content}') + + @registry.action('Top action') + async def top_action(original: str, browser_session: BrowserSession): + # Call middle action + middle_result = await registry.execute_action( + 'middle_action', {'input_val': f'enhanced-{original}'}, browser_session=browser_session + ) + return ActionResult(extracted_content=f'Top: {middle_result.extracted_content}') + + # Test the full chain + result = await 
registry.execute_action('top_action', {'original': 'test'}, browser_session=test_browser) + + assert isinstance(result, ActionResult) + assert 'Top: Middle: Base: processed-enhanced-test on' in result.extracted_content + assert '/test' in result.extracted_content + + +class TestRegistryEdgeCases: + """Test edge cases and error conditions""" + + @pytest.mark.asyncio + async def test_missing_required_browser_session(self, registry): + """Test that actions requiring browser_session fail appropriately when not provided""" + + @registry.action('Requires browser') + async def requires_browser(text: str, browser_session: BrowserSession): + page = await browser_session.get_current_page() + return ActionResult(extracted_content=f'Text: {text}, URL: {page.url}') + + # Should raise RuntimeError when browser_session is required but not provided + with pytest.raises(RuntimeError, match='requires browser_session but none provided'): + await registry.execute_action( + 'requires_browser', + {'text': 'test'}, + # No browser_session provided + ) + + @pytest.mark.asyncio + async def test_missing_required_llm(self, registry, test_browser): + """Test that actions requiring page_extraction_llm fail appropriately when not provided""" + + @registry.action('Requires LLM') + async def requires_llm(text: str, browser_session: BrowserSession, page_extraction_llm: MockLLM): + page = await browser_session.get_current_page() + llm_response = await page_extraction_llm.ainvoke('test') + return ActionResult(extracted_content=f'Text: {text}, LLM: {llm_response.content}') + + # Should raise RuntimeError when page_extraction_llm is required but not provided + with pytest.raises(RuntimeError, match='requires page_extraction_llm but none provided'): + await registry.execute_action( + 'requires_llm', + {'text': 'test'}, + browser_session=test_browser, + # No page_extraction_llm provided + ) + + @pytest.mark.asyncio + async def test_invalid_parameters(self, registry, test_browser): + """Test handling of 
invalid parameters""" + + @registry.action('Typed action') + async def typed_action(number: int, browser_session: BrowserSession): + return ActionResult(extracted_content=f'Number: {number}') + + # Should raise RuntimeError when parameter validation fails + with pytest.raises(RuntimeError, match='Invalid parameters'): + await registry.execute_action( + 'typed_action', + {'number': 'not a number'}, # Invalid type + browser_session=test_browser, + ) + + @pytest.mark.asyncio + async def test_nonexistent_action(self, registry, test_browser): + """Test calling a non-existent action""" + + with pytest.raises(ValueError, match='Action nonexistent_action not found'): + await registry.execute_action('nonexistent_action', {'param': 'value'}, browser_session=test_browser) + + @pytest.mark.asyncio + async def test_sync_action_wrapper(self, registry, test_browser): + """Test that sync functions are properly wrapped to be async""" + + @registry.action('Sync action') + def sync_action(text: str, browser_session: BrowserSession): + # This is a sync function that should be wrapped + return ActionResult(extracted_content=f'Sync: {text}') + + # Should work even though the original function is sync + result = await registry.execute_action('sync_action', {'text': 'test'}, browser_session=test_browser) + + assert isinstance(result, ActionResult) + assert 'Sync: test' in result.extracted_content + + @pytest.mark.asyncio + async def test_excluded_actions(self, test_browser): + """Test that excluded actions are not registered""" + + registry_with_exclusions = Registry[TestContext](exclude_actions=['excluded_action']) + + @registry_with_exclusions.action('Excluded action') + async def excluded_action(text: str): + return ActionResult(extracted_content=f'Should not execute: {text}') + + @registry_with_exclusions.action('Included action') + async def included_action(text: str): + return ActionResult(extracted_content=f'Should execute: {text}') + + # Excluded action should not be in registry + 
assert 'excluded_action' not in registry_with_exclusions.registry.actions + assert 'included_action' in registry_with_exclusions.registry.actions + + # Should raise error when trying to execute excluded action + with pytest.raises(ValueError, match='Action excluded_action not found'): + await registry_with_exclusions.execute_action('excluded_action', {'text': 'test'}) + + # Included action should work + result = await registry_with_exclusions.execute_action('included_action', {'text': 'test'}) + assert 'Should execute: test' in result.extracted_content + + +class TestExistingControllerActions: + """Test that existing controller actions continue to work""" + + @pytest.mark.asyncio + async def test_existing_action_models(self, registry, test_browser): + """Test that existing action parameter models work correctly""" + + @registry.action('Test search', param_model=SearchGoogleAction) + async def test_search(params: SearchGoogleAction, browser_session: BrowserSession): + return ActionResult(extracted_content=f'Searched for: {params.query}') + + @registry.action('Test click', param_model=ClickElementAction) + async def test_click(params: ClickElementAction, browser_session: BrowserSession): + return ActionResult(extracted_content=f'Clicked element: {params.index}') + + @registry.action('Test input', param_model=InputTextAction) + async def test_input(params: InputTextAction, browser_session: BrowserSession): + return ActionResult(extracted_content=f'Input text: {params.text} at index: {params.index}') + + # Test SearchGoogleAction + result1 = await registry.execute_action('test_search', {'query': 'python testing'}, browser_session=test_browser) + assert 'Searched for: python testing' in result1.extracted_content + + # Test ClickElementAction + result2 = await registry.execute_action('test_click', {'index': 42}, browser_session=test_browser) + assert 'Clicked element: 42' in result2.extracted_content + + # Test InputTextAction + result3 = await 
registry.execute_action('test_input', {'index': 5, 'text': 'test input'}, browser_session=test_browser) + assert 'Input text: test input at index: 5' in result3.extracted_content + + @pytest.mark.asyncio + async def test_pydantic_vs_individual_params_consistency(self, registry, test_browser): + """Test that pydantic and individual parameter patterns produce consistent results""" + + # Action using individual parameters + @registry.action('Individual params') + async def individual_params_action(text: str, number: int, browser_session: BrowserSession): + return ActionResult(extracted_content=f'Individual: {text}-{number}') + + # Action using pydantic model + class TestParams(BaseActionModel): + text: str + number: int + + @registry.action('Pydantic params', param_model=TestParams) + async def pydantic_params_action(params: TestParams, browser_session: BrowserSession): + return ActionResult(extracted_content=f'Pydantic: {params.text}-{params.number}') + + # Both should produce similar results + test_data = {'text': 'hello', 'number': 42} + + result1 = await registry.execute_action('individual_params_action', test_data, browser_session=test_browser) + + result2 = await registry.execute_action('pydantic_params_action', test_data, browser_session=test_browser) + + # Both should extract the same content (just different prefixes) + assert 'hello-42' in result1.extracted_content + assert 'hello-42' in result2.extracted_content + assert 'Individual:' in result1.extracted_content + assert 'Pydantic:' in result2.extracted_content + + +# Test runner for manual execution +if __name__ == '__main__': + # Run a simple test manually + import asyncio + + async def manual_test(): + """Manual test runner for debugging""" + print('Running manual test...') + + registry = Registry[TestContext]() + browser_session = BrowserSession(headless=True) + await browser_session.start() + await browser_session.create_new_tab('https://example.com') + + @registry.action('Manual test action') + async 
def manual_action(text: str, browser_session: BrowserSession): + page = await browser_session.get_current_page() + return ActionResult(extracted_content=f'Manual: {text} on {page.url}') + + result = await registry.execute_action('manual_action', {'text': 'test'}, browser_session=browser_session) + + print(f'Result: {result.extracted_content}') + await browser_session.stop() + print('Manual test passed!') + + if __name__ == '__main__': + asyncio.run(manual_test()) diff --git a/tests/test_browser.py b/tests/ci/test_browser.py similarity index 100% rename from tests/test_browser.py rename to tests/ci/test_browser.py diff --git a/tests/test_browser_session.py b/tests/ci/test_browser_session.py similarity index 97% rename from tests/test_browser_session.py rename to tests/ci/test_browser_session.py index d1229561f..9df55b82d 100644 --- a/tests/test_browser_session.py +++ b/tests/ci/test_browser_session.py @@ -85,7 +85,8 @@ class TestBrowserContext: assert context1._is_url_allowed('https://anotherdomain.org/path') is True # Scenario 2: allowed_domains is provided. 
- allowed = ['example.com', '*.mysite.org'] + # Note: match_url_with_domain_pattern defaults to https:// scheme when none is specified + allowed = ['https://example.com', 'http://example.com', 'http://*.mysite.org', 'https://*.mysite.org'] config2 = BrowserProfile(allowed_domains=allowed) context2 = BrowserSession(browser_profile=config2) @@ -93,7 +94,7 @@ class TestBrowserContext: assert context2._is_url_allowed('http://example.com') is True # URL with subdomain (should not be allowed) assert context2._is_url_allowed('http://sub.example.com/path') is False - # URL with different domain (should not be allowed) + # URL with subdomain for wildcard pattern (should be allowed) assert context2._is_url_allowed('http://sub.mysite.org') is True # URL that matches second allowed domain assert context2._is_url_allowed('https://mysite.org/page') is True diff --git a/tests/ci/test_browser_session_param.py b/tests/ci/test_browser_session_param.py new file mode 100644 index 000000000..e88ad5d2f --- /dev/null +++ b/tests/ci/test_browser_session_param.py @@ -0,0 +1,257 @@ +""" +Test script to reproduce and debug the browser_session parameter issue with actions +like select_cell_or_range in Google Sheets. + +This test demonstrates a specific parameter passing issue that can occur in registry.execute_action +when a parameter (like browser_session) is: +1. Required by a function registered with the Registry +2. Added to extra_args by the Registry.execute_action method +3. Passed by name when the function calls another function + +The bug would manifest as: +"TypeError: select_cell_or_range() got multiple values for argument 'browser_session'" + +The fix is to pass browser_session positionally, not by name, when calling from one action to another, +to avoid the conflict when the Registry also adds it to extra_args. + +This test validates the issue exists and confirms the fix works. 
+""" + +import asyncio +import logging + +from pydantic import Field + +from browser_use.controller.registry.service import Registry +from browser_use.controller.registry.views import ActionModel + +# Configure logging +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + + +# Use real browser session for testing +import pytest + +from browser_use.browser import BrowserSession + + +@pytest.fixture +async def browser_session(): + """Create and provide a real BrowserSession instance.""" + browser_session = BrowserSession( + headless=True, + user_data_dir=None, + ) + await browser_session.start() + yield browser_session + await browser_session.stop() + + +# Model that doesn't include browser_session (renamed to avoid pytest collecting it) +class CellActionParams(ActionModel): + value: str = Field(description='Test value') + + +# Model that includes browser_session +class ModelWithBrowser(ActionModel): + value: str = Field(description='Test value') + browser_session: BrowserSession = None + + +# Simple context for testing +class TestContext: + pass + + +async def main(browser_session): + """Run the test to diagnose browser_session parameter issue + + This test demonstrates the problem and our fix. The issue happens because: + + 1. In controller/service.py, we have: + ```python + @registry.action('Google Sheets: Select a specific cell or range of cells') + async def select_cell_or_range(browser_session: BrowserSession, cell_or_range: str): + return await _select_cell_or_range(browser_session=browser_session, cell_or_range=cell_or_range) + ``` + + 2. When registry.execute_action calls this function, it adds browser_session to extra_args: + ```python + # In registry/service.py + if 'browser_session' in parameter_names: + extra_args['browser_session'] = browser_session + ``` + + 3. Then later, when calling action.function: + ```python + return await action.function(**params_dict, **extra_args) + ``` + + 4. 
This effectively means browser_session is passed twice: + - Once through extra_args['browser_session'] + - And again through params_dict['browser_session'] (from the original function) + + The fix is to pass browser_session positionally in select_cell_or_range: + ```python + return await _select_cell_or_range(browser_session, cell_or_range) + ``` + + This test confirms that this approach works. + """ + logger.info('Starting browser_session parameter test') + + # Create registry + registry = Registry[TestContext]() + + # Create a custom param model for select_cell_or_range + class CellRangeParams(ActionModel): + cell_or_range: str = Field(description='Cell or range to select') + + # Use the provided real browser session + + # Test with the real issue: select_cell_or_range + logger.info('\n\n=== Test: Simulating select_cell_or_range issue with correct model ===') + + # Define the function without using our registry - this will be a helper function + async def _select_cell_or_range(browser_session, cell_or_range): + """Helper function for select_cell_or_range""" + logger.info(f'_select_cell_or_range internal implementation called with cell_or_range={cell_or_range}') + return f'Selected cell {cell_or_range}' + + # This simulates the actual issue we're seeing in the real code + # The browser_session parameter is in both the function signature and passed as a named arg + @registry.action('Google Sheets: Select a cell or range', param_model=CellRangeParams) + async def select_cell_or_range(browser_session: BrowserSession, cell_or_range: str): + logger.info(f'select_cell_or_range called with browser_session={browser_session}, cell_or_range={cell_or_range}') + + # PROBLEMATIC LINE: browser_session is passed by name, matching the parameter name + # This is what causes the "got multiple values" error in the real code + return await _select_cell_or_range(browser_session=browser_session, cell_or_range=cell_or_range) + + # Fix attempt: Register a version that uses positional 
args instead + @registry.action('Google Sheets: Select a cell or range (fixed)', param_model=CellRangeParams) + async def select_cell_or_range_fixed(browser_session: BrowserSession, cell_or_range: str): + logger.info(f'select_cell_or_range_fixed called with browser_session={browser_session}, cell_or_range={cell_or_range}') + + # FIXED LINE: browser_session is passed positionally, avoiding the parameter name conflict + return await _select_cell_or_range(browser_session, cell_or_range) + + # Another attempt: explicitly call using **kwargs to simulate what the registry does + @registry.action('Google Sheets: Select with kwargs', param_model=CellRangeParams) + async def select_with_kwargs(browser_session: BrowserSession, cell_or_range: str): + logger.info(f'select_with_kwargs called with browser_session={browser_session}, cell_or_range={cell_or_range}') + + # Get params and extra_args, like in Registry.execute_action + params = {'cell_or_range': cell_or_range, 'browser_session': browser_session} + extra_args = {'browser_session': browser_session} + + # Try to call _select_cell_or_range with both params and extra_args + # This will fail with "got multiple values for keyword argument 'browser_session'" + try: + logger.info('Attempting to call with both params and extra_args (should fail):') + await _select_cell_or_range(**params, **extra_args) + except TypeError as e: + logger.info(f'Expected error: {e}') + + # Remove browser_session from params to avoid the conflict + params_fixed = dict(params) + del params_fixed['browser_session'] + + logger.info(f'Fixed params: {params_fixed}') + + # This should work + result = await _select_cell_or_range(**params_fixed, **extra_args) + logger.info(f'Success after fix: {result}') + return result + + # Test the original problematic version + logger.info('\n--- Testing original problematic version ---') + try: + result1 = await registry.execute_action( + 'select_cell_or_range', {'cell_or_range': 'A1:F100'}, 
browser_session=browser_session + ) + logger.info(f'Success! Result: {result1}') + except Exception as e: + logger.error(f'Error: {str(e)}') + + # Test the fixed version (using positional args) + logger.info('\n--- Testing fixed version (positional args) ---') + try: + result2 = await registry.execute_action( + 'select_cell_or_range_fixed', {'cell_or_range': 'A1:F100'}, browser_session=browser_session + ) + logger.info(f'Success! Result: {result2}') + except Exception as e: + logger.error(f'Error: {str(e)}') + + # Test with kwargs version that simulates what Registry.execute_action does + logger.info('\n--- Testing kwargs simulation version ---') + try: + result3 = await registry.execute_action( + 'select_with_kwargs', {'cell_or_range': 'A1:F100'}, browser_session=browser_session + ) + logger.info(f'Success! Result: {result3}') + except Exception as e: + logger.error(f'Error: {str(e)}') + + # Manual test of our theory: browser_session is passed twice + logger.info('\n--- Direct test of our theory ---') + try: + # Create the model instance + params = CellRangeParams(cell_or_range='A1:F100') + + # First check if the extra_args approach works + logger.info('Checking if extra_args approach works:') + extra_args = {'browser_session': browser_session} + + # If we were to modify Registry.execute_action: + # 1. 
Check if the function parameter needs browser_session + parameter_names = ['browser_session', 'cell_or_range'] + browser_keys = ['browser_session', 'browser', 'browser_context'] + + # Create params dict + param_dict = params.model_dump() + logger.info(f'params dict before: {param_dict}') + + # Apply our fix: remove browser_session from params dict + for key in browser_keys: + if key in param_dict and key in extra_args: + logger.info(f'Removing {key} from params dict') + del param_dict[key] + + logger.info(f'params dict after: {param_dict}') + logger.info(f'extra_args: {extra_args}') + + # This would be the fixed code: + # return await action.function(**param_dict, **extra_args) + + # Call directly to test + result3 = await select_cell_or_range(**param_dict, **extra_args) + logger.info(f'Success with our fix! Result: {result3}') + except Exception as e: + logger.error(f'Error with our manual test: {str(e)}') + + +# Add a proper pytest test function +import pytest + + +@pytest.mark.asyncio +async def test_browser_session_parameter_issue(browser_session): + """Test that the browser_session parameter issue is fixed.""" + # Run the main test logic + await main(browser_session) + + +if __name__ == '__main__': + # For direct execution (not through pytest) + async def run_with_real_browser(): + browser_session = BrowserSession(headless=True, user_data_dir=None) + await browser_session.start() + try: + await main(browser_session) + finally: + await browser_session.stop() + + asyncio.run(run_with_real_browser()) diff --git a/tests/test_controller.py b/tests/ci/test_controller.py similarity index 100% rename from tests/test_controller.py rename to tests/ci/test_controller.py diff --git a/tests/ci/test_debug_selector_map.py b/tests/ci/test_debug_selector_map.py new file mode 100644 index 000000000..55dcbc072 --- /dev/null +++ b/tests/ci/test_debug_selector_map.py @@ -0,0 +1,436 @@ +""" +Systematic debugging of the selector map issue. 
+Test each assumption step by step to isolate the problem. +""" + +import os + +import pytest + +from browser_use.browser import BrowserProfile, BrowserSession +from browser_use.controller.service import Controller + + +@pytest.fixture +async def browser_session(): + """Create a real browser session for testing.""" + session = BrowserSession( + browser_profile=BrowserProfile( + executable_path=os.getenv('BROWSER_PATH'), + user_data_dir=None, # Use temporary profile + headless=True, + ) + ) + async with session: + yield session + + +@pytest.fixture +def controller(): + """Create a controller instance.""" + return Controller() + + +@pytest.mark.asyncio +async def test_assumption_1_dom_processing_works(browser_session): + """Test assumption 1: DOM processing works and finds elements.""" + # Go to a simple page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + # Trigger DOM processing + state = await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + + print('DOM processing result:') + print(f' - Elements found: {len(state.selector_map)}') + print(f' - Element indices: {list(state.selector_map.keys())}') + + # Verify DOM processing works + assert len(state.selector_map) > 0, 'DOM processing should find elements' + assert 0 in state.selector_map, 'Element index 0 should exist' + + +@pytest.mark.asyncio +async def test_assumption_2_cached_selector_map_persists(browser_session): + """Test assumption 2: Cached selector map persists after get_state_summary.""" + # Go to a simple page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + # Trigger DOM processing and cache + state = await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + initial_selector_map = dict(state.selector_map) + + # Check if cached selector map is still available + cached_selector_map = await 
browser_session.get_selector_map() + + print('Selector map persistence:') + print(f' - Initial elements: {len(initial_selector_map)}') + print(f' - Cached elements: {len(cached_selector_map)}') + print(f' - Maps are identical: {initial_selector_map.keys() == cached_selector_map.keys()}') + + # Verify the cached map persists + assert len(cached_selector_map) > 0, 'Cached selector map should persist' + assert initial_selector_map.keys() == cached_selector_map.keys(), 'Cached map should match initial map' + + +@pytest.mark.asyncio +async def test_assumption_3_action_gets_same_selector_map(browser_session, controller): + """Test assumption 3: Action gets the same selector map as cached.""" + # Go to a simple page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + # Trigger DOM processing and cache + await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + cached_selector_map = await browser_session.get_selector_map() + + print('Pre-action state:') + print(f' - Cached elements: {len(cached_selector_map)}') + print(f' - Element 0 exists in cache: {0 in cached_selector_map}') + + # Create a test action that checks the selector map it receives + @controller.registry.action('Test: Check selector map') + async def test_check_selector_map(browser_session: BrowserSession): + from browser_use import ActionResult + + action_selector_map = await browser_session.get_selector_map() + return ActionResult( + extracted_content=f'Action sees {len(action_selector_map)} elements, index 0 exists: {0 in action_selector_map}', + include_in_memory=False, + ) + + # Execute the test action + result = await controller.registry.execute_action('test_check_selector_map', {}, browser_session=browser_session) + + print(f'Action result: {result.extracted_content}') + + # Verify the action sees the same selector map + assert 'index 0 exists: True' in result.extracted_content, 'Action should see 
element 0' + + +@pytest.mark.asyncio +async def test_assumption_4_click_action_specific_issue(browser_session, controller): + """Test assumption 4: Specific issue with click_element_by_index action.""" + # Go to a simple page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + # Trigger DOM processing and cache + await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + cached_selector_map = await browser_session.get_selector_map() + + print('Pre-click state:') + print(f' - Cached elements: {len(cached_selector_map)}') + print(f' - Element 0 exists: {0 in cached_selector_map}') + + # Create a test action that replicates click_element_by_index logic + @controller.registry.action('Test: Debug click logic') + async def test_debug_click_logic(browser_session: BrowserSession, index: int): + from browser_use import ActionResult + + # This is the exact logic from click_element_by_index + selector_map = await browser_session.get_selector_map() + + print(f' - Action selector map size: {len(selector_map)}') + print(f' - Action selector map keys: {list(selector_map.keys())[:10]}') # First 10 + print(f' - Index {index} in selector map: {index in selector_map}') + + if index not in selector_map: + return ActionResult( + error=f'Debug: Element with index {index} does not exist in map of size {len(selector_map)}', + include_in_memory=False, + ) + + return ActionResult( + extracted_content=f'Debug: Element {index} found in map of size {len(selector_map)}', include_in_memory=False + ) + + # Test with index 0 + result = await controller.registry.execute_action('test_debug_click_logic', {'index': 0}, browser_session=browser_session) + + print(f'Debug click result: {result.extracted_content or result.error}') + + # This will help us see exactly what the click action sees + if result.error: + pytest.fail(f'Click logic debug failed: {result.error}') + + +@pytest.mark.asyncio +async 
def test_assumption_5_multiple_get_selector_map_calls(browser_session): + """Test assumption 5: Multiple calls to get_selector_map return consistent results.""" + # Go to a simple page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + # Trigger DOM processing and cache + await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + + # Call get_selector_map multiple times + map1 = await browser_session.get_selector_map() + map2 = await browser_session.get_selector_map() + map3 = await browser_session.get_selector_map() + + print('Multiple selector map calls:') + print(f' - Call 1: {len(map1)} elements') + print(f' - Call 2: {len(map2)} elements') + print(f' - Call 3: {len(map3)} elements') + print(f' - All calls identical: {map1.keys() == map2.keys() == map3.keys()}') + + # Verify consistency + assert len(map1) == len(map2) == len(map3), 'Multiple calls should return same size' + assert map1.keys() == map2.keys() == map3.keys(), 'Multiple calls should return same elements' + + +@pytest.mark.asyncio +async def test_assumption_6_page_changes_affect_selector_map(browser_session): + """Test assumption 6: Check if page navigation affects cached selector map.""" + # Go to first page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + # Get initial selector map + await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + initial_map = await browser_session.get_selector_map() + + print('Page change test:') + print(f' - Google.com elements: {len(initial_map)}') + + # Navigate to a different page (without calling get_state_summary) + await page.goto('https://www.example.com') + await page.wait_for_load_state() + + # Check if cached selector map is still from old page + cached_map_after_nav = await browser_session.get_selector_map() + + print(f' - After navigation (cached): 
{len(cached_map_after_nav)}') + print(f' - Cache unchanged after nav: {len(initial_map) == len(cached_map_after_nav)}') + + # Update with new page + await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + new_page_map = await browser_session.get_selector_map() + + print(f' - Example.com elements (fresh): {len(new_page_map)}') + + # This will tell us if cached maps get stale + assert len(new_page_map) != len(initial_map) or initial_map.keys() != new_page_map.keys(), ( + 'Different pages should have different selector maps' + ) + + +@pytest.mark.asyncio +async def test_assumption_8_same_browser_session_instance(browser_session, controller): + """Test assumption 8: Action gets the same browser_session instance.""" + # Go to a simple page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + print('=== BROWSER SESSION INSTANCE DEBUG ===') + + # Get fresh state + await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + + # Store the ID of our browser session instance + original_session_id = id(browser_session) + print(f'1. Original browser_session ID: {original_session_id}') + print(f'2. Original cache exists: {browser_session._cached_browser_state_summary is not None}') + + # Create action that checks browser session identity + @controller.registry.action('Test: Check browser session identity') + async def test_check_session_identity(browser_session: BrowserSession): + from browser_use import ActionResult + + action_session_id = id(browser_session) + cache_exists = browser_session._cached_browser_state_summary is not None + return ActionResult( + extracted_content=f'Action session ID: {action_session_id}, Cache exists: {cache_exists}', include_in_memory=False + ) + + # Execute action + result = await controller.registry.execute_action('test_check_session_identity', {}, browser_session=browser_session) + + print(f'3. 
Action result: {result.extracted_content}') + + # Parse the result to check if session IDs match + action_session_id = int(result.extracted_content.split('Action session ID: ')[1].split(',')[0]) + + if original_session_id == action_session_id: + print('โœ… Same browser_session instance passed to action') + else: + print('โŒ DIFFERENT browser_session instance passed to action!') + print(f' Original: {original_session_id}') + print(f' Action: {action_session_id}') + + +@pytest.mark.asyncio +async def test_assumption_9_pydantic_private_attrs(browser_session, controller): + """Test assumption 9: Pydantic model validation affects private attributes.""" + # Go to a simple page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + print('=== PYDANTIC PRIVATE ATTRS DEBUG ===') + + # Get fresh state + await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + + print(f'1. Original browser_session cache: {browser_session._cached_browser_state_summary is not None}') + print(f'2. Original browser_session ID: {id(browser_session)}') + + # Import the SpecialActionParameters to test directly + from browser_use.controller.registry.service import SpecialActionParameters + + # Test what happens when we put browser_session through model_validate + special_params_data = { + 'context': None, + 'browser_session': browser_session, + 'browser': browser_session, + 'browser_context': browser_session, + 'page_extraction_llm': None, + 'available_file_paths': None, + 'has_sensitive_data': False, + } + + print(f'3. Before model_validate - browser_session cache: {browser_session._cached_browser_state_summary is not None}') + + # Test the fixed version using model_construct instead of model_validate + special_params = SpecialActionParameters.model_construct(**special_params_data) + + print( + f'4. 
After model_construct (fixed path) - original browser_session cache: {browser_session._cached_browser_state_summary is not None}' + ) + + # Check the browser_session that comes out of the model + extracted_browser_session = special_params.browser_session + print(f'5. Extracted browser_session ID: {id(extracted_browser_session)}') + print(f'6. Extracted browser_session cache: {extracted_browser_session._cached_browser_state_summary is not None}') + + # Check if they're the same object + if id(browser_session) == id(extracted_browser_session): + print('โœ… Same object - no copying occurred') + else: + print('โŒ DIFFERENT object - Pydantic copied the browser_session!') + + # Check if private attributes were preserved + print(f'7. Original has _cached_browser_state_summary attr: {hasattr(browser_session, "_cached_browser_state_summary")}') + print( + f'8. Extracted has _cached_browser_state_summary attr: {hasattr(extracted_browser_session, "_cached_browser_state_summary")}' + ) + + if hasattr(extracted_browser_session, '_cached_browser_state_summary'): + print(f'9. Extracted _cached_browser_state_summary value: {extracted_browser_session._cached_browser_state_summary}') + + +@pytest.mark.asyncio +async def test_assumption_7_cache_gets_cleared(browser_session, controller): + """Test assumption 7: Check if _cached_browser_state_summary gets cleared.""" + # Go to a simple page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + print('=== CACHE CLEARING DEBUG ===') + + # Check initial cache state + print(f'1. Initial cache state: {browser_session._cached_browser_state_summary}') + + # Get fresh state + state = await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + print(f'2. After get_state_summary: cache exists = {browser_session._cached_browser_state_summary is not None}') + print(f'3. Cache has {len(state.selector_map)} elements') + + # Check cache before action + print(f'4. 
Pre-action cache: {browser_session._cached_browser_state_summary is not None}') + + # Create action that checks cache state (NO page parameter) + @controller.registry.action('Test: Check cache state no page') + async def test_check_cache_state_no_page(browser_session: BrowserSession): + from browser_use import ActionResult + + cache_exists = browser_session._cached_browser_state_summary is not None + if cache_exists: + cache_size = len(browser_session._cached_browser_state_summary.selector_map) + else: + cache_size = 0 + return ActionResult( + extracted_content=f'NoPage - Cache exists: {cache_exists}, Cache size: {cache_size}', include_in_memory=False + ) + + # Create action that checks cache state (WITH page parameter) + @controller.registry.action('Test: Check cache state with page') + async def test_check_cache_state_with_page(browser_session: BrowserSession, page): + from browser_use import ActionResult + + cache_exists = browser_session._cached_browser_state_summary is not None + if cache_exists: + cache_size = len(browser_session._cached_browser_state_summary.selector_map) + else: + cache_size = 0 + return ActionResult( + extracted_content=f'WithPage - Cache exists: {cache_exists}, Cache size: {cache_size}', include_in_memory=False + ) + + # Test action WITHOUT page parameter + result_no_page = await controller.registry.execute_action( + 'test_check_cache_state_no_page', {}, browser_session=browser_session + ) + + print(f'5a. Action result (NO page): {result_no_page.extracted_content}') + + # Test action WITH page parameter + result_with_page = await controller.registry.execute_action( + 'test_check_cache_state_with_page', {}, browser_session=browser_session + ) + + print(f'5b. Action result (WITH page): {result_with_page.extracted_content}') + print(f'6. 
Post-action cache: {browser_session._cached_browser_state_summary is not None}') + + # This will tell us if the page parameter injection clears the cache + + +@pytest.mark.asyncio +async def test_final_real_click_with_debug(browser_session, controller): + """Final test: Try actual click with maximum debugging.""" + # Go to a simple page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + print('=== FINAL CLICK TEST WITH FULL DEBUG ===') + + # Get fresh state + state = await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + print(f'1. Fresh state has {len(state.selector_map)} elements') + + # Check cached map + cached_map = await browser_session.get_selector_map() + print(f'2. Cached map has {len(cached_map)} elements') + print(f'3. Element 0 in cached map: {0 in cached_map}') + + # Try the real click action + if 0 in cached_map: + print('4. Attempting real click_element_by_index...') + try: + result = await controller.registry.execute_action( + 'click_element_by_index', {'index': 0}, browser_session=browser_session + ) + print(f'5. Click SUCCESS: {result.extracted_content}') + except Exception as e: + print(f'5. Click FAILED: {e}') + + # Additional debug: check selector map inside the exception + debug_map = await browser_session.get_selector_map() + print(f'6. Post-failure selector map: {len(debug_map)} elements') + print(f'7. Element 0 still in map: {0 in debug_map}') + + raise e + else: + pytest.fail('Element 0 not found in cached map - test setup issue') diff --git a/tests/ci/test_google_sheets_real.py b/tests/ci/test_google_sheets_real.py new file mode 100644 index 000000000..878dd1ca6 --- /dev/null +++ b/tests/ci/test_google_sheets_real.py @@ -0,0 +1,130 @@ +""" +Real integration tests for Google Sheets actions against the actual Google Sheets website. +Tests the enhanced action registry system with Google Sheets keyboard automation. 
+Uses the existing Google Sheets actions from the main controller. +""" + +import os + +import pytest + +from browser_use.browser import BrowserProfile, BrowserSession +from browser_use.controller.service import Controller + +# Test Google Sheets URL (public read-only spreadsheet for testing) +TEST_GOOGLE_SHEET_URL = 'https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit' + + +@pytest.fixture +async def browser_session(): + """Create a real browser session for testing.""" + session = BrowserSession( + browser_profile=BrowserProfile( + executable_path=os.getenv('BROWSER_PATH'), + user_data_dir=None, # Use temporary profile + headless=True, + ) + ) + async with session: + yield session + + +@pytest.fixture +def controller(): + """Create a controller instance (Google Sheets actions are already registered).""" + return Controller() + + +@pytest.mark.asyncio +async def test_selector_map_basic(browser_session, controller): + """Test that the selector map gets populated on a basic page.""" + # Go to a simple page first + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + # Update browser state to populate selector map + await browser_session.get_state_summary(cache_clickable_elements_hashes=False) + + # Check selector map + selector_map = await browser_session.get_selector_map() + print(f'Selector map size: {len(selector_map)}') + + # Should have some elements + assert len(selector_map) > 0, 'No clickable elements found in selector map' + + +@pytest.mark.asyncio +async def test_click_element_basic(browser_session, controller): + """Test basic click element action to verify registry works.""" + # Go to a simple page + page = await browser_session.get_current_page() + await page.goto('https://www.google.com') + await page.wait_for_load_state() + + # Update browser state to populate selector map + await 
browser_session.get_state_summary(cache_clickable_elements_hashes=False) + + # Check selector map + selector_map = await browser_session.get_selector_map() + print(f'Available elements: {list(selector_map.keys())}') + + if len(selector_map) > 0: + # Try to click the first available element + first_index = list(selector_map.keys())[0] + print(f'Trying to click element index: {first_index}') + + result = await controller.registry.execute_action( + 'click_element_by_index', {'index': first_index}, browser_session=browser_session + ) + + # Should not have an error about element not existing + print(f'Click result: {result.extracted_content if result.extracted_content else "No content"}') + print(f'Click error: {result.error if result.error else "No error"}') + + # The click might fail for other reasons (like navigation) but shouldn't fail due to "element does not exist" + if result.error: + assert 'Element with index' not in result.error, f'Element indexing failed: {result.error}' + else: + pytest.fail('No clickable elements found - DOM processing issue') + + +@pytest.mark.asyncio +async def test_google_sheets_open(browser_session, controller): + """Test opening a Google Sheet using the existing action.""" + # First check what actions are available + available_actions = list(controller.registry.registry.actions.keys()) + print(f'Available actions: {[a for a in available_actions if "Google" in a]}') + + # Try to find the right action name + google_sheet_actions = [a for a in available_actions if 'google sheet' in a.lower()] + + if not google_sheet_actions: + pytest.skip('No Google Sheets actions found in controller') + + # Use the first Google Sheets action we find + open_action = google_sheet_actions[0] + print(f'Using action: {open_action}') + + result = await controller.registry.execute_action( + open_action, {'google_sheet_url': TEST_GOOGLE_SHEET_URL}, browser_session=browser_session + ) + + print(f'Open result: {result.extracted_content if result.extracted_content 
else "No content"}') + print(f'Open error: {result.error if result.error else "No error"}') + + # Verify we're on the Google Sheets page + page = await browser_session.get_current_page() + assert 'docs.google.com/spreadsheets' in page.url + + +@pytest.mark.asyncio +async def test_list_all_actions(browser_session, controller): + """Debug test to list all available actions.""" + available_actions = list(controller.registry.registry.actions.keys()) + print('All available actions:') + for action in sorted(available_actions): + print(f' - {action}') + + # Just verify the controller has some actions + assert len(available_actions) > 0 diff --git a/tests/ci/test_sensitive_data.py b/tests/ci/test_sensitive_data.py new file mode 100644 index 000000000..2bffe22f2 --- /dev/null +++ b/tests/ci/test_sensitive_data.py @@ -0,0 +1,255 @@ +import pytest +from langchain_core.messages import HumanMessage, SystemMessage +from pydantic import BaseModel, Field + +from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings +from browser_use.agent.views import MessageManagerState +from browser_use.controller.registry.service import Registry +from browser_use.utils import match_url_with_domain_pattern + + +class SensitiveParams(BaseModel): + """Test parameter model for sensitive data testing.""" + + text: str = Field(description='Text with sensitive data placeholders') + + +@pytest.fixture +def registry(): + return Registry() + + +@pytest.fixture +def message_manager(): + return MessageManager( + task='Test task', + system_message=SystemMessage(content='System message'), + settings=MessageManagerSettings(), + state=MessageManagerState(), + ) + + +def test_replace_sensitive_data_with_missing_keys(registry, caplog): + """Test that _replace_sensitive_data handles missing keys gracefully""" + # Set log level to capture warnings + import logging + + caplog.set_level(logging.WARNING) + + # Create a simple Pydantic model with sensitive data placeholders + params = 
SensitiveParams(text='Please enter username and password') + + # Case 1: All keys present + sensitive_data = {'username': 'user123', 'password': 'pass456'} + result = registry._replace_sensitive_data(params, sensitive_data) + assert 'user123' in result.text + assert 'pass456' in result.text + # Both keys should be replaced + assert 'Missing' not in caplog.text + caplog.clear() + + # Case 2: One key missing + sensitive_data = {'username': 'user123'} # password is missing + result = registry._replace_sensitive_data(params, sensitive_data) + assert 'user123' in result.text + assert 'password' in result.text + # Verify the behavior - username replaced, password kept as tag + assert 'password' in caplog.text + caplog.clear() + + # Case 3: Multiple keys missing + sensitive_data = {} # both keys missing + result = registry._replace_sensitive_data(params, sensitive_data) + assert 'username' in result.text + assert 'password' in result.text + # Verify both tags are preserved when keys are missing + assert 'Missing' in caplog.text + caplog.clear() + + # Case 4: One key empty + sensitive_data = {'username': 'user123', 'password': ''} + result = registry._replace_sensitive_data(params, sensitive_data) + assert 'user123' in result.text + assert 'password' in result.text + # Empty value should be treated the same as missing key + assert 'password' in caplog.text + caplog.clear() + + +def test_simple_domain_specific_sensitive_data(registry, caplog): + """Test the basic functionality of domain-specific sensitive data replacement""" + # Set log level to capture warnings + import logging + + caplog.set_level(logging.WARNING) + + # Create a simple Pydantic model with sensitive data placeholders + params = SensitiveParams(text='Please enter username and password') + + # Simple test with directly instantiable values + sensitive_data = { + 'example.com': {'username': 'example_user'}, + 'other_data': 'non_secret_value', # Old format mixed with new + } + + # Without a browser_session, it 
should still replace known keys + result = registry._replace_sensitive_data(params, sensitive_data) + assert 'example_user' in result.text + assert 'password' in result.text # Password is missing in sensitive_data + assert 'password' in caplog.text + caplog.clear() + + +def test_match_url_with_domain_pattern(): + """Test that the domain pattern matching utility works correctly""" + + # Test exact domain matches + assert match_url_with_domain_pattern('https://example.com', 'example.com') is True + assert match_url_with_domain_pattern('http://example.com', 'example.com') is False # Default scheme is now https + assert match_url_with_domain_pattern('https://google.com', 'example.com') is False + + # Test subdomain pattern matches + assert match_url_with_domain_pattern('https://sub.example.com', '*.example.com') is True + assert match_url_with_domain_pattern('https://example.com', '*.example.com') is True # Base domain should match too + assert match_url_with_domain_pattern('https://sub.sub.example.com', '*.example.com') is True + assert match_url_with_domain_pattern('https://example.org', '*.example.com') is False + + # Test protocol pattern matches + assert match_url_with_domain_pattern('https://example.com', 'http*://example.com') is True + assert match_url_with_domain_pattern('http://example.com', 'http*://example.com') is True + assert match_url_with_domain_pattern('ftp://example.com', 'http*://example.com') is False + + # Test explicit http protocol + assert match_url_with_domain_pattern('http://example.com', 'http://example.com') is True + assert match_url_with_domain_pattern('https://example.com', 'http://example.com') is False + + # Test Chrome extension pattern + assert match_url_with_domain_pattern('chrome-extension://abcdefghijkl', 'chrome-extension://*') is True + assert match_url_with_domain_pattern('chrome-extension://mnopqrstuvwx', 'chrome-extension://abcdefghijkl') is False + + # Test about:blank handling + assert 
match_url_with_domain_pattern('about:blank', 'example.com') is False + assert match_url_with_domain_pattern('about:blank', '*://*') is False + + +def test_unsafe_domain_patterns(): + """Test that unsafe domain patterns are rejected""" + + # These are unsafe patterns that could match too many domains + assert match_url_with_domain_pattern('https://evil.com', '*google.com') is False + assert match_url_with_domain_pattern('https://google.com.evil.com', '*.*.com') is False + assert match_url_with_domain_pattern('https://google.com', '**google.com') is False + assert match_url_with_domain_pattern('https://google.com', 'g*e.com') is False + assert match_url_with_domain_pattern('https://google.com', '*com*') is False + + # Test with patterns that have multiple asterisks in different positions + assert match_url_with_domain_pattern('https://subdomain.example.com', '*domain*example*') is False + assert match_url_with_domain_pattern('https://sub.domain.example.com', '*.*.example.com') is False + + # Test patterns with wildcards in TLD part + assert match_url_with_domain_pattern('https://example.com', 'example.*') is False + assert match_url_with_domain_pattern('https://example.org', 'example.*') is False + + +def test_malformed_urls_and_patterns(): + """Test handling of malformed URLs or patterns""" + + # Malformed URLs + assert match_url_with_domain_pattern('not-a-url', 'example.com') is False + assert match_url_with_domain_pattern('http://', 'example.com') is False + assert match_url_with_domain_pattern('https://', 'example.com') is False + assert match_url_with_domain_pattern('ftp:/example.com', 'example.com') is False # Missing slash + + # Empty URLs or patterns + assert match_url_with_domain_pattern('', 'example.com') is False + assert match_url_with_domain_pattern('https://example.com', '') is False + + # URLs with no hostname + assert match_url_with_domain_pattern('file:///path/to/file.txt', 'example.com') is False + + # Invalid pattern formats + assert 
match_url_with_domain_pattern('https://example.com', '..example.com') is False + assert match_url_with_domain_pattern('https://example.com', '.*.example.com') is False + assert match_url_with_domain_pattern('https://example.com', '**') is False + + # Nested URL attacks in path, query or fragments + assert match_url_with_domain_pattern('https://example.com/redirect?url=https://evil.com', 'example.com') is True + assert match_url_with_domain_pattern('https://example.com/path/https://evil.com', 'example.com') is True + assert match_url_with_domain_pattern('https://example.com#https://evil.com', 'example.com') is True + # These should match example.com, not evil.com since urlparse extracts the hostname correctly + + # Complex URL obfuscation attempts + assert match_url_with_domain_pattern('https://example.com/path?next=//evil.com/attack', 'example.com') is True + assert match_url_with_domain_pattern('https://example.com@evil.com', 'example.com') is False + assert match_url_with_domain_pattern('https://evil.com?example.com', 'example.com') is False + assert match_url_with_domain_pattern('https://user:example.com@evil.com', 'example.com') is False + # urlparse correctly identifies evil.com as the hostname in these cases + + +def test_url_components(): + """Test handling of URL components like credentials, ports, fragments, etc.""" + + # URLs with credentials (username:password@) + assert match_url_with_domain_pattern('https://user:pass@example.com', 'example.com') is True + assert match_url_with_domain_pattern('https://user:pass@example.com', '*.example.com') is True + + # URLs with ports + assert match_url_with_domain_pattern('https://example.com:8080', 'example.com') is True + assert match_url_with_domain_pattern('https://example.com:8080', 'example.com:8080') is True # Port is stripped from pattern + + # URLs with paths + assert match_url_with_domain_pattern('https://example.com/path/to/page', 'example.com') is True + assert ( + 
match_url_with_domain_pattern('https://example.com/path/to/page', 'example.com/path') is False + ) # Paths in patterns are not supported + + # URLs with query parameters + assert match_url_with_domain_pattern('https://example.com?param=value', 'example.com') is True + + # URLs with fragments + assert match_url_with_domain_pattern('https://example.com#section', 'example.com') is True + + # URLs with all components + assert match_url_with_domain_pattern('https://user:pass@example.com:8080/path?query=val#fragment', 'example.com') is True + + +def test_filter_sensitive_data(message_manager): + """Test that _filter_sensitive_data handles all sensitive data scenarios correctly""" + # Set up a message with sensitive information + message = HumanMessage(content='My username is admin and password is secret123') + + # Case 1: No sensitive data provided + message_manager.settings.sensitive_data = None + result = message_manager._filter_sensitive_data(message) + assert result.content == 'My username is admin and password is secret123' + + # Case 2: All sensitive data is properly replaced + message_manager.settings.sensitive_data = {'username': 'admin', 'password': 'secret123'} + result = message_manager._filter_sensitive_data(message) + assert 'username' in result.content + assert 'password' in result.content + + # Case 3: Make sure it works with nested content + nested_message = HumanMessage(content=[{'type': 'text', 'text': 'My username is admin and password is secret123'}]) + result = message_manager._filter_sensitive_data(nested_message) + assert 'username' in result.content[0]['text'] + assert 'password' in result.content[0]['text'] + + # Case 4: Test with empty values + message_manager.settings.sensitive_data = {'username': 'admin', 'password': ''} + result = message_manager._filter_sensitive_data(message) + assert 'username' in result.content + # Only username should be replaced since password is empty + + # Case 5: Test with domain-specific sensitive data format + 
message_manager.settings.sensitive_data = { + 'example.com': {'username': 'admin', 'password': 'secret123'}, + 'google.com': {'email': 'user@example.com', 'password': 'google_pass'}, + } + # Update the message to include the values we're going to test + message = HumanMessage(content='My username is admin, email is user@example.com and password is secret123 or google_pass') + result = message_manager._filter_sensitive_data(message) + # All sensitive values should be replaced regardless of domain + assert 'username' in result.content + assert 'password' in result.content + assert 'email' in result.content diff --git a/tests/test_tab_management.py b/tests/ci/test_tab_management.py similarity index 97% rename from tests/test_tab_management.py rename to tests/ci/test_tab_management.py index 1f40f41d0..3e5517a65 100644 --- a/tests/test_tab_management.py +++ b/tests/ci/test_tab_management.py @@ -203,24 +203,28 @@ class TestTabManagement: """Test that agent_current_page changes and human_current_page remains the same when a new tab is opened.""" initial_tab = await self._reset_tab_state(browser_session, base_url) - assert initial_tab.url == 'about:blank' + await initial_tab.goto(f'{base_url}/page1') + await self._simulate_human_tab_change(initial_tab, browser_session) + assert initial_tab.url == f'{base_url}/page1' initial_tab_count = len(browser_session.tabs) assert initial_tab_count == 1 # test opening a new tab new_tab = await browser_session.create_new_tab(f'{base_url}/page2') new_tab_count = len(browser_session.browser_context.pages) - assert new_tab_count == len(browser_session.tabs) == 2 + assert ( + new_tab_count == len(browser_session.tabs) == 2 + ) # get_current_page/create_new_tab should have auto-closed unused about:blank pages # test agent open new tab updates agent focus + doesn't steal human focus assert browser_session.agent_current_page.url == new_tab.url == f'{base_url}/page2' - assert browser_session.human_current_page.url == initial_tab.url == 
'about:blank' + assert browser_session.human_current_page.url == initial_tab.url == f'{base_url}/page1' # test agent navigation updates agent focus +doesn't steal human focus await browser_session.navigate(f'{base_url}/page3') assert browser_session.agent_current_page.url == f'{base_url}/page3' # agent should now be on the new tab assert ( - browser_session.human_current_page.url == initial_tab.url == 'about:blank' + browser_session.human_current_page.url == initial_tab.url == f'{base_url}/page1' ) # human should still be on the very first tab @pytest.mark.asyncio diff --git a/tests/test_url_allowlist_security.py b/tests/ci/test_url_allowlist_security.py similarity index 78% rename from tests/test_url_allowlist_security.py rename to tests/ci/test_url_allowlist_security.py index 70e1146b9..a157243eb 100644 --- a/tests/test_url_allowlist_security.py +++ b/tests/ci/test_url_allowlist_security.py @@ -38,23 +38,31 @@ class TestUrlAllowlistSecurity: assert browser_session._is_url_allowed('https://example.org') is False # Test more complex glob patterns - browser_profile = BrowserProfile(allowed_domains=['*google.com', 'wiki*']) + browser_profile = BrowserProfile( + allowed_domains=['*.google.com', 'https://wiki.org', 'https://good.com', 'chrome://version', 'brave://*'] + ) browser_session = BrowserSession(browser_profile=browser_profile) # Should match domains ending with google.com assert browser_session._is_url_allowed('https://google.com') is True assert browser_session._is_url_allowed('https://www.google.com') is True - assert browser_session._is_url_allowed('https://anygoogle.com') is True + assert ( + browser_session._is_url_allowed('https://evilgood.com') is False + ) # make sure we dont allow *good.com patterns, only *.good.com # Should match domains starting with wiki + assert browser_session._is_url_allowed('http://wiki.org') is False assert browser_session._is_url_allowed('https://wiki.org') is True - assert 
browser_session._is_url_allowed('https://wikipedia.org') is True - # Should not match other domains - assert browser_session._is_url_allowed('https://example.com') is False + # Should not match internal domains because scheme was not provided + assert browser_session._is_url_allowed('chrome://google.com') is False + assert browser_session._is_url_allowed('chrome://abc.google.com') is False # Test browser internal URLs - assert browser_session._is_url_allowed('chrome://settings') is True + assert browser_session._is_url_allowed('chrome://settings') is False + assert browser_session._is_url_allowed('chrome://version') is True + assert browser_session._is_url_allowed('chrome-extension://version/') is False + assert browser_session._is_url_allowed('brave://anything/') is True assert browser_session._is_url_allowed('about:blank') is True # Test security for glob patterns (authentication credentials bypass attempts) @@ -67,7 +75,7 @@ class TestUrlAllowlistSecurity: def test_glob_pattern_edge_cases(self): """Test edge cases for glob pattern matching to ensure proper behavior.""" # Test with domains containing glob pattern in the middle - browser_profile = BrowserProfile(allowed_domains=['*google.com', 'wiki*']) + browser_profile = BrowserProfile(allowed_domains=['*.google.com', 'https://wiki.org']) browser_session = BrowserSession(browser_profile=browser_profile) # Verify that 'wiki*' pattern doesn't match domains that merely contain 'wiki' in the middle @@ -79,13 +87,13 @@ class TestUrlAllowlistSecurity: assert browser_session._is_url_allowed('https://mygoogle.company.com') is False # Create context with potentially risky glob pattern that demonstrates security concerns - browser_profile = BrowserProfile(allowed_domains=['*.google.*']) + browser_profile = BrowserProfile(allowed_domains=['*.google.com', '*.google.co.uk']) browser_session = BrowserSession(browser_profile=browser_profile) # Should match legitimate Google domains assert 
browser_session._is_url_allowed('https://www.google.com') is True assert browser_session._is_url_allowed('https://mail.google.co.uk') is True - # But could also match potentially malicious domains with a subdomain structure - # This demonstrates why such wildcard patterns can be risky - assert browser_session._is_url_allowed('https://www.google.evil.com') is True + # Shouldn't match potentially malicious domains with a similar structure + # This demonstrates why the previous pattern was risky and why it's now rejected + assert browser_session._is_url_allowed('https://www.google.evil.com') is False diff --git a/tests/test_action_params.py b/tests/test_action_params.py new file mode 100644 index 000000000..c0594c264 --- /dev/null +++ b/tests/test_action_params.py @@ -0,0 +1,91 @@ +import asyncio +import logging +from inspect import signature + +import pytest +from pydantic import BaseModel, Field + +from browser_use.browser import BrowserSession +from browser_use.controller.registry.service import Registry +from browser_use.controller.registry.views import ActionModel + +# Configure logging +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + + +# Test model +class TestActionParams(ActionModel): + value: str = Field(description='Test value') + + +# Our Context type for the Registry +class TestContext: + def __init__(self, value): + self.value = value + + +@pytest.mark.asyncio +async def test_registry_param_handling(): + """Test how Registry handles parameter passing for different function signatures.""" + # Create a Registry instance + registry = Registry[TestContext]() + + # Create test functions with different signatures + + # 1. 
Function with browser_session as a positional parameter + @registry.action('Test action with browser_session', param_model=TestActionParams) + async def action_with_browser_session(params: TestActionParams, browser_session: BrowserSession): + logger.debug(f'action_with_browser_session called with params={params}, browser_session={browser_session}') + return {'params': params.model_dump(), 'has_browser': browser_session is not None} + + # 2. Function with browser_session in the model + class ModelWithBrowserSession(BaseModel): + value: str + browser_session: BrowserSession = None + + @registry.action('Test action with browser_session in model') + async def action_with_browser_in_model(params: ModelWithBrowserSession): + logger.debug(f'action_with_browser_in_model called with params={params}') + return {'params': params.model_dump(), 'has_browser': params.browser_session is not None} + + # 3. Function using **kwargs + @registry.action('Test action with kwargs') + async def action_with_kwargs(params: TestActionParams, **kwargs): + logger.debug(f'action_with_kwargs called with params={params}, kwargs={kwargs}') + return {'params': params.model_dump(), 'kwargs': kwargs} + + # Create a mock browser session + mock_browser_session = object() # Just a placeholder + + # Execute the actions + logger.debug('\n\n=== Testing action_with_browser_session ===') + result1 = await registry.execute_action( + 'action_with_browser_session', {'value': 'test1'}, browser_session=mock_browser_session + ) + logger.debug(f'Result: {result1}') + + logger.debug('\n\n=== Testing action_with_browser_in_model ===') + result2 = await registry.execute_action( + 'action_with_browser_in_model', + {'value': 'test2', 'browser_session': None}, # Browser session in model is None + browser_session=mock_browser_session, # Browser session in execute_action is provided + ) + logger.debug(f'Result: {result2}') + + logger.debug('\n\n=== Testing action_with_kwargs ===') + result3 = await 
registry.execute_action('action_with_kwargs', {'value': 'test3'}, browser_session=mock_browser_session) + logger.debug(f'Result: {result3}') + + # Print all signatures + logger.debug('\n\n=== Function Signatures ===') + logger.debug(f'action_with_browser_session: {signature(action_with_browser_session)}') + logger.debug(f'action_with_browser_in_model: {signature(action_with_browser_in_model)}') + logger.debug(f'action_with_kwargs: {signature(action_with_kwargs)}') + + return result1, result2, result3 + + +if __name__ == '__main__': + # Run the test + asyncio.run(test_registry_param_handling()) diff --git a/tests/test_browser_config_models.py b/tests/test_browser_config_models.py index 02d2279bd..a32852138 100644 --- a/tests/test_browser_config_models.py +++ b/tests/test_browser_config_models.py @@ -12,9 +12,7 @@ async def test_proxy_settings_pydantic_model(): Test that ProxySettings as a Pydantic model is correctly converted to a dictionary when used. """ # Create ProxySettings with Pydantic model - proxy_settings = ProxySettings( - server='http://example.proxy:8080', bypass='localhost', username='testuser', password='testpass' - ) + proxy_settings = dict(server='http://example.proxy:8080', bypass='localhost', username='testuser', password='testpass') # Verify the model has correct dict-like access assert proxy_settings['server'] == 'http://example.proxy:8080' @@ -22,7 +20,7 @@ async def test_proxy_settings_pydantic_model(): assert proxy_settings.get('nonexistent', 'default') == 'default' # Verify model_dump works correctly - proxy_dict = proxy_settings.model_dump() + proxy_dict = dict(proxy_settings) assert isinstance(proxy_dict, dict) assert proxy_dict['server'] == 'http://example.proxy:8080' assert proxy_dict['bypass'] == 'localhost' diff --git a/tests/test_sensitive_data.py b/tests/test_sensitive_data.py deleted file mode 100644 index 89f722bb0..000000000 --- a/tests/test_sensitive_data.py +++ /dev/null @@ -1,91 +0,0 @@ -import pytest -from 
langchain_core.messages import HumanMessage, SystemMessage -from pydantic import BaseModel, Field - -from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings -from browser_use.agent.views import MessageManagerState -from browser_use.controller.registry.service import Registry - - -class SensitiveParams(BaseModel): - """Test parameter model for sensitive data testing.""" - - text: str = Field(description='Text with sensitive data placeholders') - - -@pytest.fixture -def registry(): - return Registry() - - -@pytest.fixture -def message_manager(): - return MessageManager( - task='Test task', - system_message=SystemMessage(content='System message'), - settings=MessageManagerSettings(), - state=MessageManagerState(), - ) - - -def test_replace_sensitive_data_with_missing_keys(registry): - """Test that _replace_sensitive_data handles missing keys gracefully""" - # Create a simple Pydantic model with sensitive data placeholders - params = SensitiveParams(text='Please enter username and password') - - # Case 1: All keys present - sensitive_data = {'username': 'user123', 'password': 'pass456'} - result = registry._replace_sensitive_data(params, sensitive_data) - assert 'user123' in result.text - assert 'pass456' in result.text - # Both keys should be replaced - - # Case 2: One key missing - sensitive_data = {'username': 'user123'} # password is missing - result = registry._replace_sensitive_data(params, sensitive_data) - assert 'user123' in result.text - assert 'password' in result.text - # Verify the behavior - username replaced, password kept as tag - - # Case 3: Multiple keys missing - sensitive_data = {} # both keys missing - result = registry._replace_sensitive_data(params, sensitive_data) - assert 'username' in result.text - assert 'password' in result.text - # Verify both tags are preserved when keys are missing - - # Case 4: One key empty - sensitive_data = {'username': 'user123', 'password': ''} - result = 
registry._replace_sensitive_data(params, sensitive_data) - assert 'user123' in result.text - assert 'password' in result.text - # Empty value should be treated the same as missing key - - -def test_filter_sensitive_data(message_manager): - """Test that _filter_sensitive_data handles all sensitive data scenarios correctly""" - # Set up a message with sensitive information - message = HumanMessage(content='My username is admin and password is secret123') - - # Case 1: No sensitive data provided - message_manager.settings.sensitive_data = None - result = message_manager._filter_sensitive_data(message) - assert result.content == 'My username is admin and password is secret123' - - # Case 2: All sensitive data is properly replaced - message_manager.settings.sensitive_data = {'username': 'admin', 'password': 'secret123'} - result = message_manager._filter_sensitive_data(message) - assert 'username' in result.content - assert 'password' in result.content - - # Case 3: Make sure it works with nested content - nested_message = HumanMessage(content=[{'type': 'text', 'text': 'My username is admin and password is secret123'}]) - result = message_manager._filter_sensitive_data(nested_message) - assert 'username' in result.content[0]['text'] - assert 'password' in result.content[0]['text'] - - # Case 4: Test with empty values - message_manager.settings.sensitive_data = {'username': 'admin', 'password': ''} - result = message_manager._filter_sensitive_data(message) - assert 'username' in result.content - # Only username should be replaced since password is empty