diff --git a/.github/workflows/cloud_evals.yml b/.github/workflows/cloud_evals.yml
index 33d5f75c3..9dd97f482 100644
--- a/.github/workflows/cloud_evals.yml
+++ b/.github/workflows/cloud_evals.yml
@@ -16,6 +16,8 @@ on:
description: Commit hash of the library to build the Cloud eval image for
required: false
+permissions: {}
+
jobs:
trigger_cloud_eval_image_build:
runs-on: ubuntu-latest
diff --git a/.github/workflows/install-script.yml b/.github/workflows/install-script.yml
index ccc3316fa..3ab8ed047 100644
--- a/.github/workflows/install-script.yml
+++ b/.github/workflows/install-script.yml
@@ -13,6 +13,9 @@ on:
- '.github/workflows/install-script.yml'
workflow_dispatch:
+permissions:
+ contents: read
+
# Cancel in-progress runs when a new commit is pushed
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -26,16 +29,15 @@ env:
jobs:
# ===========================================================================
- # Test install.sh with different modes on all platforms
+ # Test install.sh on all platforms
# ===========================================================================
test-install-sh-linux:
- name: install.sh ${{ matrix.mode }} (Linux ${{ matrix.os }})
+ name: install.sh (Linux ${{ matrix.os }})
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, ubuntu-22.04]
- mode: [--remote-only, --local-only, --full]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
@@ -45,8 +47,8 @@ jobs:
with:
python-version: '3.11'
- - name: Run install.sh ${{ matrix.mode }}
- run: bash browser_use/skill_cli/install.sh ${{ matrix.mode }}
+ - name: Run install.sh
+ run: bash browser_use/skill_cli/install.sh
- name: Add to PATH
run: |
@@ -58,65 +60,31 @@ jobs:
source ~/.browser-use-env/bin/activate
browser-use --help
- - name: Verify install-config.json
- run: |
- cat ~/.browser-use/install-config.json
- # Verify expected modes based on install flag
- if [[ "${{ matrix.mode }}" == "--remote-only" ]]; then
- grep -q '"remote"' ~/.browser-use/install-config.json
- grep -q '"default_mode": "remote"' ~/.browser-use/install-config.json
- elif [[ "${{ matrix.mode }}" == "--local-only" ]]; then
- grep -q '"chromium"' ~/.browser-use/install-config.json
- grep -q '"default_mode": "chromium"' ~/.browser-use/install-config.json
- elif [[ "${{ matrix.mode }}" == "--full" ]]; then
- grep -q '"chromium"' ~/.browser-use/install-config.json
- grep -q '"remote"' ~/.browser-use/install-config.json
- fi
-
- - name: Verify Chromium installed (local/full only)
- if: matrix.mode != '--remote-only'
+ - name: Verify Chromium installed
run: |
source ~/.browser-use-env/bin/activate
- # Check playwright browsers are installed
- uvx playwright install --dry-run chromium 2>&1 | grep -i "chromium" || true
# Verify chromium binary exists in playwright cache
ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chrome 2>/dev/null || \
ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \
echo "Chromium binary check completed"
- - name: Verify cloudflared installed (remote/full only)
- if: matrix.mode != '--local-only'
- run: |
- which cloudflared || ls ~/.local/bin/cloudflared
- cloudflared --version
-
- - name: Verify cloudflared NOT installed (local-only)
- if: matrix.mode == '--local-only'
- run: |
- if command -v cloudflared &> /dev/null; then
- echo "ERROR: cloudflared should not be installed in local-only mode"
- exit 1
- fi
- echo "Confirmed: cloudflared not installed (expected for local-only)"
-
- name: Run browser-use doctor
run: |
source ~/.browser-use-env/bin/activate
browser-use doctor
test-install-sh-macos:
- name: install.sh ${{ matrix.mode }} (macOS ${{ matrix.os }})
+ name: install.sh (macOS ${{ matrix.os }})
strategy:
fail-fast: false
matrix:
os: [macos-latest, macos-14]
- mode: [--remote-only, --local-only, --full]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- - name: Run install.sh ${{ matrix.mode }}
- run: bash browser_use/skill_cli/install.sh ${{ matrix.mode }}
+ - name: Run install.sh
+ run: bash browser_use/skill_cli/install.sh
- name: Add to PATH
run: |
@@ -128,22 +96,7 @@ jobs:
source ~/.browser-use-env/bin/activate
browser-use --help
- - name: Verify install-config.json
- run: |
- cat ~/.browser-use/install-config.json
- if [[ "${{ matrix.mode }}" == "--remote-only" ]]; then
- grep -q '"remote"' ~/.browser-use/install-config.json
- grep -q '"default_mode": "remote"' ~/.browser-use/install-config.json
- elif [[ "${{ matrix.mode }}" == "--local-only" ]]; then
- grep -q '"chromium"' ~/.browser-use/install-config.json
- grep -q '"default_mode": "chromium"' ~/.browser-use/install-config.json
- elif [[ "${{ matrix.mode }}" == "--full" ]]; then
- grep -q '"chromium"' ~/.browser-use/install-config.json
- grep -q '"remote"' ~/.browser-use/install-config.json
- fi
-
- - name: Verify Chromium installed (local/full only)
- if: matrix.mode != '--remote-only'
+ - name: Verify Chromium installed
run: |
source ~/.browser-use-env/bin/activate
# Check playwright cache for chromium
@@ -151,32 +104,13 @@ jobs:
ls ~/Library/Caches/ms-playwright/chromium-*/Chromium.app 2>/dev/null || \
echo "Chromium binary check completed"
- - name: Verify cloudflared installed (remote/full only)
- if: matrix.mode != '--local-only'
- run: |
- which cloudflared || ls ~/.local/bin/cloudflared
- cloudflared --version
-
- - name: Verify cloudflared NOT installed (local-only)
- if: matrix.mode == '--local-only'
- run: |
- if command -v cloudflared &> /dev/null; then
- echo "ERROR: cloudflared should not be installed in local-only mode"
- exit 1
- fi
- echo "Confirmed: cloudflared not installed (expected for local-only)"
-
- name: Run browser-use doctor
run: |
source ~/.browser-use-env/bin/activate
browser-use doctor
test-install-sh-windows:
- name: install.sh ${{ matrix.mode }} (Windows)
- strategy:
- fail-fast: false
- matrix:
- mode: [--remote-only, --local-only, --full]
+ name: install.sh (Windows)
runs-on: windows-latest
defaults:
run:
@@ -192,8 +126,8 @@ jobs:
with:
python-version: '3.11'
- - name: Run install.sh ${{ matrix.mode }}
- run: bash browser_use/skill_cli/install.sh ${{ matrix.mode }}
+ - name: Run install.sh
+ run: bash browser_use/skill_cli/install.sh
- name: Add to PATH
run: |
@@ -205,18 +139,6 @@ jobs:
source ~/.browser-use-env/Scripts/activate
browser-use --help
- - name: Verify install-config.json
- run: |
- cat ~/.browser-use/install-config.json
- if [[ "${{ matrix.mode }}" == "--remote-only" ]]; then
- grep -q '"remote"' ~/.browser-use/install-config.json
- elif [[ "${{ matrix.mode }}" == "--local-only" ]]; then
- grep -q '"chromium"' ~/.browser-use/install-config.json
- elif [[ "${{ matrix.mode }}" == "--full" ]]; then
- grep -q '"chromium"' ~/.browser-use/install-config.json
- grep -q '"remote"' ~/.browser-use/install-config.json
- fi
-
- name: Run browser-use doctor
run: |
source ~/.browser-use-env/Scripts/activate
@@ -245,7 +167,7 @@ jobs:
# Install from current branch
uv pip install .
- - name: Run browser-use install (installs Chromium only, not cloudflared)
+ - name: Run browser-use install (installs Chromium)
run: |
source .venv/bin/activate
browser-use install
@@ -262,9 +184,6 @@ jobs:
ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \
echo "Chromium check completed"
- # Note: browser-use install only installs Chromium, not cloudflared
- # Users should install cloudflared separately if needed for tunneling
-
- name: Run browser-use doctor
run: |
source .venv/bin/activate
@@ -295,7 +214,6 @@ jobs:
- name: Test uvx with local wheel
run: |
- # Install the wheel we just built
WHEEL=$(ls dist/*.whl)
uvx --from "$WHEEL" browser-use --help
@@ -310,8 +228,6 @@ jobs:
ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \
echo "Chromium check completed"
- # Note: browser-use install only installs Chromium, not cloudflared
-
- name: Test uvx browser-use doctor
run: |
WHEEL=$(ls dist/*.whl)
@@ -345,7 +261,5 @@ jobs:
ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \
echo "Chromium check completed"
- # Note: browser-use install only installs Chromium, not cloudflared
-
- name: Test uvx browser-use doctor
run: uvx "browser-use[cli]" doctor
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index c40046dee..af70548a8 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -16,6 +16,9 @@ on:
pull_request:
workflow_dispatch:
+permissions:
+ contents: read
+
jobs:
lint-syntax:
name: syntax-errors
@@ -35,7 +38,8 @@ jobs:
- uses: astral-sh/setup-uv@v5
with:
enable-cache: true
- - run: uv sync --dev --all-extras # install extras for examples to avoid pyright missing imports errors
+ - run: uv python install 3.11
+ - run: uv sync --dev --all-extras --python 3.11
- run: uv run --no-sync pre-commit run --all-files --show-diff-on-failure
lint-typecheck:
diff --git a/.github/workflows/package.yaml b/.github/workflows/package.yaml
index 981d783f9..cd9eb91af 100644
--- a/.github/workflows/package.yaml
+++ b/.github/workflows/package.yaml
@@ -15,6 +15,9 @@ on:
- '*'
workflow_dispatch:
+permissions:
+ contents: read
+
jobs:
build:
name: pip-build
diff --git a/.github/workflows/stale-bot.yml b/.github/workflows/stale-bot.yml
index 779080e0e..ac943c73b 100644
--- a/.github/workflows/stale-bot.yml
+++ b/.github/workflows/stale-bot.yml
@@ -12,7 +12,7 @@ jobs:
stale:
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@v9
+ - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9
with:
# General settings
repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d3bb348bc..597c4344d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,6 @@
+default_language_version:
+ python: python3.11
+
repos:
- repo: https://github.com/asottile/yesqa
rev: v1.5.0
diff --git a/AGENTS.md b/AGENTS.md
index 4e370fa53..1d71f5d2e 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -36,7 +36,7 @@ uv sync
To get started with Browser Use you need to install the package and create an `.env` file with your API key.
- `ChatBrowserUse` offers the [fastest and most cost-effective models](https://browser-use.com/posts/speed-matters/), completing tasks 3-5x faster. Get started with \$10 of [free LLM credits](https://cloud.browser-use.com/new-api-key).
+ `ChatBrowserUse` offers the [fastest and most cost-effective models](https://browser-use.com/posts/speed-matters/), completing tasks 3-5x faster. Get your API key at [cloud.browser-use.com](https://cloud.browser-use.com/new-api-key).
## 1. Installing Browser-Use
@@ -61,7 +61,7 @@ uvx browser-use install
Create a `.env` file and add your API key.
- We recommend using ChatBrowserUse which is optimized for browser automation tasks (highest accuracy + fastest speed + lowest token cost). Don't have one? We give you **\$10** to try it out [here](https://cloud.browser-use.com/new-api-key).
+ We recommend using ChatBrowserUse which is optimized for browser automation tasks (highest accuracy + fastest speed + lowest token cost). Get your API key [here](https://cloud.browser-use.com/new-api-key).
```bash .env theme={null}
@@ -76,7 +76,7 @@ Then add your API key to the file.
```bash Browser Use theme={null}
# add your key to .env file
BROWSER_USE_API_KEY=
- # Get 10$ of free credits at https://cloud.browser-use.com/new-api-key
+ # Get your API key at https://cloud.browser-use.com/new-api-key
```
```bash Google theme={null}
@@ -256,7 +256,7 @@ Your cloud browser is already logged in!
***
-For more sandbox parameters and events, see [Sandbox Quickstart](https://docs.browser-use.com/customize/sandbox/quickstart).
+For more sandbox parameters and events, see [Sandbox Quickstart](https://docs.browser-use.com/legacy/sandbox/quickstart).
# Agent Basics
```python theme={null}
@@ -538,7 +538,7 @@ async def main():
> Complete reference for all browser configuration options
- The `Browser` instance also provides all [Actor](https://docs.browser-use.com/customize/actor/all-parameters) methods for direct browser control (page management, element interactions, etc.).
+ The `Browser` instance also provides all [Actor](https://docs.browser-use.com/legacy/actor/all-parameters) methods for direct browser control (page management, element interactions, etc.).
## Core Settings
@@ -798,7 +798,7 @@ The agent injects parameters by name matching, so using the wrong name will caus
- Use `browser_session` parameter in tools for deterministic [Actor](https://docs.browser-use.com/customize/actor/basics) actions.
+ Use `browser_session` parameter in tools for deterministic [Actor](https://docs.browser-use.com/legacy/actor/basics) actions.
diff --git a/CLOUD.md b/CLOUD.md
index e4e3dac81..9e8ee0487 100644
--- a/CLOUD.md
+++ b/CLOUD.md
@@ -19,7 +19,7 @@ The key product of Browser Use Cloud is the completion of user tasks.
- Profile Sync is the best way to handle authentication for tasks. This feature allows users to upload their local browser cookies (where the user is already logged into the services they need authentication for) to a Browser Profile that can be used for tasks on the cloud. To initiate a Profile Sync, a user must run `export BROWSER_USE_API_KEY= && curl -fsSL https://browser-use.com/profile.sh | sh` and follow the steps in the interactive terminal.
## Quickstart
-To get started, direct the user to first must create an account, purchase credits (or simply claim the free starter credits given on account creation), and generate an API key on the Browser Use online platform: https://cloud.browser-use.com/. These are the only steps that can only be done on the platform.
+To get started, direct the user to first create an account, purchase credits (or simply claim the five free tasks given on account creation), and generate an API key on the Browser Use online platform: https://cloud.browser-use.com/. These steps can only be done on the platform.
Avoid giving the user all of the following steps at once as it may seem overwheling. Instead present one step at a time and only continue when asked. Do as much for the user as you are able to.
diff --git a/README.md b/README.md
index 0c64a69be..0b2da4b6f 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
-
+
---
@@ -33,12 +33,12 @@
-
+
-🌤️ Want to skip the setup? Use our [cloud](https://cloud.browser-use.com) for faster, scalable, stealth-enabled browser automation!
+🌤️ Want to skip the setup? Use our [cloud](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-skip-setup) for faster, scalable, stealth-enabled browser automation!
# 🤖 LLM Quickstart
@@ -49,77 +49,99 @@
# 👋 Human Quickstart
-**1. Create environment with [uv](https://docs.astral.sh/uv/) (Python>=3.11):**
+**1. Create environment and install Browser-Use with [uv](https://docs.astral.sh/uv/) (Python>=3.11):**
```bash
-uv init
+uv init && uv add browser-use && uv sync
+# uvx browser-use install # Run if you don't have Chromium installed
```
-**2. Install Browser-Use package:**
-```bash
-# We ship every day - use the latest version!
-uv add browser-use
-uv sync
-```
-
-**3. Get your API key from [Browser Use Cloud](https://cloud.browser-use.com/new-api-key) and add it to your `.env` file (new signups get $10 free credits):**
+**2. [Optional] Get your API key from [Browser Use Cloud](https://cloud.browser-use.com/new-api-key?utm_source=github&utm_medium=readme-quickstart-api-key):**
```
# .env
BROWSER_USE_API_KEY=your-key
+# GOOGLE_API_KEY=your-key
+# ANTHROPIC_API_KEY=your-key
```
-**4. Install Chromium browser:**
-```bash
-uvx browser-use install
-```
-
-**5. Run your first agent:**
+**3. Run your first agent:**
```python
from browser_use import Agent, Browser, ChatBrowserUse
+# from browser_use import ChatGoogle # ChatGoogle(model='gemini-3-flash-preview')
+# from browser_use import ChatAnthropic # ChatAnthropic(model='claude-sonnet-4-6')
import asyncio
-async def example():
+async def main():
browser = Browser(
- # use_cloud=True, # Uncomment to use a stealth browser on Browser Use Cloud
+ # use_cloud=True, # Use a stealth browser on Browser Use Cloud
)
- llm = ChatBrowserUse()
-
agent = Agent(
task="Find the number of stars of the browser-use repo",
- llm=llm,
+ llm=ChatBrowserUse(),
+ # llm=ChatGoogle(model='gemini-3-flash-preview'),
+ # llm=ChatAnthropic(model='claude-sonnet-4-6'),
browser=browser,
)
-
- history = await agent.run()
- return history
+ await agent.run()
if __name__ == "__main__":
- history = asyncio.run(example())
+ asyncio.run(main())
```
-Check out the [library docs](https://docs.browser-use.com) and the [cloud docs](https://docs.cloud.browser-use.com) for more!
+Check out the [library docs](https://docs.browser-use.com/open-source/introduction) and the [cloud docs](https://docs.cloud.browser-use.com?utm_source=github&utm_medium=readme-cloud-docs) for more!
-# 🔥 Deploy on Sandboxes
+# Open Source vs Cloud
-We handle agents, browsers, persistence, auth, cookies, and LLMs. The agent runs right next to the browser for minimal latency.
+
+
+
+
+
-```python
-from browser_use import Browser, sandbox, ChatBrowserUse
-from browser_use.agent.service import Agent
-import asyncio
+We benchmark Browser Use across 100 real-world browser tasks. Full benchmark is open source: **[browser-use/benchmark](https://github.com/browser-use/benchmark)**.
-@sandbox()
-async def my_task(browser: Browser):
- agent = Agent(task="Find the top HN post", browser=browser, llm=ChatBrowserUse())
- await agent.run()
+**Use the Open-Source Agent**
+- You need [custom tools](https://docs.browser-use.com/customize/tools/basics) or deep code-level integration
+- We recommend pairing with our [cloud browsers](https://docs.browser-use.com/open-source/customize/browser/remote) for leading stealth, proxy rotation, and scaling
+- Or self-host the open-source agent fully on your own machines
-# Just call it like any async function
-asyncio.run(my_task())
-```
+**Use the [Fully-Hosted Cloud Agent](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-hosted-agent) (recommended)**
+- Much more powerful agent for complex tasks (see plot above)
+- Easiest way to start and scale
+- Best stealth with proxy rotation and captcha solving
+- 1000+ integrations (Gmail, Slack, Notion, and more)
+- Persistent filesystem and memory
-See [Going to Production](https://docs.browser-use.com/production) for more details.
+
+
+# Demos
+
+
+### 📋 Form-Filling
+#### Task = "Fill in this job application with my resume and information."
+
+[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/apply_to_job.py)
+
+
+### 🍎 Grocery-Shopping
+#### Task = "Put this list of items into my instacart."
+
+https://github.com/user-attachments/assets/a6813fa7-4a7c-40a6-b4aa-382bf88b1850
+
+[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/buy_groceries.py)
+
+
+### 💻 Personal-Assistant.
+#### Task = "Help me find parts for a custom PC."
+
+https://github.com/user-attachments/assets/ac34f75c-057a-43ef-ad06-5b2c9d42bf06
+
+[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/pcpartpicker.py)
+
+
+### 💡See [more examples here ↗](https://docs.browser-use.com/examples) and give us a star!
@@ -170,35 +192,6 @@ curl -o ~/.claude/skills/browser-use/SKILL.md \
-# Demos
-
-
-### 📋 Form-Filling
-#### Task = "Fill in this job application with my resume and information."
-
-[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/apply_to_job.py)
-
-
-### 🍎 Grocery-Shopping
-#### Task = "Put this list of items into my instacart."
-
-https://github.com/user-attachments/assets/a6813fa7-4a7c-40a6-b4aa-382bf88b1850
-
-[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/buy_groceries.py)
-
-
-### 💻 Personal-Assistant.
-#### Task = "Help me find parts for a custom PC."
-
-https://github.com/user-attachments/assets/ac34f75c-057a-43ef-ad06-5b2c9d42bf06
-
-[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/pcpartpicker.py)
-
-
-### 💡See [more examples here ↗](https://docs.browser-use.com/examples) and give us a star!
-
-
-
## Integrations, hosting, custom tools, MCP, and more on our [Docs ↗](https://docs.browser-use.com)
@@ -218,6 +211,15 @@ We optimized **ChatBrowserUse()** specifically for browser automation tasks. On
For other LLM providers, see our [supported models documentation](https://docs.browser-use.com/supported-models).
+
+Should I use the Browser Use system prompt with the open-source preview model?
+
+Yes. If you use `ChatBrowserUse(model='browser-use/bu-30b-a3b-preview')` with a normal `Agent(...)`, Browser Use still sends its default agent system prompt for you.
+
+You do **not** need to add a separate custom "Browser Use system message" just because you switched to the open-source preview model. Only use `extend_system_message` or `override_system_message` when you intentionally want to customize the default behavior for your task.
+
+If you want the best default speed/accuracy, we still recommend the newer hosted `bu-*` models. If you want the open-source preview model, the setup stays the same apart from the `model=` value.
+
Can I use custom tools with the agent?
@@ -249,6 +251,12 @@ agent = Agent(
Yes! Browser-Use is open source and free to use. You only need to choose an LLM provider (like OpenAI, Google, ChatBrowserUse, or run local models with Ollama).
+
+Terms of Service
+
+This open-source library is licensed under the MIT License. For Browser Use services & data policy, see our [Terms of Service](https://browser-use.com/legal/terms-of-service) and [Privacy Policy](https://browser-use.com/privacy/).
+
+
How do I handle authentication?
@@ -263,7 +271,7 @@ These examples show how to maintain sessions and handle authentication seamlessl
How do I solve CAPTCHAs?
-For CAPTCHA handling, you need better browser fingerprinting and proxies. Use [Browser Use Cloud](https://cloud.browser-use.com) which provides stealth browsers designed to avoid detection and CAPTCHA challenges.
+For CAPTCHA handling, you need better browser fingerprinting and proxies. Use [Browser Use Cloud](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-faq-captcha) which provides stealth browsers designed to avoid detection and CAPTCHA challenges.
@@ -271,7 +279,7 @@ For CAPTCHA handling, you need better browser fingerprinting and proxies. Use [B
Chrome can consume a lot of memory, and running many agents in parallel can be tricky to manage.
-For production use cases, use our [Browser Use Cloud API](https://cloud.browser-use.com) which handles:
+For production use cases, use our [Browser Use Cloud API](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-faq-production) which handles:
- Scalable browser infrastructure
- Memory management
- Proxy rotation
diff --git a/browser_use/__init__.py b/browser_use/__init__.py
index d275a4f16..946ceba12 100644
--- a/browser_use/__init__.py
+++ b/browser_use/__init__.py
@@ -52,7 +52,6 @@ if TYPE_CHECKING:
from browser_use.agent.views import ActionModel, ActionResult, AgentHistoryList
from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.browser import BrowserSession as Browser
- from browser_use.code_use.service import CodeAgent
from browser_use.dom.service import DomService
from browser_use.llm import models
from browser_use.llm.anthropic.chat import ChatAnthropic
@@ -60,6 +59,7 @@ if TYPE_CHECKING:
from browser_use.llm.browser_use.chat import ChatBrowserUse
from browser_use.llm.google.chat import ChatGoogle
from browser_use.llm.groq.chat import ChatGroq
+ from browser_use.llm.litellm.chat import ChatLiteLLM
from browser_use.llm.mistral.chat import ChatMistral
from browser_use.llm.oci_raw.chat import ChatOCIRaw
from browser_use.llm.ollama.chat import ChatOllama
@@ -72,8 +72,6 @@ if TYPE_CHECKING:
_LAZY_IMPORTS = {
# Agent service (heavy due to dependencies)
# 'Agent': ('browser_use.agent.service', 'Agent'),
- # Code-use agent (Jupyter notebook-like execution)
- 'CodeAgent': ('browser_use.code_use.service', 'CodeAgent'),
'Agent': ('browser_use.agent.service', 'Agent'),
# System prompt (moderate weight due to agent.views imports)
'SystemPrompt': ('browser_use.agent.prompts', 'SystemPrompt'),
@@ -95,6 +93,7 @@ _LAZY_IMPORTS = {
'ChatAnthropic': ('browser_use.llm.anthropic.chat', 'ChatAnthropic'),
'ChatBrowserUse': ('browser_use.llm.browser_use.chat', 'ChatBrowserUse'),
'ChatGroq': ('browser_use.llm.groq.chat', 'ChatGroq'),
+ 'ChatLiteLLM': ('browser_use.llm.litellm.chat', 'ChatLiteLLM'),
'ChatMistral': ('browser_use.llm.mistral.chat', 'ChatMistral'),
'ChatAzureOpenAI': ('browser_use.llm.azure.chat', 'ChatAzureOpenAI'),
'ChatOCIRaw': ('browser_use.llm.oci_raw.chat', 'ChatOCIRaw'),
@@ -131,8 +130,6 @@ def __getattr__(name: str):
__all__ = [
'Agent',
- 'CodeAgent',
- # 'CodeAgent',
'BrowserSession',
'Browser', # Alias for BrowserSession
'BrowserProfile',
@@ -148,6 +145,7 @@ __all__ = [
'ChatAnthropic',
'ChatBrowserUse',
'ChatGroq',
+ 'ChatLiteLLM',
'ChatMistral',
'ChatAzureOpenAI',
'ChatOCIRaw',
diff --git a/browser_use/agent/cloud_events.py b/browser_use/agent/cloud_events.py
index ed7b3c4b3..43142f8b1 100644
--- a/browser_use/agent/cloud_events.py
+++ b/browser_use/agent/cloud_events.py
@@ -8,7 +8,7 @@ from bubus import BaseEvent
from pydantic import Field, field_validator
from uuid_extensions import uuid7str
-MAX_STRING_LENGTH = 100000 # 100K chars ~ 25k tokens should be enough
+MAX_STRING_LENGTH = 500000 # 500K chars ~ 125k tokens should be enough
MAX_URL_LENGTH = 100000
MAX_TASK_LENGTH = 100000
MAX_COMMENT_LENGTH = 2000
@@ -38,6 +38,8 @@ class UpdateAgentTaskEvent(BaseEvent):
raise ValueError('Agent must have _task_start_time attribute')
done_output = agent.history.final_result() if agent.history else None
+ if done_output and len(done_output) > MAX_STRING_LENGTH:
+ done_output = done_output[:MAX_STRING_LENGTH]
return cls(
id=str(agent.task_id),
user_id='', # To be filled by cloud handler
diff --git a/browser_use/agent/gif.py b/browser_use/agent/gif.py
index 6bbf0b86f..eaf5b091c 100644
--- a/browser_use/agent/gif.py
+++ b/browser_use/agent/gif.py
@@ -108,7 +108,6 @@ def create_history_gif(
font_name = os.path.join(CONFIG.WIN_FONT_DIR, font_name + '.ttf')
regular_font = ImageFont.truetype(font_name, font_size)
title_font = ImageFont.truetype(font_name, title_font_size)
- goal_font = ImageFont.truetype(font_name, goal_font_size)
font_loaded = True
break
except OSError:
@@ -121,8 +120,6 @@ def create_history_gif(
regular_font = ImageFont.load_default()
title_font = ImageFont.load_default()
- goal_font = regular_font
-
# Load logo if requested
logo = None
if show_logo:
@@ -236,8 +233,6 @@ def _create_task_frame(
# Start with base font size (regular + 16)
base_font_size = regular_font.size + 16
min_font_size = max(regular_font.size - 10, 16) # Don't go below 16pt
- max_font_size = base_font_size # Cap at the base font size
-
# Calculate dynamic font size based on text length and complexity
# Longer texts get progressively smaller fonts
text_length = len(task)
diff --git a/browser_use/agent/judge.py b/browser_use/agent/judge.py
index 3d840ef44..d17232721 100644
--- a/browser_use/agent/judge.py
+++ b/browser_use/agent/judge.py
@@ -88,6 +88,8 @@ def construct_judge_messages(
)
)
+ current_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
+
# System prompt for judge - conditionally add ground truth section
ground_truth_section = ''
if ground_truth:
@@ -168,7 +170,7 @@ Set `reached_captcha` to true if:
- **evaluate for action** - For each key step of the trace, double check whether the action that the agent tried to performed actually happened. If the required action did not actually occur, the verdict should be false.
- **screenshot is not entire content** - The agent has the entire DOM content, but the screenshot is only part of the content. If the agent extracts information from the page, but you do not see it in the screenshot, you can assume this information is there.
- **Penalize poor tool usage** - Wrong tools, inefficient approaches, ignoring available information.
-- **ignore unexpected dates and times** - These agent traces are from varying dates, you can assume the dates the agent uses for search or filtering are correct.
+- **current date/time is {current_date}** - content with recent dates is real, not fabricated.
- **IMPORTANT**: be very picky about the user's request - Have very high standard for the agent completing the task exactly to the user's request.
- **IMPORTANT**: be initially doubtful of the agent's self reported success, be sure to verify that its methods are valid and fulfill the user's desires to a tee.
@@ -221,54 +223,3 @@ Evaluate this agent execution given the criteria and respond with the exact JSON
SystemMessage(content=system_prompt),
UserMessage(content=content_parts),
]
-
-
-def construct_simple_judge_messages(
- task: str,
- final_result: str,
-) -> list[BaseMessage]:
- """Construct lightweight judge messages to validate agent success claims.
-
- Always runs regardless of use_judge setting. Text-only — no screenshots,
- no trajectory. Just task + final result.
- """
- task_truncated = _truncate_text(task, 20000)
- final_result_truncated = _truncate_text(final_result, 20000)
-
- current_date = datetime.now(timezone.utc).strftime('%Y-%m-%d')
-
- system_prompt = f"""You are a strict verifier checking whether a browser automation agent actually completed its task.
-
-Today's date is {current_date}. The agent ran recently — dates near today are expected and NOT fabricated.
-
-Given the task and the agent's final response, determine if the response genuinely satisfies ALL requirements.
-
-Check for these common failure patterns:
-1. **Incorrect data**: Wrong number of items, missing filters/criteria, wrong format
-2. **Unverified actions**: Agent claims to have submitted a form, posted a comment, or saved a file but there's no evidence
-3. **Incomplete results**: Some requirements from the task are not addressed in the response
-4. **Fabricated content**: Data that looks plausible but wasn't actually extracted from any page. NOTE: dates and times close to today's date ({current_date}) are NOT fabricated — the agent browses live websites and extracts real-time content.
-5. **Partial completion reported as success**: Response acknowledges failure or blockers (captcha, access denied, etc.) but still claims success
-
-Respond with EXACTLY this JSON structure:
-{{
- "is_correct": true or false,
- "reason": "Brief explanation if not correct, empty string if correct"
-}}
-
-Be strict: if the response doesn't clearly satisfy every requirement, set is_correct to false."""
-
- user_prompt = f"""
-{task_truncated or 'No task provided'}
-
-
-
-{final_result_truncated or 'No response provided'}
-
-
-Does the agent's response fully satisfy all requirements of the task? Respond with the JSON structure."""
-
- return [
- SystemMessage(content=system_prompt),
- UserMessage(content=user_prompt),
- ]
diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py
index 5e231c942..6c7cae11a 100644
--- a/browser_use/agent/message_manager/service.py
+++ b/browser_use/agent/message_manager/service.py
@@ -25,7 +25,12 @@ from browser_use.llm.messages import (
UserMessage,
)
from browser_use.observability import observe_debug
-from browser_use.utils import match_url_with_domain_pattern, time_execution_sync
+from browser_use.utils import (
+ collect_sensitive_data_values,
+ match_url_with_domain_pattern,
+ redact_sensitive_string,
+ time_execution_sync,
+)
logger = logging.getLogger(__name__)
@@ -114,6 +119,7 @@ class MessageManager:
include_recent_events: bool = False,
sample_images: list[ContentPartTextParam | ContentPartImageParam] | None = None,
llm_screenshot_size: tuple[int, int] | None = None,
+ max_clickable_elements_length: int = 40000,
):
self.task = task
self.state = state
@@ -127,6 +133,7 @@ class MessageManager:
self.include_recent_events = include_recent_events
self.sample_images = sample_images
self.llm_screenshot_size = llm_screenshot_size
+ self.max_clickable_elements_length = max_clickable_elements_length
assert max_history_items is None or max_history_items > 5, 'max_history_items must be None or greater than 5'
@@ -144,7 +151,13 @@ class MessageManager:
"""Build agent history description from list of items, respecting max_history_items limit"""
compacted_prefix = ''
if self.state.compacted_memory:
- compacted_prefix = f'\n{self.state.compacted_memory}\n \n'
+ compacted_prefix = (
+ '\n'
+ '\n'
+ f'{self.state.compacted_memory}\n'
+ ' \n'
+ )
if self.max_history_items is None:
# Include all items
@@ -247,6 +260,9 @@ class MessageManager:
'You are summarizing an agent run for prompt compaction.\n'
'Capture task requirements, key facts, decisions, partial progress, errors, and next steps.\n'
'Preserve important entities, values, URLs, and file paths.\n'
+ 'CRITICAL: Only mark a step as completed if you see explicit success confirmation in the history. '
+ 'If a step was started but not explicitly confirmed complete, mark it as "IN-PROGRESS". '
+ 'Never infer completion from context — only report what was confirmed.\n'
'Return plain text only. Do not include tool calls or JSON.'
)
if settings.summary_max_chars:
@@ -298,7 +314,6 @@ class MessageManager:
self.state.read_state_images = [] # Clear images from previous step
action_results = ''
- result_len = len(result)
read_state_idx = 0
for idx, action_result in enumerate(result):
@@ -470,6 +485,7 @@ class MessageManager:
include_attributes=self.include_attributes,
step_info=step_info,
page_filtered_actions=page_filtered_actions,
+ max_clickable_elements_length=self.max_clickable_elements_length,
sensitive_data=self.sensitive_data_description,
available_file_paths=available_file_paths,
screenshots=screenshots,
@@ -562,30 +578,14 @@ class MessageManager:
if not self.sensitive_data:
return value
- # Collect all sensitive values, immediately converting old format to new format
- sensitive_values: dict[str, str] = {}
-
- # Process all sensitive data entries
- for key_or_domain, content in self.sensitive_data.items():
- if isinstance(content, dict):
- # Already in new format: {domain: {key: value}}
- for key, val in content.items():
- if val: # Skip empty values
- sensitive_values[key] = val
- elif content: # Old format: {key: value} - convert to new format internally
- # We treat this as if it was {'http*://*': {key_or_domain: content}}
- sensitive_values[key_or_domain] = content
+ sensitive_values = collect_sensitive_data_values(self.sensitive_data)
# If there are no valid sensitive data entries, just return the original value
if not sensitive_values:
logger.warning('No valid entries found in sensitive_data dictionary')
return value
- # Replace all valid sensitive data values with their placeholder tags
- for key, val in sensitive_values.items():
- value = value.replace(val, f'{key} ')
-
- return value
+ return redact_sensitive_string(value, sensitive_values)
if isinstance(message.content, str):
message.content = replace_sensitive(message.content)
diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py
index 7ce050ac5..803b3547d 100644
--- a/browser_use/agent/prompts.py
+++ b/browser_use/agent/prompts.py
@@ -157,6 +157,7 @@ class AgentMessagePrompt:
'images': 0,
'interactive_elements': 0,
'total_elements': 0,
+ 'text_chars': 0,
}
if not self.browser_state.dom_state or not self.browser_state.dom_state._root:
@@ -203,6 +204,9 @@ class AgentMessagePrompt:
else:
stats['shadow_open'] += 1
+ elif original.node_type == NodeType.TEXT_NODE:
+ stats['text_chars'] += len(original.node_value.strip())
+
elif original.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
# Shadow DOM fragment - these are the actual shadow roots
# But don't double-count since we count them at the host level above
@@ -224,6 +228,9 @@ class AgentMessagePrompt:
stats_text = ''
if page_stats['total_elements'] < 10:
stats_text += 'Page appears empty (SPA not loaded?) - '
+ # Skeleton screen: many elements but almost no text = loading placeholders
+ elif page_stats['total_elements'] > 20 and page_stats['text_chars'] < page_stats['total_elements'] * 5:
+ stats_text += 'Page appears to show skeleton/placeholder content (still loading?) - '
stats_text += f'{page_stats["links"]} links, {page_stats["interactive_elements"]} interactive, '
stats_text += f'{page_stats["iframes"]} iframes'
if page_stats['shadow_open'] > 0 or page_stats['shadow_closed'] > 0:
@@ -252,14 +259,11 @@ class AgentMessagePrompt:
pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
has_content_above = pages_above > 0
has_content_below = pages_below > 0
- total_pages = pi.page_height / pi.viewport_height if pi.viewport_height > 0 else 0
- current_page_position = pi.scroll_y / max(pi.page_height - pi.viewport_height, 1)
page_info_text = ''
- page_info_text += f'{pages_above:.1f} above, '
- page_info_text += f'{pages_below:.1f} below '
-
+ page_info_text += f'{pages_above:.1f} pages above, {pages_below:.1f} pages below'
+ if pages_below > 0.2:
+ page_info_text += ' — scroll down to reveal more content'
page_info_text += ' \n'
- # , at {current_page_position:.0%} of page
if elements_text != '':
if not has_content_above:
elements_text = f'[Start of page]\n{elements_text}'
diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py
index 6bc7757df..9b2fff1f2 100644
--- a/browser_use/agent/service.py
+++ b/browser_use/agent/service.py
@@ -36,7 +36,7 @@ from pydantic import BaseModel, ValidationError
from uuid_extensions import uuid7str
from browser_use import Browser, BrowserProfile, BrowserSession
-from browser_use.agent.judge import construct_judge_messages, construct_simple_judge_messages
+from browser_use.agent.judge import construct_judge_messages
# Lazy import for gif to avoid heavy agent.views import at startup
# from browser_use.agent.gif import create_history_gif
@@ -59,7 +59,6 @@ from browser_use.agent.views import (
JudgementResult,
MessageCompactionSettings,
PlanItem,
- SimpleJudgeResult,
StepMetadata,
)
from browser_use.browser.events import _get_timeout
@@ -188,6 +187,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
file_system_path: str | None = None,
task_id: str | None = None,
calculate_cost: bool = False,
+ pricing_url: str | None = None,
display_files_in_done_text: bool = True,
include_tool_call_examples: bool = False,
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
@@ -204,7 +204,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
loop_detection_enabled: bool = True,
llm_screenshot_size: tuple[int, int] | None = None,
message_compaction: MessageCompactionSettings | bool | None = True,
+ max_clickable_elements_length: int = 40000,
_url_shortening_limit: int = 25,
+ enable_signal_handler: bool = True,
**kwargs,
):
# Validate llm_screenshot_size
@@ -409,16 +411,20 @@ class Agent(Generic[Context, AgentStructuredOutput]):
loop_detection_window=loop_detection_window,
loop_detection_enabled=loop_detection_enabled,
message_compaction=message_compaction,
+ max_clickable_elements_length=max_clickable_elements_length,
)
# Token cost service
- self.token_cost_service = TokenCost(include_cost=calculate_cost)
+ self.token_cost_service = TokenCost(include_cost=calculate_cost, pricing_url=pricing_url)
self.token_cost_service.register_llm(llm)
self.token_cost_service.register_llm(page_extraction_llm)
self.token_cost_service.register_llm(judge_llm)
if self.settings.message_compaction and self.settings.message_compaction.compaction_llm:
self.token_cost_service.register_llm(self.settings.message_compaction.compaction_llm)
+ # Store signal handler setting (not part of AgentSettings as it's runtime behavior)
+ self.enable_signal_handler = enable_signal_handler
+
# Initialize state
self.state = injected_agent_state or AgentState()
@@ -514,6 +520,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
include_recent_events=self.include_recent_events,
sample_images=self.sample_images,
llm_screenshot_size=llm_screenshot_size,
+ max_clickable_elements_length=self.settings.max_clickable_elements_length,
)
if self.sensitive_data:
@@ -1022,9 +1029,35 @@ class Agent(Generic[Context, AgentStructuredOutput]):
browser_state_summary = None
try:
+ if self.browser_session:
+ try:
+ captcha_wait = await self.browser_session.wait_if_captcha_solving()
+ if captcha_wait and captcha_wait.waited:
+ # Reset step timing to exclude the captcha wait from step duration metrics
+ self.step_start_time = time.time()
+ duration_s = captcha_wait.duration_ms / 1000
+ outcome = captcha_wait.result # 'success' | 'failed' | 'timeout'
+ msg = f'Waited {duration_s:.1f}s for {captcha_wait.vendor} CAPTCHA to be solved. Result: {outcome}.'
+ self.logger.info(f'🔒 {msg}')
+ # Inject the outcome so the LLM sees what happened
+ captcha_result = ActionResult(long_term_memory=msg)
+ if self.state.last_result:
+ self.state.last_result.append(captcha_result)
+ else:
+ self.state.last_result = [captcha_result]
+ except Exception as e:
+ self.logger.warning(f'Phase 0 captcha wait failed (non-fatal): {e}')
+
# Phase 1: Prepare context and timing
browser_state_summary = await self._prepare_context(step_info)
+ # Clear previous step state after context preparation (which needs
+ # them for the "previous action result" prompt) but before the LLM
+ # call, so a timeout during _get_next_action or _execute_actions
+ # won't leave stale data from the previous step.
+ self.state.last_model_output = None
+ self.state.last_result = None
+
# Phase 2: Get model output and execute actions
await self._get_next_action(browser_state_summary)
await self._execute_actions()
@@ -1220,12 +1253,31 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.warning(f'{error_msg}')
return
- # Handle browser closed/disconnected errors - stop immediately instead of retrying
- if self._is_browser_closed_error(error):
- self.logger.warning(f'🛑 Browser closed or disconnected: {error}')
- self.state.stopped = True
- self._external_pause_event.set()
- return
+ # Handle browser closed/disconnected errors
+ if self._is_connection_like_error(error):
+ # If reconnection is in progress, wait for it instead of stopping
+ if self.browser_session.is_reconnecting:
+ wait_timeout = self.browser_session.RECONNECT_WAIT_TIMEOUT
+ self.logger.warning(
+ f'🔄 Connection error during reconnection, waiting up to {wait_timeout}s for reconnect: {error}'
+ )
+ try:
+ await asyncio.wait_for(self.browser_session._reconnect_event.wait(), timeout=wait_timeout)
+ except TimeoutError:
+ pass
+
+ # Check if reconnection succeeded
+ if self.browser_session.is_cdp_connected:
+ self.logger.info('🔄 Reconnection succeeded, retrying step...')
+ self.state.last_result = [ActionResult(error=f'Connection lost and recovered: {error}')]
+ return
+
+ # Not reconnecting or reconnection failed — check if truly terminal
+ if self._is_browser_closed_error(error):
+ self.logger.warning(f'🛑 Browser closed or disconnected: {error}')
+ self.state.stopped = True
+ self._external_pause_event.set()
+ return
# Handle all other exceptions
include_trace = self.logger.isEnabledFor(logging.DEBUG)
@@ -1249,14 +1301,35 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.state.last_result = [ActionResult(error=error_msg)]
return None
+ def _is_connection_like_error(self, error: Exception) -> bool:
+ """Check if the error looks like a CDP/WebSocket connection failure.
+
+ Unlike _is_browser_closed_error(), this does NOT check if the CDP client is None
+ or if reconnection is in progress — it purely looks at the error signature.
+ """
+ error_str = str(error).lower()
+ return (
+ isinstance(error, ConnectionError)
+ or 'websocket connection closed' in error_str
+ or 'connection closed' in error_str
+ or 'browser has been closed' in error_str
+ or 'browser closed' in error_str
+ or 'no browser' in error_str
+ )
+
def _is_browser_closed_error(self, error: Exception) -> bool:
"""Check if the browser has been closed or disconnected.
Only returns True when the error itself is a CDP/WebSocket connection failure
- AND the CDP client is gone. Avoids false positives on unrelated errors
- (element not found, timeouts, parse errors) that happen to coincide with
- a transient None state during reconnects or resets.
+ AND the CDP client is gone AND we're not actively reconnecting.
+ Avoids false positives on unrelated errors (element not found, timeouts,
+ parse errors) that happen to coincide with a transient None state during
+ reconnects or resets.
"""
+ # During reconnection, don't treat connection errors as terminal
+ if self.browser_session.is_reconnecting:
+ return False
+
error_str = str(error).lower()
is_connection_error = (
isinstance(error, ConnectionError)
@@ -1504,46 +1577,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self._message_manager._add_context_message(UserMessage(content=msg))
self.AgentOutput = self.DoneAgentOutput
- async def _run_simple_judge(self) -> None:
- """Lightweight always-on judge that overrides agent success when it overclaims.
-
- Runs regardless of use_judge setting. Only checks tasks where the agent
- claimed success — if the agent already reports failure, there's nothing to correct.
- """
- last_result = self.history.history[-1].result[-1]
- if not last_result.is_done or not last_result.success:
- return
-
- task = self.task
- final_result = self.history.final_result() or ''
-
- messages = construct_simple_judge_messages(
- task=task,
- final_result=final_result,
- )
-
- try:
- response = await self.llm.ainvoke(messages, output_format=SimpleJudgeResult)
- result: SimpleJudgeResult = response.completion # type: ignore[assignment]
- if not result.is_correct:
- reason = result.reason or 'Task requirements not fully met'
- self.logger.info(f'⚠️ Simple judge overriding success to failure: {reason}')
- last_result.success = False
- note = f'[Simple judge: {reason}]'
- # When structured output is expected, don't append judge text to extracted_content
- # as it would corrupt the JSON and break end-user parsers
- if self.output_model_schema is not None:
- if last_result.metadata is None:
- last_result.metadata = {}
- last_result.metadata['simple_judge'] = note
- elif last_result.extracted_content:
- last_result.extracted_content += f'\n\n{note}'
- else:
- last_result.extracted_content = note
- except Exception as e:
- self.logger.warning(f'Simple judge failed with error: {e}')
- # Don't override on error — keep the agent's self-report
-
@observe(ignore_input=True, ignore_output=False)
async def _judge_trace(self) -> JudgementResult | None:
"""Judge the trace of the agent"""
@@ -1614,8 +1647,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if judgement.failure_reason:
judge_log += f' Failure Reason: {judgement.failure_reason}\n'
if judgement.reached_captcha:
- judge_log += ' 🤖 Captcha Detected: Agent encountered captcha challenges\n'
- judge_log += ' 👉 🥷 Use Browser Use Cloud for the most stealth browser infra: https://docs.browser-use.com/customize/browser/remote\n'
+ self.logger.warning(
+ 'Agent was blocked by a captcha. Cloud browsers include stealth fingerprinting and proxy rotation to avoid this.\n'
+ ' Try: Browser(use_cloud=True) | Get an API key: https://cloud.browser-use.com?utm_source=oss&utm_medium=captcha_nudge'
+ )
judge_log += f' {judgement.reasoning}\n'
self.logger.info(judge_log)
@@ -2023,8 +2058,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if not (self.logger.isEnabledFor(logging.DEBUG) and parsed.action):
return
- action_count = len(parsed.action)
-
# Collect action details
action_details = []
for i, action in enumerate(parsed.action):
@@ -2129,11 +2162,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
has_captcha_issue = any(keyword in final_result_str for keyword in captcha_keywords)
if has_captcha_issue:
- # Suggest use_cloud=True for captcha/cloudflare issues
- task_preview = self.task[:10] if len(self.task) > 10 else self.task
- self.logger.info('')
- self.logger.info('Failed because of CAPTCHA? For better browser stealth, try:')
- self.logger.info(f' agent = Agent(task="{task_preview}...", browser=Browser(use_cloud=True))')
+ self.logger.warning(
+ 'Agent was blocked by a captcha. Cloud browsers include stealth fingerprinting and proxy rotation to avoid this.\n'
+ ' Try: Browser(use_cloud=True) | Get an API key: https://cloud.browser-use.com?utm_source=oss&utm_medium=captcha_nudge'
+ )
# General failure message
self.logger.info('')
@@ -2225,9 +2257,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
await self.step(step_info)
if self.history.is_done():
- # Always run simple judge to align agent success with reality
- await self._run_simple_judge()
-
await self.log_completion()
# Run full judge before done callback if enabled
@@ -2424,14 +2453,15 @@ class Agent(Generic[Context, AgentStructuredOutput]):
await self._demo_mode_log(error_msg, 'error', {'step': step + 1})
self.state.consecutive_failures += 1
self.state.last_result = [ActionResult(error=error_msg)]
+ # Ensure step counter advances on timeout — _finalize() may have
+ # been skipped or returned early due to the cancellation.
+ if self.state.n_steps == step + 1:
+ self.state.n_steps += 1
if on_step_end is not None:
await on_step_end(self)
if self.history.is_done():
- # Always run simple judge to align agent success with reality
- await self._run_simple_judge()
-
await self.log_completion()
# Run full judge before done callback if enabled
@@ -2480,6 +2510,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
resume_callback=self.resume,
custom_exit_callback=on_force_exit_log_telemetry, # Pass the new telemetrycallback
exit_on_second_int=True,
+ disabled=not self.enable_signal_handler,
)
signal_handler.register()
@@ -2672,7 +2703,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
to pre-action values. Any change aborts the remaining queue.
"""
results: list[ActionResult] = []
- time_elapsed = 0
total_actions = len(actions)
assert self.browser_session is not None, 'BrowserSession is not set up'
@@ -2682,19 +2712,20 @@ class Agent(Generic[Context, AgentStructuredOutput]):
and self.browser_session._cached_browser_state_summary.dom_state is not None
):
cached_selector_map = dict(self.browser_session._cached_browser_state_summary.dom_state.selector_map)
- cached_element_hashes = {e.parent_branch_hash() for e in cached_selector_map.values()}
else:
cached_selector_map = {}
- cached_element_hashes = set()
except Exception as e:
self.logger.error(f'Error getting cached selector map: {e}')
cached_selector_map = {}
- cached_element_hashes = set()
for i, action in enumerate(actions):
+ # Get action name from the action model BEFORE try block to ensure it's always available in except
+ action_data = action.model_dump(exclude_unset=True)
+ action_name = next(iter(action_data.keys())) if action_data else 'unknown'
+
if i > 0:
# ONLY ALLOW TO CALL `done` IF IT IS A SINGLE ACTION
- if action.model_dump(exclude_unset=True).get('done') is not None:
+ if action_data.get('done') is not None:
msg = f'Done action is allowed only as a single action - stopped after action {i} / {total_actions}.'
self.logger.debug(msg)
break
@@ -2706,9 +2737,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
try:
await self._check_stop_or_pause()
- # Get action name from the action model
- action_data = action.model_dump(exclude_unset=True)
- action_name = next(iter(action_data.keys())) if action_data else 'unknown'
# Log action before execution
await self._log_action(action, action_name, i + 1, total_actions)
@@ -2717,8 +2745,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
pre_action_url = await self.browser_session.get_current_page_url()
pre_action_focus = self.browser_session.agent_focus_target_id
- time_start = time.time()
-
result = await self.tools.act(
action=action,
browser_session=self.browser_session,
@@ -2729,9 +2755,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
extraction_schema=self.extraction_schema,
)
- time_end = time.time()
- time_elapsed = time_end - time_start
-
if result.error:
await self._demo_mode_log(
f'Action "{action_name}" failed: {result.error}',
@@ -3429,7 +3452,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
hist_node = historical_elem.node_name.lower() if historical_elem else ''
similar_elements = []
if historical_elem and historical_elem.attributes:
- hist_aria = historical_elem.attributes.get('aria-label', '')
for idx, elem in selector_map.items():
if elem.node_name.lower() == hist_node and elem.attributes:
elem_aria = elem.attributes.get('aria-label', '')
@@ -3911,6 +3933,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Kill the browser session - this dispatches BrowserStopEvent,
# stops the EventBus with clear=True, and recreates a fresh EventBus
await self.browser_session.kill()
+ else:
+ # keep_alive=True sessions shouldn't keep the event loop alive after agent.run()
+ await self.browser_session.event_bus.stop(
+ clear=False,
+ timeout=_get_timeout('TIMEOUT_BrowserSessionEventBusStopOnAgentClose', 1.0),
+ )
+ try:
+ self.browser_session.event_bus.event_queue = None
+ self.browser_session.event_bus._on_idle = None
+ except Exception:
+ pass
# Close skill service if configured
if self.skill_service is not None:
diff --git a/browser_use/agent/system_prompts/system_prompt.md b/browser_use/agent/system_prompts/system_prompt.md
index 9af905048..82cb2ca32 100644
--- a/browser_use/agent/system_prompts/system_prompt.md
+++ b/browser_use/agent/system_prompts/system_prompt.md
@@ -40,18 +40,25 @@ USER REQUEST: This is your ultimate objective and always remains visible.
1. Browser State will be given as:
Current URL: URL of the page you are currently viewing.
Open Tabs: Open tabs with their ids.
-Interactive Elements: All interactive elements will be provided in format as [index]text where
-- index: Numeric identifier for interaction
-- type: HTML element type (button, input, etc.)
-- text: Element description
+Interactive Elements: All interactive elements will be provided in a tree-style XML format:
+- Format: `[index] ` for interactive elements
+- Text content appears as child nodes on separate lines (not inside tags)
+- Indentation with tabs shows parent/child relationships
Examples:
-[33]User form
-\t*[35]Submit
+[33]
+ User form
+ [35]
+ *[38]
+ Submit
+[40]
+ About us
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input you might need to select the right option from the list.
-- Pure text elements without [] are not interactive.
+- Pure text elements without [] are not interactive
+- `|SCROLL|` prefix indicates scrollable containers with scroll position info
+- `|SHADOW(open)|` or `|SHADOW(closed)|` prefix indicates shadow DOM elements
If you used screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress.
@@ -65,14 +72,14 @@ Strictly follow these rules while using the browser and navigating the web:
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed.
-- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack). Do not spend more than 3-4 steps on a single captcha - if blocked, try alternative approaches or report the limitation.
+- CAPTCHAs are automatically solved by the browser. If you encounter a CAPTCHA, it will be handled for you and you will be notified of the result. Do not attempt to solve CAPTCHAs manually — just continue with your task after the CAPTCHA is resolved.
- If the page is not fully loaded, use the wait action.
- You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract only if the information you are looking for is not visible in your otherwise always just use the needed text from the .
- Calling the extract tool is expensive! DO NOT query the same page with the same extract query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- Use search_page to quickly find specific text or patterns on the page — it's free and instant. Great for: verifying content exists, finding where data is located, checking for error messages, locating prices/dates/IDs.
- Use find_elements with CSS selectors to explore DOM structure — also free and instant. Great for: counting items (e.g. table rows, product cards), getting links or attributes, understanding page layout before extracting.
-- Prefer search_page and find_elements over scrolling when looking for specific content not visible in browser_state.
+- Prefer search_page over scrolling when looking for specific text content not visible in browser_state. Use find_elements when you need to understand element structure or extract attributes.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step.
- If the includes specific page information such as product type, rating, price, location, etc., ALWAYS look for filter/sort options FIRST before browsing results. Apply all relevant filters before scrolling through results.
@@ -84,7 +91,7 @@ Strictly follow these rules while using the browser and navigating the web:
1. Very specific step by step instructions:
- Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
-- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search.
+- If you get stuck e.g. with logins in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes a login prompt accidentally pops up even though some part of the page is accessible, or you can get some information via web search. CAPTCHAs are handled automatically.
- If you reach a PDF viewer, the file is automatically downloaded and you can see its path in . You can either read the file or scroll in the page to see more.
- Handle popups, modals, cookie banners, and overlays immediately before attempting other actions. Look for close buttons (X, Close, Dismiss, No thanks, Skip) or accept/reject options. If a popup blocks interaction with the main page, handle it first.
- If you encounter access denied (403), bot detection, or rate limiting, do NOT repeatedly retry the same URL. Try alternative approaches or report the limitation.
@@ -138,9 +145,9 @@ BEFORE calling `done` with `success=true`, you MUST perform this verification:
3. **Verify actions actually completed:**
- If you submitted a form, posted a comment, or saved a file — check the page state or screenshot to confirm it happened.
- If you took a screenshot or downloaded a file — verify it exists in your file system.
-4. **Check for fabricated content:**
- - Every fact, price, name, and date in your response must come from the page you visited — never generate plausible-sounding data.
-5. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
+4. **Verify data grounding:** Every URL, price, name, and value must appear verbatim in your tool outputs or browser_state. Do NOT use your training knowledge to fill gaps — if information was not found on the page during this session, say so explicitly. Never fabricate or invent values.
+5. **Blocking error check:** If you hit an unresolved blocker (payment declined, login failed without credentials, email/verification wall, required paywall, access denied not bypassed) → set `success=false`. Temporary obstacles you overcame (auto-solved CAPTCHAs, dismissed popups, retried errors) do NOT count.
+6. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
Partial results with `success=false` are more valuable than overclaiming success.
@@ -154,9 +161,11 @@ Check the browser state each step to verify your previous action achieved its go
You can output multiple actions in one step. Try to be efficient where it makes sense. Do not predict actions which do not make sense for the current page.
**Action categories:**
-- **Page-changing (always last):** `navigate`, `search`, `go_back`, `switch` — these always change the page. Remaining actions after them are skipped automatically.
-- **Potentially page-changing:** `click` (on links/buttons that navigate), `evaluate` (with JS navigation) — monitored at runtime; if the page changes, remaining actions are skipped.
-- **Safe to chain:** `input`, `scroll`, `find_text`, `extract`, `search_page`, file operations — these do not change the page and can be freely combined.
+- **Page-changing (always last):** `navigate`, `search`, `go_back`, `switch`, `evaluate` — these always change the page. Remaining actions after them are skipped automatically. Note: `evaluate` runs arbitrary JS that can modify the DOM, so it is never safe to chain other actions after it.
+- **Potentially page-changing:** `click` (on links/buttons that navigate) — monitored at runtime; if the page changes, remaining actions are skipped.
+- **Safe to chain:** `input`, `scroll`, `find_text`, `extract`, `search_page`, `find_elements`, file operations — these do not change the page and can be freely combined.
+
+**Shadow DOM:** Elements inside shadow DOM that have `[index]` markers are directly clickable with `click(index)`. Do NOT use `evaluate` to click them.
**Recommended combinations:**
- `input` + `input` + `input` + `click` → Fill multiple form fields then submit
@@ -239,7 +248,7 @@ Action list should NEVER be empty.
3. ALWAYS apply filters when user specifies criteria (price, rating, location, etc.)
4. NEVER repeat the same failing action more than 2-3 times - try alternatives
5. NEVER assume success - always verify from screenshot or browser state
-6. If blocked by captcha/login/403, try alternative approaches rather than retrying
+6. CAPTCHAs are solved automatically. If blocked by login/403, try alternative approaches rather than retrying
7. Put ALL relevant findings in done action's text field
8. Match user's requested output format exactly
9. Track progress in memory to avoid loops
@@ -253,7 +262,7 @@ When encountering errors or unexpected states:
2. Check if a popup, modal, or overlay is blocking interaction
3. If an element is not found, scroll to reveal more content
4. If an action fails repeatedly (2-3 times), try an alternative approach
-5. If blocked by login/captcha/403, consider alternative sites or search engines
+5. If blocked by login/403, consider alternative sites or search engines. CAPTCHAs are solved automatically.
6. If the page structure is different than expected, re-analyze and adapt
7. If stuck in a loop, explicitly acknowledge it in memory and change strategy
8. If max_steps is approaching, prioritize completing the most important parts of the task
diff --git a/browser_use/agent/system_prompts/system_prompt_anthropic_flash.md b/browser_use/agent/system_prompts/system_prompt_anthropic_flash.md
index 524b006be..05a34b8e7 100644
--- a/browser_use/agent/system_prompts/system_prompt_anthropic_flash.md
+++ b/browser_use/agent/system_prompts/system_prompt_anthropic_flash.md
@@ -31,7 +31,7 @@ Strictly follow these rules while using the browser and navigating the web:
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Scroll to see more elements if needed.
-- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack). Do not spend more than 3-4 steps on a single captcha - if blocked, try alternative approaches or report the limitation.
+- CAPTCHAs are automatically solved by the browser. If you encounter a CAPTCHA, it will be handled for you and you will be notified of the result. Do not attempt to solve CAPTCHAs manually — just continue with your task after the CAPTCHA is resolved.
- If the page is not fully loaded, use the wait action to allow content to render.
- You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract only if the information you are looking for is not visible in your otherwise always just use the needed text from the .
@@ -46,7 +46,7 @@ Strictly follow these rules while using the browser and navigating the web:
- There are 2 types of tasks:
1. Very specific step by step instructions: Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
-- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search.
+- If you get stuck e.g. with logins in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search. CAPTCHAs are handled automatically.
- If you reach a PDF viewer, the file is automatically downloaded and you can see its path in . You can either read the file or scroll in the page to see more.
- Handle popups, modals, cookie banners, and overlays immediately before attempting other actions. Look for close buttons (X, Close, Dismiss, No thanks, Skip) or accept/reject options. If a popup blocks interaction with the main page, handle it first. Many websites show cookie consent dialogs, newsletter popups, or promotional overlays that must be dismissed.
- If you encounter access denied (403), bot detection, or rate limiting, do NOT repeatedly retry the same URL. Try alternative approaches or report the limitation. Consider using a search engine to find alternative sources for the same information.
@@ -93,9 +93,9 @@ BEFORE calling `done` with `success=true`, you MUST perform this verification:
3. **Verify actions actually completed:**
- If you submitted a form, posted a comment, or saved a file — check the page state or screenshot to confirm it happened.
- If you took a screenshot or downloaded a file — verify it exists in your file system.
-4. **Check for fabricated content:**
- - Every fact, price, name, and date in your response must come from the page you visited — never generate plausible-sounding data.
-5. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
+4. **Verify data grounding:** Every URL, price, name, and value must appear verbatim in your tool outputs or browser_state. Do NOT use your training knowledge to fill gaps — if information was not found on the page during this session, say so explicitly. Never fabricate or invent values.
+5. **Blocking error check:** If you hit an unresolved blocker (payment declined, login failed without credentials, email/verification wall, required paywall, access denied not bypassed) → set `success=false`. Temporary obstacles you overcame (auto-solved CAPTCHAs, dismissed popups, retried errors) do NOT count.
+6. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
Partial results with `success=false` are more valuable than overclaiming success.
@@ -166,7 +166,7 @@ Always put `memory` field before the `action` field.
Your memory field should include your reasoning. Apply these patterns:
- Did the previous action succeed? Verify using screenshot as ground truth.
- What is the current state relative to the user request?
-- Are there any obstacles (popups, captcha, login walls)?
+- Are there any obstacles (popups, login walls)? CAPTCHAs are solved automatically.
- What specific next step will make progress toward the goal?
- If stuck, what alternative approach should you try?
- What information should be remembered for later steps?
@@ -219,7 +219,7 @@ When encountering errors or unexpected states:
2. Check if a popup, modal, or overlay is blocking interaction
3. If an element is not found, scroll to reveal more content
4. If an action fails repeatedly (2-3 times), try an alternative approach
-5. If blocked by login/captcha/403, consider alternative sites or search engines
+5. If blocked by login/403, consider alternative sites or search engines. CAPTCHAs are solved automatically.
6. If the page structure is different than expected, re-analyze and adapt
7. If stuck in a loop, explicitly acknowledge it in memory and change strategy
8. If max_steps is approaching, prioritize completing the most important parts of the task
@@ -230,7 +230,7 @@ When encountering errors or unexpected states:
3. ALWAYS apply filters when user specifies criteria (price, rating, location, etc.)
4. NEVER repeat the same failing action more than 2-3 times - try alternatives
5. NEVER assume success - always verify from screenshot or browser state
-6. If blocked by captcha/login/403, try alternative approaches rather than retrying
+6. CAPTCHAs are solved automatically. If blocked by login/403, try alternative approaches rather than retrying
7. Put ALL relevant findings in done action's text field
8. Match user's requested output format exactly
9. Track progress in memory to avoid loops
diff --git a/browser_use/agent/system_prompts/system_prompt_browser_use.md b/browser_use/agent/system_prompts/system_prompt_browser_use.md
index 2971d9b10..3b5fe1d03 100644
--- a/browser_use/agent/system_prompts/system_prompt_browser_use.md
+++ b/browser_use/agent/system_prompts/system_prompt_browser_use.md
@@ -1,5 +1,9 @@
You are a browser-use agent operating in thinking mode. You automate browser tasks by outputting structured JSON actions.
+
+Instructions containing "do NOT", "never", "avoid", "skip", or "only X" are hard constraints. Before each action, check: does this violate any constraint? If yes, stop and find an alternative.
+
+
You must ALWAYS respond with a valid JSON in this exact format:
{{
@@ -10,4 +14,5 @@ You must ALWAYS respond with a valid JSON in this exact format:
"action": [{{"action_name": {{...params...}}}}]
}}
Action list should NEVER be empty.
+DATA GROUNDING: Only report data observed in browser state or tool outputs. Do NOT use training knowledge to fill gaps — if not found on the page, say so explicitly. Never fabricate values.
diff --git a/browser_use/agent/system_prompts/system_prompt_browser_use_flash.md b/browser_use/agent/system_prompts/system_prompt_browser_use_flash.md
index e4cb73ea8..435d77a85 100644
--- a/browser_use/agent/system_prompts/system_prompt_browser_use_flash.md
+++ b/browser_use/agent/system_prompts/system_prompt_browser_use_flash.md
@@ -1,5 +1,9 @@
You are a browser-use agent operating in flash mode. You automate browser tasks by outputting structured JSON actions.
+
+Instructions containing "do NOT", "never", "avoid", "skip", or "only X" are hard constraints. Before each action, check: does this violate any constraint? If yes, stop and find an alternative.
+
+
You must respond with a valid JSON in this exact format:
{{
@@ -7,4 +11,5 @@ You must respond with a valid JSON in this exact format:
"action": [{{"action_name": {{...params...}}}}]
}}
Action list should NEVER be empty.
+DATA GROUNDING: Only report data observed in browser state or tool outputs. Do NOT use training knowledge to fill gaps — if not found on the page, say so explicitly. Never fabricate values.
diff --git a/browser_use/agent/system_prompts/system_prompt_browser_use_no_thinking.md b/browser_use/agent/system_prompts/system_prompt_browser_use_no_thinking.md
index 6d1936e78..e33b4f978 100644
--- a/browser_use/agent/system_prompts/system_prompt_browser_use_no_thinking.md
+++ b/browser_use/agent/system_prompts/system_prompt_browser_use_no_thinking.md
@@ -1,5 +1,9 @@
You are a browser-use agent. You automate browser tasks by outputting structured JSON actions.
+
+Instructions containing "do NOT", "never", "avoid", "skip", or "only X" are hard constraints. Before each action, check: does this violate any constraint? If yes, stop and find an alternative.
+
+
You must ALWAYS respond with a valid JSON in this exact format:
{{
@@ -9,4 +13,5 @@ You must ALWAYS respond with a valid JSON in this exact format:
"action": [{{"action_name": {{...params...}}}}]
}}
Action list should NEVER be empty.
+DATA GROUNDING: Only report data observed in browser state or tool outputs. Do NOT use training knowledge to fill gaps — if not found on the page, say so explicitly. Never fabricate values.
diff --git a/browser_use/agent/system_prompts/system_prompt_flash.md b/browser_use/agent/system_prompts/system_prompt_flash.md
index 4ef82501c..a254a49a5 100644
--- a/browser_use/agent/system_prompts/system_prompt_flash.md
+++ b/browser_use/agent/system_prompts/system_prompt_flash.md
@@ -12,4 +12,5 @@ You are allowed to use a maximum of {max_actions} actions per step. Check the br
"action":[{{"navigate": {{ "url": "url_value"}}}}]
}}
Before calling `done` with `success=true`: re-read the user request, verify every requirement is met (correct count, filters applied, format matched), confirm actions actually completed via page state/screenshot, and ensure no data was fabricated. If anything is unmet or uncertain, set `success` to `false`.
+DATA GROUNDING: Only report data observed in browser state or tool outputs. Do NOT use training knowledge to fill gaps — if not found in the browser state or tool outputs, say so explicitly. Never fabricate values.
diff --git a/browser_use/agent/system_prompts/system_prompt_flash_anthropic.md b/browser_use/agent/system_prompts/system_prompt_flash_anthropic.md
index fa0291b12..c210b16fc 100644
--- a/browser_use/agent/system_prompts/system_prompt_flash_anthropic.md
+++ b/browser_use/agent/system_prompts/system_prompt_flash_anthropic.md
@@ -27,4 +27,5 @@ You are allowed to use a maximum of {max_actions} actions per step. Check the br
Always put `memory` field before the `action` field.
Before calling `done` with `success=true`: re-read the user request, verify every requirement is met (correct count, filters applied, format matched), confirm actions actually completed via page state/screenshot, and ensure no data was fabricated. If anything is unmet or uncertain, set `success` to `false`.
+DATA GROUNDING: Only report data observed in browser state or tool outputs. Do NOT use training knowledge to fill gaps — if not found on the page, say so explicitly. Never fabricate values.
diff --git a/browser_use/agent/system_prompts/system_prompt_no_thinking.md b/browser_use/agent/system_prompts/system_prompt_no_thinking.md
index 21eee5e49..f61b4168a 100644
--- a/browser_use/agent/system_prompts/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompts/system_prompt_no_thinking.md
@@ -65,7 +65,7 @@ Strictly follow these rules while using the browser and navigating the web:
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed.
-- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack). Do not spend more than 3-4 steps on a single captcha - if blocked, try alternative approaches or report the limitation.
+- CAPTCHAs are automatically solved by the browser. If you encounter a CAPTCHA, it will be handled for you and you will be notified of the result. Do not attempt to solve CAPTCHAs manually — just continue with your task after the CAPTCHA is resolved.
- If the page is not fully loaded, use the wait action.
- You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract only if the information you are looking for is not visible in your otherwise always just use the needed text from the .
@@ -81,7 +81,7 @@ Strictly follow these rules while using the browser and navigating the web:
1. Very specific step by step instructions:
- Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
-- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search.
+- If you get stuck e.g. with logins in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search. CAPTCHAs are handled automatically.
- If you reach a PDF viewer, the file is automatically downloaded and you can see its path in . You can either read the file or scroll in the page to see more.
- Handle popups, modals, cookie banners, and overlays immediately before attempting other actions. Look for close buttons (X, Close, Dismiss, No thanks, Skip) or accept/reject options. If a popup blocks interaction with the main page, handle it first.
- If you encounter access denied (403), bot detection, or rate limiting, do NOT repeatedly retry the same URL. Try alternative approaches or report the limitation.
@@ -130,9 +130,9 @@ BEFORE calling `done` with `success=true`, you MUST perform this verification:
3. **Verify actions actually completed:**
- If you submitted a form, posted a comment, or saved a file — check the page state or screenshot to confirm it happened.
- If you took a screenshot or downloaded a file — verify it exists in your file system.
-4. **Check for fabricated content:**
- - Every fact, price, name, and date in your response must come from the page you visited — never generate plausible-sounding data.
-5. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
+4. **Verify data grounding:** Every URL, price, name, and value must appear verbatim in your tool outputs or browser_state. Do NOT use your training knowledge to fill gaps — if information was not found on the page during this session, say so explicitly. Never fabricate or invent values.
+5. **Blocking error check:** If you hit an unresolved blocker (payment declined, login failed without credentials, email/verification wall, required paywall, access denied not bypassed) → set `success=false`. Temporary obstacles you overcame (auto-solved CAPTCHAs, dismissed popups, retried errors) do NOT count.
+6. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
Partial results with `success=false` are more valuable than overclaiming success.
@@ -224,7 +224,7 @@ Action list should NEVER be empty.
3. ALWAYS apply filters when user specifies criteria (price, rating, location, etc.)
4. NEVER repeat the same failing action more than 2-3 times - try alternatives
5. NEVER assume success - always verify from screenshot or browser state
-6. If blocked by captcha/login/403, try alternative approaches rather than retrying
+6. CAPTCHAs are solved automatically. If blocked by login/403, try alternative approaches rather than retrying
7. Put ALL relevant findings in done action's text field
8. Match user's requested output format exactly
9. Track progress in memory to avoid loops
@@ -238,7 +238,7 @@ When encountering errors or unexpected states:
2. Check if a popup, modal, or overlay is blocking interaction
3. If an element is not found, scroll to reveal more content
4. If an action fails repeatedly (2-3 times), try an alternative approach
-5. If blocked by login/captcha/403, consider alternative sites or search engines
+5. If blocked by login/403, consider alternative sites or search engines. CAPTCHAs are solved automatically.
6. If the page structure is different than expected, re-analyze and adapt
7. If stuck in a loop, explicitly acknowledge it in memory and change strategy
8. If max_steps is approaching, prioritize completing the most important parts of the task
diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py
index b73afc5e1..dbec9a534 100644
--- a/browser_use/agent/views.py
+++ b/browser_use/agent/views.py
@@ -27,6 +27,7 @@ from browser_use.filesystem.file_system import FileSystemState
from browser_use.llm.base import BaseChatModel
from browser_use.tokens.views import UsageSummary
from browser_use.tools.registry.views import ActionModel
+from browser_use.utils import collect_sensitive_data_values, redact_sensitive_string
logger = logging.getLogger(__name__)
@@ -35,7 +36,7 @@ class MessageCompactionSettings(BaseModel):
"""Summarizes older history into a compact memory block to reduce prompt size."""
enabled: bool = True
- compact_every_n_steps: int = 15
+ compact_every_n_steps: int = 25
trigger_char_count: int | None = None # Min char floor; set via trigger_token_count if preferred
trigger_token_count: int | None = None # Alternative to trigger_char_count (~4 chars/token)
chars_per_token: float = 4.0
@@ -88,6 +89,7 @@ class AgentSettings(BaseModel):
# Loop detection settings
loop_detection_window: int = 20 # Rolling window size for action similarity tracking
loop_detection_enabled: bool = True # Whether to enable loop detection nudges
+ max_clickable_elements_length: int = 40000 # Max characters for clickable elements in prompt
class PageFingerprint(BaseModel):
@@ -302,13 +304,6 @@ class JudgementResult(BaseModel):
)
-class SimpleJudgeResult(BaseModel):
- """Result of lightweight always-on judge that validates agent success claims."""
-
- is_correct: bool = Field(description='True if the agent response genuinely satisfies the task requirements')
- reason: str = Field(default='', description='Brief explanation if not correct')
-
-
class ActionResult(BaseModel):
"""Result of executing an action"""
@@ -518,29 +513,13 @@ class AgentHistory(BaseModel):
if not sensitive_data:
return value
- # Collect all sensitive values, immediately converting old format to new format
- sensitive_values: dict[str, str] = {}
-
- # Process all sensitive data entries
- for key_or_domain, content in sensitive_data.items():
- if isinstance(content, dict):
- # Already in new format: {domain: {key: value}}
- for key, val in content.items():
- if val: # Skip empty values
- sensitive_values[key] = val
- elif content: # Old format: {key: value} - convert to new format internally
- # We treat this as if it was {'http*://*': {key_or_domain: content}}
- sensitive_values[key_or_domain] = content
+ sensitive_values = collect_sensitive_data_values(sensitive_data)
# If there are no valid sensitive data entries, just return the original value
if not sensitive_values:
return value
- # Replace all valid sensitive data values with their placeholder tags
- for key, val in sensitive_values.items():
- value = value.replace(val, f'{key} ')
-
- return value
+ return redact_sensitive_string(value, sensitive_values)
def _filter_sensitive_data_from_dict(
self, data: dict[str, Any], sensitive_data: dict[str, str | dict[str, str]] | None
@@ -651,7 +630,7 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
Path(filepath).parent.mkdir(parents=True, exist_ok=True)
data = self.model_dump(sensitive_data=sensitive_data)
with open(filepath, 'w', encoding='utf-8') as f:
- json.dump(data, f, indent=2)
+ json.dump(data, f, indent=2, ensure_ascii=False)
except Exception as e:
raise e
@@ -696,14 +675,18 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
@classmethod
def load_from_dict(cls, data: dict[str, Any], output_model: type[AgentOutput]) -> AgentHistoryList:
# loop through history and validate output_model actions to enrich with custom actions
- for h in data['history']:
- if h['model_output']:
- if isinstance(h['model_output'], dict):
- h['model_output'] = output_model.model_validate(h['model_output'])
+ for h in data.get('history', []):
+ # Use .get() to avoid KeyError on incomplete or legacy history entries
+ model_output = h.get('model_output')
+ if model_output:
+ if isinstance(model_output, dict):
+ h['model_output'] = output_model.model_validate(model_output)
else:
h['model_output'] = None
- if 'interacted_element' not in h['state']:
- h['state']['interacted_element'] = None
+ state = h.get('state') or {}
+ if 'interacted_element' not in state:
+ state['interacted_element'] = None
+ h['state'] = state
history = cls.model_validate(data)
return history
@@ -733,8 +716,10 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
def final_result(self) -> None | str:
"""Final result from history"""
- if self.history and self.history[-1].result[-1].extracted_content:
- return self.history[-1].result[-1].extracted_content
+ if self.history and len(self.history[-1].result) > 0:
+ last_result = self.history[-1].result[-1]
+ if last_result.extracted_content:
+ return last_result.extracted_content
return None
def is_done(self) -> bool:
diff --git a/browser_use/browser/cloud/cloud.py b/browser_use/browser/cloud/cloud.py
index 78f4eccf4..c9670394d 100644
--- a/browser_use/browser/cloud/cloud.py
+++ b/browser_use/browser/cloud/cloud.py
@@ -50,7 +50,8 @@ class CloudBrowserClient:
if not api_token:
raise CloudBrowserAuthError(
- 'No authentication token found. Please set BROWSER_USE_API_KEY environment variable to authenticate with the cloud service. You can also create an API key at https://cloud.browser-use.com/new-api-key'
+ 'BROWSER_USE_API_KEY is not set. To use cloud browsers, get a key at:\n'
+ 'https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=use_cloud'
)
headers = {'X-Browser-Use-API-Key': api_token, 'Content-Type': 'application/json', **(extra_headers or {})}
@@ -65,7 +66,8 @@ class CloudBrowserClient:
if response.status_code == 401:
raise CloudBrowserAuthError(
- 'Authentication failed. Please make sure you have set BROWSER_USE_API_KEY environment variable to authenticate with the cloud service. You can also create an API key at https://cloud.browser-use.com/new-api-key'
+ 'BROWSER_USE_API_KEY is invalid. Get a new key at:\n'
+ 'https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=use_cloud'
)
elif response.status_code == 403:
raise CloudBrowserAuthError('Access forbidden. Please check your browser-use cloud subscription status.')
@@ -137,7 +139,8 @@ class CloudBrowserClient:
if not api_token:
raise CloudBrowserAuthError(
- 'No authentication token found. Please set BROWSER_USE_API_KEY environment variable to authenticate with the cloud service. You can also create an API key at https://cloud.browser-use.com/new-api-key'
+ 'BROWSER_USE_API_KEY is not set. To use cloud browsers, get a key at:\n'
+ 'https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=use_cloud'
)
headers = {'X-Browser-Use-API-Key': api_token, 'Content-Type': 'application/json', **(extra_headers or {})}
@@ -192,7 +195,10 @@ class CloudBrowserClient:
raise CloudBrowserError(f'Unexpected error stopping cloud browser: {e}')
async def close(self):
- """Close the HTTP client and cleanup any active sessions."""
+ """Close the HTTP client and cleanup any active sessions.
+
+ Safe to call multiple times — subsequent calls are no-ops.
+ """
# Try to stop current session if active
if self.current_session_id:
try:
@@ -200,4 +206,5 @@ class CloudBrowserClient:
except Exception as e:
logger.debug(f'Failed to stop cloud browser session during cleanup: {e}')
- await self.client.aclose()
+ if not self.client.is_closed:
+ await self.client.aclose()
diff --git a/browser_use/browser/cloud/views.py b/browser_use/browser/cloud/views.py
index cb378dd2c..20459c369 100644
--- a/browser_use/browser/cloud/views.py
+++ b/browser_use/browser/cloud/views.py
@@ -59,6 +59,13 @@ class CreateBrowserRequest(BaseModel):
title='Cloud Timeout',
)
+ enable_recording: bool = Field(
+ default=False,
+ alias='enableRecording',
+ description='Enable session recording for playback in the cloud dashboard.',
+ title='Enable Recording',
+ )
+
CloudBrowserParams = CreateBrowserRequest # alias for easier readability
diff --git a/browser_use/browser/events.py b/browser_use/browser/events.py
index d7c0d9377..90aea0e9b 100644
--- a/browser_use/browser/events.py
+++ b/browser_use/browser/events.py
@@ -119,7 +119,7 @@ class NavigateToUrlEvent(BaseEvent[None]):
# existing_tab: PageHandle | None = None # TODO
# time limits enforced by bubus, not exposed to LLM:
- event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_NavigateToUrlEvent', 15.0)) # seconds
+ event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_NavigateToUrlEvent', 30.0)) # seconds
class ClickElementEvent(ElementSelectedEvent[dict[str, Any] | None]):
@@ -406,7 +406,7 @@ class TabClosedEvent(BaseEvent):
# new_focus_target_id: int | None = None
# new_focus_url: str | None = None
- event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_TabClosedEvent', 10.0)) # seconds
+ event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_TabClosedEvent', 3.0)) # seconds
# TODO: emit this when DOM changes significantly, inner frame navigates, form submits, history.pushState(), etc.
@@ -471,6 +471,26 @@ class BrowserErrorEvent(BaseEvent):
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_BrowserErrorEvent', 30.0)) # seconds
+class BrowserReconnectingEvent(BaseEvent):
+ """WebSocket reconnection attempt is starting."""
+
+ cdp_url: str
+ attempt: int
+ max_attempts: int
+
+ event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_BrowserReconnectingEvent', 30.0)) # seconds
+
+
+class BrowserReconnectedEvent(BaseEvent):
+ """WebSocket reconnection succeeded."""
+
+ cdp_url: str
+ attempt: int
+ downtime_seconds: float
+
+ event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_BrowserReconnectedEvent', 30.0)) # seconds
+
+
# ============================================================================
# Storage State Events
# ============================================================================
@@ -576,6 +596,42 @@ class DialogOpenedEvent(BaseEvent):
# target_id: TargetID # TODO: add this to avoid needing target_id_from_frame() later
+# ============================================================================
+# Captcha Solver Events
+# ============================================================================
+
+
+class CaptchaSolverStartedEvent(BaseEvent):
+ """Captcha solving started by the browser proxy.
+
+ Emitted when the browser proxy detects a CAPTCHA and begins solving it.
+ The agent should wait for a corresponding CaptchaSolverFinishedEvent before proceeding.
+ """
+
+ target_id: TargetID
+ vendor: str # e.g. 'cloudflare', 'recaptcha', 'hcaptcha', 'datadome', 'perimeterx', 'geetest'
+ url: str
+ started_at: int # Unix millis
+
+ event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_CaptchaSolverStartedEvent', 5.0))
+
+
+class CaptchaSolverFinishedEvent(BaseEvent):
+ """Captcha solving finished by the browser proxy.
+
+ Emitted when the browser proxy finishes solving a CAPTCHA (successfully or not).
+ """
+
+ target_id: TargetID
+ vendor: str
+ url: str
+ duration_ms: int
+ finished_at: int # Unix millis
+ success: bool # Whether the captcha was solved successfully
+
+ event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_CaptchaSolverFinishedEvent', 5.0))
+
+
# Note: Model rebuilding for forward references is handled in the importing modules
# Events with 'EnhancedDOMTreeNode' forward references (ClickElementEvent, TypeTextEvent,
# ScrollEvent, UploadFileEvent) need model_rebuild() called after imports are complete
diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py
index 7b506a783..634eb6778 100644
--- a/browser_use/browser/profile.py
+++ b/browser_use/browser/profile.py
@@ -124,7 +124,7 @@ CHROME_DEFAULT_ARGS = [
'--disable-back-forward-cache', # Avoids surprises like main request not being intercepted during page.goBack().
'--disable-breakpad',
'--disable-client-side-phishing-detection',
- '--disable-component-extensions-with-background-pages',
+ # '--disable-component-extensions-with-background-pages', # kills user-loaded extensions on Chrome 145+
'--disable-component-update', # Avoids unneeded network activity after startup.
'--no-default-browser-check',
# '--disable-default-apps',
@@ -150,7 +150,7 @@ CHROME_DEFAULT_ARGS = [
# added by us:
'--enable-features=NetworkService,NetworkServiceInProcess',
'--enable-network-information-downlink-max',
- '--test-type=gpu',
+ # '--test-type=gpu', # blocks unpacked extension loading on Chrome 145+
'--disable-sync',
'--allow-legacy-extension-manifests',
'--allow-pre-commit-input',
@@ -430,14 +430,14 @@ class BrowserLaunchArgs(BaseModel):
if self.downloads_path is None:
import uuid
- # Create unique directory in /tmp for downloads
+ # Create unique directory in system temp folder for downloads
unique_id = str(uuid.uuid4())[:8] # 8 characters
- downloads_path = Path(f'/tmp/browser-use-downloads-{unique_id}')
+ downloads_path = Path(tempfile.gettempdir()) / f'browser-use-downloads-{unique_id}'
# Ensure path doesn't already exist (extremely unlikely but possible)
while downloads_path.exists():
unique_id = str(uuid.uuid4())[:8]
- downloads_path = Path(f'/tmp/browser-use-downloads-{unique_id}')
+ downloads_path = Path(tempfile.gettempdir()) / f'browser-use-downloads-{unique_id}'
self.downloads_path = downloads_path
self.downloads_path.mkdir(parents=True, exist_ok=True)
@@ -602,6 +602,10 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
default_factory=_get_enable_default_extensions_default,
description="Enable automation-optimized extensions: ad blocking (uBlock Origin), cookie handling (I still don't care about cookies), and URL cleaning (ClearURLs). All extensions work automatically without manual intervention. Extensions are automatically downloaded and loaded when enabled. Can be disabled via BROWSER_USE_DISABLE_EXTENSIONS=1 environment variable.",
)
+ captcha_solver: bool = Field(
+ default=True,
+ description='Enable the captcha solver watchdog that listens for captcha events from the browser proxy. Automatically pauses agent steps while a CAPTCHA is being solved. Only active when the browser emits BrowserUse CDP events (e.g. Browser Use cloud browsers). Harmless when disabled or when events are not emitted.',
+ )
demo_mode: bool = Field(
default=False,
description='Enable demo mode side panel that streams agent logs directly inside the browser window (requires headless=False).',
@@ -933,6 +937,25 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
return args
+ @staticmethod
+ def _check_extension_manifest_version(ext_dir: Path, ext_name: str) -> bool:
+ """Check that an extension uses Manifest V3. Returns False for MV2 extensions (unsupported by Chrome 145+)."""
+ import json
+
+ manifest_path = ext_dir / 'manifest.json'
+ if not manifest_path.exists():
+ return False
+ try:
+ with open(manifest_path, encoding='utf-8') as f:
+ manifest = json.load(f)
+ mv = manifest.get('manifest_version', 2)
+ if mv < 3:
+ logger.warning(f'Skipping {ext_name} extension: Manifest V{mv} is no longer supported by Chrome')
+ return False
+ return True
+ except Exception:
+ return False
+
def _ensure_default_extensions_downloaded(self) -> list[str]:
"""
Ensure default extensions are downloaded and cached locally.
@@ -940,23 +963,18 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
"""
# Extension definitions - optimized for automation and content extraction
- # Combines uBlock Origin (ad blocking) + "I still don't care about cookies" (cookie banner handling)
+ # uBlock Origin Lite (ad blocking, MV3) + "I still don't care about cookies" (cookie banner handling)
extensions = [
{
- 'name': 'uBlock Origin',
- 'id': 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
- 'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=133&acceptformat=crx3&x=id%3Dcjpalhdlnbpafiamejdnhcphjbkeiagm%26uc',
+ 'name': 'uBlock Origin Lite',
+ 'id': 'ddkjiahejlhfcafbddmgiahcphecmpfh',
+ 'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=133&acceptformat=crx3&x=id%3Dddkjiahejlhfcafbddmgiahcphecmpfh%26uc',
},
{
'name': "I still don't care about cookies",
'id': 'edibdbjcniadpccecjdfdjjppcpchdlm',
'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=133&acceptformat=crx3&x=id%3Dedibdbjcniadpccecjdfdjjppcpchdlm%26uc',
},
- {
- 'name': 'ClearURLs',
- 'id': 'lckanjgmijmafbedllaakclkaicjfmnk',
- 'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=133&acceptformat=crx3&x=id%3Dlckanjgmijmafbedllaakclkaicjfmnk%26uc',
- },
{
'name': 'Force Background Tab',
'id': 'gidlfommnbibbmegmgajdbikelkdcmcl',
@@ -994,7 +1012,8 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
# Check if extension is already extracted
if ext_dir.exists() and (ext_dir / 'manifest.json').exists():
- # logger.debug(f'✅ Using cached {ext["name"]} extension from {_log_pretty_path(ext_dir)}')
+ if not self._check_extension_manifest_version(ext_dir, ext['name']):
+ continue
extension_paths.append(str(ext_dir))
loaded_extension_names.append(ext['name'])
continue
@@ -1011,6 +1030,9 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
logger.info(f'📂 Extracting {ext["name"]} extension...')
self._extract_extension(crx_file, ext_dir)
+ if not self._check_extension_manifest_version(ext_dir, ext['name']):
+ continue
+
extension_paths.append(str(ext_dir))
loaded_extension_names.append(ext['name'])
@@ -1149,7 +1171,6 @@ async function initialize(checkInitialized, magic) {{
zip_data = f.read()
# Write ZIP data to temp file and extract
- import tempfile
with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip:
temp_zip.write(zip_data)
diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py
index 56ac6cae1..b6dbb88e2 100644
--- a/browser_use/browser/session.py
+++ b/browser_use/browser/session.py
@@ -2,6 +2,8 @@
import asyncio
import logging
+import re
+import time
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal, Self, Union, cast, overload
@@ -13,7 +15,8 @@ from bubus import EventBus
from cdp_use import CDPClient
from cdp_use.cdp.fetch import AuthRequiredEvent, RequestPausedEvent
from cdp_use.cdp.network import Cookie
-from cdp_use.cdp.target import AttachedToTargetEvent, SessionID, TargetID
+from cdp_use.cdp.target import SessionID, TargetID
+from cdp_use.cdp.target.commands import CreateTargetParameters
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
from uuid_extensions import uuid7str
@@ -28,6 +31,8 @@ from browser_use.browser.events import (
BrowserErrorEvent,
BrowserLaunchEvent,
BrowserLaunchResult,
+ BrowserReconnectedEvent,
+ BrowserReconnectingEvent,
BrowserStartEvent,
BrowserStateRequestEvent,
BrowserStopEvent,
@@ -50,6 +55,7 @@ from browser_use.utils import _log_pretty_url, create_task_with_error_handling,
if TYPE_CHECKING:
from browser_use.actor.page import Page
from browser_use.browser.demo_mode import DemoMode
+ from browser_use.browser.watchdogs.captcha_watchdog import CaptchaWaitResult
DEFAULT_BROWSER_PROFILE = BrowserProfile()
@@ -145,6 +151,7 @@ class BrowserSession(BaseModel):
minimum_wait_page_load_time: float | None = None,
wait_for_network_idle_page_load_time: float | None = None,
wait_between_actions: float | None = None,
+ captcha_solver: bool | None = None,
auto_download_pdfs: bool | None = None,
cookie_whitelist_domains: list[str] | None = None,
cross_origin_iframes: bool | None = None,
@@ -211,6 +218,7 @@ class BrowserSession(BaseModel):
deterministic_rendering: bool | None = None,
proxy: ProxySettings | None = None,
enable_default_extensions: bool | None = None,
+ captcha_solver: bool | None = None,
window_size: dict | None = None,
window_position: dict | None = None,
filter_highlight_ids: bool | None = None,
@@ -277,6 +285,7 @@ class BrowserSession(BaseModel):
keep_alive: bool | None = None,
proxy: ProxySettings | None = None,
enable_default_extensions: bool | None = None,
+ captcha_solver: bool | None = None,
window_size: dict | None = None,
window_position: dict | None = None,
minimum_wait_page_load_time: float | None = None,
@@ -379,6 +388,57 @@ class BrowserSession(BaseModel):
# Cache of original viewport size for coordinate conversion (set when browser state is captured)
_original_viewport_size: tuple[int, int] | None = PrivateAttr(default=None)
+ @classmethod
+ def from_system_chrome(cls, profile_directory: str | None = None, **kwargs: Any) -> Self:
+ """Create a BrowserSession using system's Chrome installation and profile"""
+ from browser_use.skill_cli.utils import find_chrome_executable, get_chrome_profile_path, list_chrome_profiles
+
+ executable_path = find_chrome_executable()
+ if executable_path is None:
+ raise RuntimeError(
+ 'Chrome not found. Please install Chrome or use Browser() with explicit executable_path.\n'
+ 'Expected locations:\n'
+ ' macOS: /Applications/Google Chrome.app/Contents/MacOS/Google Chrome\n'
+ ' Linux: /usr/bin/google-chrome or /usr/bin/chromium\n'
+ ' Windows: C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
+ )
+
+ user_data_dir = get_chrome_profile_path(None)
+ if user_data_dir is None:
+ raise RuntimeError(
+ 'Could not detect Chrome profile directory for your platform.\n'
+ 'Expected locations:\n'
+ ' macOS: ~/Library/Application Support/Google/Chrome\n'
+ ' Linux: ~/.config/google-chrome or ~/.config/chromium\n'
+ ' Windows: %LocalAppData%\\Google\\Chrome\\User Data'
+ )
+
+ # Auto-select profile if not specified
+ profiles = list_chrome_profiles()
+ if profile_directory is None:
+ if profiles:
+ # Use first available profile
+ profile_directory = profiles[0]['directory']
+ logging.getLogger('browser_use').info(
+ f'Auto-selected Chrome profile: {profiles[0]["name"]} ({profile_directory})'
+ )
+ else:
+ profile_directory = 'Default'
+
+ return cls(
+ executable_path=executable_path,
+ user_data_dir=user_data_dir,
+ profile_directory=profile_directory,
+ **kwargs,
+ )
+
+ @classmethod
+ def list_chrome_profiles(cls) -> list[dict[str, str]]:
+ """List available Chrome profiles on the system"""
+ from browser_use.skill_cli.utils import list_chrome_profiles
+
+ return list_chrome_profiles()
+
# Convenience properties for common browser settings
@property
def cdp_url(self) -> str | None:
@@ -390,6 +450,38 @@ class BrowserSession(BaseModel):
"""Whether this is a local browser instance from browser profile."""
return self.browser_profile.is_local
+ @property
+ def is_cdp_connected(self) -> bool:
+ """Check if the CDP WebSocket connection is alive and usable.
+
+ Returns True only if the root CDP client exists and its WebSocket is in OPEN state.
+ A dead/closing/closed WebSocket returns False, preventing handlers from dispatching
+ CDP commands that would hang until timeout on a broken connection.
+ """
+ if self._cdp_client_root is None or self._cdp_client_root.ws is None:
+ return False
+ try:
+ from websockets.protocol import State
+
+ return self._cdp_client_root.ws.state is State.OPEN
+ except Exception:
+ return False
+
+ async def wait_if_captcha_solving(self, timeout: float | None = None) -> 'CaptchaWaitResult | None':
+ """Wait if a captcha is currently being solved by the browser proxy.
+
+ Returns:
+ A CaptchaWaitResult if we had to wait, or None if no captcha was in progress.
+ """
+ if self._captcha_watchdog is not None:
+ return await self._captcha_watchdog.wait_if_captcha_solving(timeout=timeout)
+ return None
+
+ @property
+ def is_reconnecting(self) -> bool:
+ """Whether a WebSocket reconnection attempt is currently in progress."""
+ return self._reconnecting
+
@property
def cloud_browser(self) -> bool:
"""Whether to use cloud browser service from browser profile."""
@@ -436,10 +528,22 @@ class BrowserSession(BaseModel):
_screenshot_watchdog: Any | None = PrivateAttr(default=None)
_permissions_watchdog: Any | None = PrivateAttr(default=None)
_recording_watchdog: Any | None = PrivateAttr(default=None)
+ _captcha_watchdog: Any | None = PrivateAttr(default=None)
+ _watchdogs_attached: bool = PrivateAttr(default=False)
_cloud_browser_client: CloudBrowserClient = PrivateAttr(default_factory=lambda: CloudBrowserClient())
_demo_mode: 'DemoMode | None' = PrivateAttr(default=None)
+ # WebSocket reconnection state
+ # Max wait = attempts * timeout_per_attempt + sum(delays) + small buffer
+ # Default: 3 * 15s + (1+2+4)s + 2s = 54s
+ RECONNECT_WAIT_TIMEOUT: float = 54.0
+ _reconnecting: bool = PrivateAttr(default=False)
+ _reconnect_event: asyncio.Event = PrivateAttr(default_factory=asyncio.Event)
+ _reconnect_lock: asyncio.Lock = PrivateAttr(default_factory=asyncio.Lock)
+ _reconnect_task: asyncio.Task | None = PrivateAttr(default=None)
+ _intentional_stop: bool = PrivateAttr(default=False)
+
_logger: Any = PrivateAttr(default=None)
@property
@@ -476,6 +580,15 @@ class BrowserSession(BaseModel):
async def reset(self) -> None:
"""Clear all cached CDP sessions with proper cleanup."""
+ # Suppress auto-reconnect callback during teardown
+ self._intentional_stop = True
+ # Cancel any in-flight reconnection task
+ if self._reconnect_task and not self._reconnect_task.done():
+ self._reconnect_task.cancel()
+ self._reconnect_task = None
+ self._reconnecting = False
+ self._reconnect_event.set() # unblock any waiters
+
cdp_status = 'connected' if self._cdp_client_root else 'not connected'
session_mgr_status = 'exists' if self.session_manager else 'None'
self.logger.debug(
@@ -516,15 +629,21 @@ class BrowserSession(BaseModel):
self._screenshot_watchdog = None
self._permissions_watchdog = None
self._recording_watchdog = None
+ self._captcha_watchdog = None
+ self._watchdogs_attached = False
if self._demo_mode:
self._demo_mode.reset()
self._demo_mode = None
+ self._intentional_stop = False
self.logger.info('✅ Browser session reset complete')
def model_post_init(self, __context) -> None:
"""Register event handlers after model initialization."""
self._connection_lock = asyncio.Lock()
+ # Initialize reconnect event as set (no reconnection pending)
+ self._reconnect_event = asyncio.Event()
+ self._reconnect_event.set()
# Check if handlers are already registered to prevent duplicates
from browser_use.browser.watchdog_base import BaseWatchdog
@@ -559,6 +678,7 @@ class BrowserSession(BaseModel):
async def kill(self) -> None:
"""Kill the browser session and reset all state."""
+ self._intentional_stop = True
self.logger.debug('🛑 kill() called - stopping browser with force=True and resetting state')
# First save storage state while CDP is still connected
@@ -582,6 +702,7 @@ class BrowserSession(BaseModel):
This clears event buses and cached state but keeps the browser alive.
Useful when you want to clean up resources but plan to reconnect later.
"""
+ self._intentional_stop = True
self.logger.debug('⏸️ stop() called - stopping browser gracefully (force=False) and resetting state')
# First save storage state while CDP is still connected
@@ -600,6 +721,10 @@ class BrowserSession(BaseModel):
# Create fresh event bus
self.event_bus = EventBus()
+ async def close(self) -> None:
+ """Alias for stop()."""
+ await self.stop()
+
@observe_debug(ignore_input=True, ignore_output=True, name='browser_start_event_handler')
async def on_BrowserStartEvent(self, event: BrowserStartEvent) -> dict[str, str]:
"""Handle browser start request.
@@ -628,9 +753,7 @@ class BrowserSession(BaseModel):
self.browser_profile.is_local = False
self.logger.info('🌤️ Successfully connected to cloud browser service')
except CloudBrowserAuthError:
- raise CloudBrowserAuthError(
- 'Authentication failed for cloud browser service. Set BROWSER_USE_API_KEY environment variable. You can also create an API key at https://cloud.browser-use.com/new-api-key'
- )
+ raise
except CloudBrowserError as e:
raise CloudBrowserError(f'Failed to create cloud browser: {e}')
elif self.is_local:
@@ -653,11 +776,39 @@ class BrowserSession(BaseModel):
# Only connect if not already connected
if self._cdp_client_root is None:
# Setup browser via CDP (for both local and remote cases)
- await self.connect(cdp_url=self.cdp_url)
+ # Global timeout prevents connect() from hanging indefinitely on
+ # slow/broken WebSocket connections (common on Lambda → remote browser)
+ try:
+ await asyncio.wait_for(self.connect(cdp_url=self.cdp_url), timeout=15.0)
+ except TimeoutError:
+ # Timeout cancels connect() via CancelledError, which bypasses
+ # connect()'s `except Exception` cleanup (CancelledError is BaseException).
+ # Clean up the partially-initialized client so future start attempts
+ # don't skip reconnection due to _cdp_client_root being non-None.
+ cdp_client = cast(CDPClient | None, self._cdp_client_root)
+ if cdp_client is not None:
+ try:
+ await cdp_client.stop()
+ except Exception:
+ pass
+ self._cdp_client_root = None
+ manager = self.session_manager
+ if manager is not None:
+ try:
+ await manager.clear()
+ except Exception:
+ pass
+ self.session_manager = None
+ self.agent_focus_target_id = None
+ raise RuntimeError(
+ f'connect() timed out after 15s — CDP connection to {self.cdp_url} is too slow or unresponsive'
+ )
assert self.cdp_client is not None
# Notify that browser is connected (single place)
- self.event_bus.dispatch(BrowserConnectedEvent(cdp_url=self.cdp_url))
+ # Ensure BrowserConnected handlers (storage_state restore) complete before
+ # start() returns so cookies/storage are applied before navigation.
+ await self.event_bus.dispatch(BrowserConnectedEvent(cdp_url=self.cdp_url))
if self.browser_profile.demo_mode:
try:
@@ -687,6 +838,11 @@ class BrowserSession(BaseModel):
details={'cdp_url': self.cdp_url, 'is_local': self.is_local},
)
)
+ if self.is_local and not isinstance(e, (CloudBrowserAuthError, CloudBrowserError)):
+ self.logger.warning(
+ 'Local browser failed to start. Cloud browsers require no local install and work out of the box.\n'
+ ' Try: Browser(use_cloud=True) | Get an API key: https://cloud.browser-use.com?utm_source=oss&utm_medium=browser_launch_failure'
+ )
raise
async def on_NavigateToUrlEvent(self, event: NavigateToUrlEvent) -> None:
@@ -756,7 +912,13 @@ class BrowserSession(BaseModel):
await self.event_bus.dispatch(NavigationStartedEvent(target_id=target_id, url=event.url))
# Navigate to URL with proper lifecycle waiting
- await self._navigate_and_wait(event.url, target_id)
+ await self._navigate_and_wait(
+ event.url,
+ target_id,
+ timeout=event.timeout_ms / 1000 if event.timeout_ms is not None else None,
+ wait_until=event.wait_until,
+ nav_timeout=event.event_timeout,
+ )
# Close any extension options pages that might have opened
await self._close_extension_options_pages()
@@ -793,17 +955,19 @@ class BrowserSession(BaseModel):
await self.event_bus.dispatch(AgentFocusChangedEvent(target_id=target_id, url=event.url))
raise
- async def _navigate_and_wait(self, url: str, target_id: str, timeout: float | None = None) -> None:
+ async def _navigate_and_wait(
+ self,
+ url: str,
+ target_id: str,
+ timeout: float | None = None,
+ wait_until: str = 'load',
+ nav_timeout: float | None = None,
+ ) -> None:
"""Navigate to URL and wait for page readiness using CDP lifecycle events.
- Two-strategy approach optimized for speed with robust fallback:
- 1. networkIdle - Returns ASAP when no network activity (~50-200ms for cached pages)
- 2. load - Fallback when page has ongoing network activity (all resources loaded)
-
- This gives us instant returns for cached content while being robust for dynamic pages.
-
- NO handler registration here - handlers are registered ONCE per session in SessionManager.
- We poll stored events instead to avoid handler accumulation.
+ Polls stored lifecycle events (registered once per session in SessionManager).
+ wait_until controls the minimum acceptable signal: 'commit', 'domcontentloaded', 'load', 'networkidle'.
+ nav_timeout controls the timeout for the CDP Page.navigate() call itself (defaults to 20.0s).
"""
cdp_session = await self.get_or_create_cdp_session(target_id, focus=False)
@@ -815,28 +979,38 @@ class BrowserSession(BaseModel):
if url.startswith('http') and current_url.startswith('http')
else False
)
- timeout = 2.0 if same_domain else 4.0
+ timeout = 3.0 if same_domain else 8.0
- # Start performance tracking
nav_start_time = asyncio.get_event_loop().time()
- nav_result = await cdp_session.cdp_client.send.Page.navigate(
- params={'url': url, 'transitionType': 'address_bar'},
- session_id=cdp_session.session_id,
- )
+ # Wrap Page.navigate() with timeout — heavy sites can block here for 10s+
+ # Use nav_timeout parameter if provided, otherwise default to 20.0
+ if nav_timeout is None:
+ nav_timeout = 20.0
+ try:
+ nav_result = await asyncio.wait_for(
+ cdp_session.cdp_client.send.Page.navigate(
+ params={'url': url, 'transitionType': 'address_bar'},
+ session_id=cdp_session.session_id,
+ ),
+ timeout=nav_timeout,
+ )
+ except TimeoutError:
+ duration_ms = (asyncio.get_event_loop().time() - nav_start_time) * 1000
+ raise RuntimeError(f'Page.navigate() timed out after {nav_timeout}s ({duration_ms:.0f}ms) for {url}')
- # Check for immediate navigation errors
if nav_result.get('errorText'):
raise RuntimeError(f'Navigation failed: {nav_result["errorText"]}')
- # Track this specific navigation
+ if wait_until == 'commit':
+ duration_ms = (asyncio.get_event_loop().time() - nav_start_time) * 1000
+ self.logger.debug(f'✅ Page ready for {url} (commit, {duration_ms:.0f}ms)')
+ return
+
navigation_id = nav_result.get('loaderId')
start_time = asyncio.get_event_loop().time()
+ seen_events = []
- # Poll stored lifecycle events
- seen_events = [] # Track events for timeout diagnostics
-
- # Check if session has lifecycle monitoring enabled
if not hasattr(cdp_session, '_lifecycle_events'):
raise RuntimeError(
f'❌ Lifecycle monitoring not enabled for {cdp_session.target_id[:8]}! '
@@ -844,42 +1018,37 @@ class BrowserSession(BaseModel):
f'Session: {cdp_session}'
)
- # Poll for lifecycle events until timeout
- poll_interval = 0.05 # Poll every 50ms
+ # Acceptable events by readiness level (higher is always acceptable)
+ acceptable_events: set[str] = {'networkIdle'}
+ if wait_until in ('load', 'domcontentloaded'):
+ acceptable_events.add('load')
+ if wait_until == 'domcontentloaded':
+ acceptable_events.add('DOMContentLoaded')
+
+ poll_interval = 0.05
while (asyncio.get_event_loop().time() - start_time) < timeout:
- # Check stored events
try:
- # Get recent events matching our navigation
for event_data in list(cdp_session._lifecycle_events):
event_name = event_data.get('name')
event_loader_id = event_data.get('loaderId')
- # Track events
event_str = f'{event_name}(loader={event_loader_id[:8] if event_loader_id else "none"})'
if event_str not in seen_events:
seen_events.append(event_str)
- # Only respond to events from our navigation (or accept all if no loaderId)
if event_loader_id and navigation_id and event_loader_id != navigation_id:
continue
- if event_name == 'networkIdle':
+ if event_name in acceptable_events:
duration_ms = (asyncio.get_event_loop().time() - nav_start_time) * 1000
- self.logger.debug(f'✅ Page ready for {url} (networkIdle, {duration_ms:.0f}ms)')
- return
-
- elif event_name == 'load':
- duration_ms = (asyncio.get_event_loop().time() - nav_start_time) * 1000
- self.logger.debug(f'✅ Page ready for {url} (load, {duration_ms:.0f}ms)')
+ self.logger.debug(f'✅ Page ready for {url} ({event_name}, {duration_ms:.0f}ms)')
return
except Exception as e:
self.logger.debug(f'Error polling lifecycle events: {e}')
- # Wait before next poll
await asyncio.sleep(poll_interval)
- # Timeout - continue anyway with detailed diagnostics
duration_ms = (asyncio.get_event_loop().time() - nav_start_time) * 1000
if not seen_events:
self.logger.error(
@@ -1028,6 +1197,14 @@ class BrowserSession(BaseModel):
else:
self.logger.debug(f'File already tracked: {event.path}')
+ def _cloud_session_id_from_cdp_url(self) -> str | None:
+ """Derive cloud browser session ID from a Browser Use CDP URL."""
+ if not self.cdp_url:
+ return None
+ host = urlparse(self.cdp_url).hostname or ''
+ match = re.match(r'^([0-9a-fA-F-]{36})\.cdp\d+\.browser-use\.com$', host)
+ return match.group(1) if match else None
+
async def on_BrowserStopEvent(self, event: BrowserStopEvent) -> None:
"""Handle browser stop request."""
@@ -1037,13 +1214,22 @@ class BrowserSession(BaseModel):
self.event_bus.dispatch(BrowserStoppedEvent(reason='Kept alive due to keep_alive=True'))
return
- # Clean up cloud browser session if using cloud browser
- if self.browser_profile.use_cloud:
+ # Clean up cloud browser session for both:
+ # 1) native use_cloud sessions (current_session_id set by create_browser)
+ # 2) reconnected cdp_url sessions (derive UUID from host)
+ cloud_session_id = self._cloud_browser_client.current_session_id or self._cloud_session_id_from_cdp_url()
+ if cloud_session_id:
try:
- await self._cloud_browser_client.stop_browser()
- self.logger.info('🌤️ Cloud browser session cleaned up')
+ await self._cloud_browser_client.stop_browser(cloud_session_id)
+ self.logger.info(f'🌤️ Cloud browser session cleaned up: {cloud_session_id}')
except Exception as e:
- self.logger.debug(f'Failed to cleanup cloud browser session: {e}')
+ self.logger.debug(f'Failed to cleanup cloud browser session {cloud_session_id}: {e}')
+ finally:
+ # Always close the httpx client to free connection pool memory
+ try:
+ await self._cloud_browser_client.close()
+ except Exception:
+ pass
# Clear CDP session cache before stopping
self.logger.info(
@@ -1207,7 +1393,7 @@ class BrowserSession(BaseModel):
output_file = Path(output_path).expanduser().resolve()
output_file.parent.mkdir(parents=True, exist_ok=True)
- output_file.write_text(json.dumps(storage_state, indent=2))
+ output_file.write_text(json.dumps(storage_state, indent=2, ensure_ascii=False), encoding='utf-8')
self.logger.info(f'💾 Exported {len(cookies)} cookies to {output_file}')
return storage_state
@@ -1288,15 +1474,41 @@ class BrowserSession(BaseModel):
f'(agent_focus stays on {current_focus}...)'
)
- # Resume if waiting for debugger
+ # Resume if waiting for debugger (non-essential, don't let it block connect)
if focus:
try:
- await session.cdp_client.send.Runtime.runIfWaitingForDebugger(session_id=session.session_id)
+ await asyncio.wait_for(
+ session.cdp_client.send.Runtime.runIfWaitingForDebugger(session_id=session.session_id),
+ timeout=3.0,
+ )
except Exception:
- pass # May fail if not waiting
+ pass # May fail if not waiting, or timeout — either is fine
return session
+ async def set_extra_headers(self, headers: dict[str, str], target_id: TargetID | None = None) -> None:
+ """Set extra HTTP headers using CDP Network.setExtraHTTPHeaders.
+
+ These headers will be sent with every HTTP request made by the target.
+ Network domain must be enabled first (done automatically for page targets
+ in SessionManager._enable_page_monitoring).
+
+ Args:
+ headers: Dictionary of header name -> value pairs to inject into every request.
+ target_id: Target to set headers on. Defaults to the current agent focus target.
+ """
+ if target_id is None:
+ if not self.agent_focus_target_id:
+ return
+ target_id = self.agent_focus_target_id
+
+ cdp_session = await self.get_or_create_cdp_session(target_id, focus=False)
+ # Ensure Network domain is enabled (idempotent - safe to call multiple times)
+ await cdp_session.cdp_client.send.Network.enable(session_id=cdp_session.session_id)
+ await cdp_session.cdp_client.send.Network.setExtraHTTPHeaders(
+ params={'headers': cast(Any, headers)}, session_id=cdp_session.session_id
+ )
+
# endregion - ========== CDP-based ... ==========
# region - ========== Helper Methods ==========
@@ -1349,11 +1561,12 @@ class BrowserSession(BaseModel):
async def attach_all_watchdogs(self) -> None:
"""Initialize and attach all watchdogs with explicit handler registration."""
# Prevent duplicate watchdog attachment
- if hasattr(self, '_watchdogs_attached') and self._watchdogs_attached:
+ if self._watchdogs_attached:
self.logger.debug('Watchdogs already attached, skipping duplicate attachment')
return
from browser_use.browser.watchdogs.aboutblank_watchdog import AboutBlankWatchdog
+ from browser_use.browser.watchdogs.captcha_watchdog import CaptchaWatchdog
# from browser_use.browser.crash_watchdog import CrashWatchdog
from browser_use.browser.watchdogs.default_action_watchdog import DefaultActionWatchdog
@@ -1487,6 +1700,12 @@ class BrowserSession(BaseModel):
self._har_recording_watchdog = HarRecordingWatchdog(event_bus=self.event_bus, browser_session=self)
self._har_recording_watchdog.attach_to_session()
+ # Initialize CaptchaWatchdog (listens for captcha solver events from the browser proxy)
+ if self.browser_profile.captcha_solver:
+ CaptchaWatchdog.model_rebuild()
+ self._captcha_watchdog = CaptchaWatchdog(event_bus=self.event_bus, browser_session=self)
+ self._captcha_watchdog.attach_to_session()
+
# Mark watchdogs as attached to prevent duplicate attachment
self._watchdogs_attached = True
@@ -1524,8 +1743,17 @@ class BrowserSession(BaseModel):
)
# Run a tiny HTTP client to query for the WebSocket URL from the /json/version endpoint
- async with httpx.AsyncClient() as client:
- headers = self.browser_profile.headers or {}
+ # Default httpx timeout is 5s which can race the global wait_for(connect(), 15s).
+ # Use 30s as a safety net for direct connect() callers; the wait_for is the real deadline.
+ # For localhost/127.0.0.1, disable trust_env to prevent proxy env vars (HTTP_PROXY, HTTPS_PROXY)
+ # from routing local requests through a proxy, which causes 502 errors on Windows.
+ # Remote CDP URLs should still respect proxy settings.
+ is_localhost = parsed_url.hostname in ('localhost', '127.0.0.1', '::1')
+ async with httpx.AsyncClient(timeout=httpx.Timeout(30.0), trust_env=not is_localhost) as client:
+ headers = dict(self.browser_profile.headers or {})
+ from browser_use.utils import get_browser_use_version
+
+ headers.setdefault('User-Agent', f'browser-use/{get_browser_use_version()}')
version_info = await client.get(url, headers=headers)
self.logger.debug(f'Raw version info: {str(version_info)}')
self.browser_profile.cdp_url = version_info.json()['webSocketDebuggerUrl']
@@ -1537,10 +1765,14 @@ class BrowserSession(BaseModel):
try:
# Create and store the CDP client for direct CDP communication
- headers = getattr(self.browser_profile, 'headers', None)
+ headers = dict(getattr(self.browser_profile, 'headers', None) or {})
+ if not self.is_local:
+ from browser_use.utils import get_browser_use_version
+
+ headers.setdefault('User-Agent', f'browser-use/{get_browser_use_version()}')
self._cdp_client_root = CDPClient(
self.cdp_url,
- additional_headers=headers,
+ additional_headers=headers or None,
max_ws_frame_size=200 * 1024 * 1024, # Use 200MB limit to handle pages with very large DOMs
)
assert self._cdp_client_root is not None
@@ -1569,22 +1801,27 @@ class BrowserSession(BaseModel):
# SessionManager has already discovered all targets via start_monitoring()
page_targets_from_manager = self.session_manager.get_all_page_targets()
- # Check for chrome://newtab pages and redirect them to about:blank
+ # Check for chrome://newtab pages and redirect them to about:blank (in parallel)
from browser_use.utils import is_new_tab_page
- for target in page_targets_from_manager:
+ async def _redirect_newtab(target):
target_url = target.url
- if is_new_tab_page(target_url) and target_url != 'about:blank':
- target_id = target.target_id
- self.logger.debug(f'🔄 Redirecting {target_url} to about:blank for target {target_id}')
- try:
- # Use public API with focus=False to avoid changing focus during init
- session = await self.get_or_create_cdp_session(target_id, focus=False)
- await session.cdp_client.send.Page.navigate(params={'url': 'about:blank'}, session_id=session.session_id)
- # Update target url
- target.url = 'about:blank'
- except Exception as e:
- self.logger.warning(f'Failed to redirect {target_url}: {e}')
+ target_id = target.target_id
+ self.logger.debug(f'🔄 Redirecting {target_url} to about:blank for target {target_id}')
+ try:
+ session = await self.get_or_create_cdp_session(target_id, focus=False)
+ await session.cdp_client.send.Page.navigate(params={'url': 'about:blank'}, session_id=session.session_id)
+ target.url = 'about:blank'
+ except Exception as e:
+ self.logger.warning(f'Failed to redirect {target_url}: {e}')
+
+ redirect_tasks = [
+ _redirect_newtab(target)
+ for target in page_targets_from_manager
+ if is_new_tab_page(target.url) and target.url != 'about:blank'
+ ]
+ if redirect_tasks:
+ await asyncio.gather(*redirect_tasks, return_exceptions=True)
# Ensure we have at least one page
if not page_targets_from_manager:
@@ -1610,6 +1847,10 @@ class BrowserSession(BaseModel):
# Enable proxy authentication handling if configured
await self._setup_proxy_auth()
+ # Attach WS drop detection callback for auto-reconnection
+ self._intentional_stop = False
+ self._attach_ws_drop_callback()
+
# Verify the target is working
if self.agent_focus_target_id:
target = self.session_manager.get_target(self.agent_focus_target_id)
@@ -1774,33 +2015,6 @@ class BrowserSession(BaseModel):
except Exception as e:
self.logger.debug(f'Failed to register authRequired handlers: {type(e).__name__}: {e}')
- # Auto-enable Fetch on every newly attached target to ensure auth callbacks fire
- def _on_attached(event: AttachedToTargetEvent, session_id: SessionID | None = None):
- sid = event.get('sessionId') or event.get('session_id') or session_id
- if not sid:
- return
-
- async def _enable():
- assert self._cdp_client_root
- try:
- await self._cdp_client_root.send.Fetch.enable(
- params={'handleAuthRequests': True},
- session_id=sid,
- )
- self.logger.debug(f'Fetch.enable(handleAuthRequests=True) enabled on attached session {sid}')
- except Exception as e:
- self.logger.debug(f'Fetch.enable on attached session failed: {type(e).__name__}: {e}')
-
- create_task_with_error_handling(
- _enable(), name='fetch_enable_attached', logger_instance=self.logger, suppress_exceptions=True
- )
-
- try:
- self._cdp_client_root.register.Target.attachedToTarget(_on_attached)
- self.logger.debug('Registered Target.attachedToTarget handler for Fetch.enable')
- except Exception as e:
- self.logger.debug(f'Failed to register attachedToTarget handler: {type(e).__name__}: {e}')
-
# Ensure Fetch is enabled for the current focused target's session, too
try:
if self.agent_focus_target_id:
@@ -1815,6 +2029,183 @@ class BrowserSession(BaseModel):
except Exception as e:
self.logger.debug(f'Skipping proxy auth setup: {type(e).__name__}: {e}')
+ async def reconnect(self) -> None:
+ """Re-establish the CDP WebSocket connection to an already-running browser.
+
+ This is a lightweight reconnection that:
+ 1. Stops the old CDPClient (WS already dead, just clean state)
+ 2. Clears SessionManager (all CDP sessions are invalid post-disconnect)
+ 3. Creates a new CDPClient with the same cdp_url
+ 4. Re-initializes SessionManager and re-enables autoAttach
+ 5. Re-discovers page targets and restores agent focus
+ 6. Re-enables proxy auth if configured
+ """
+ assert self.cdp_url, 'Cannot reconnect without a CDP URL'
+
+ old_focus_target_id = self.agent_focus_target_id
+
+ # 1. Stop old CDPClient (WS is already dead, this just cleans internal state)
+ if self._cdp_client_root:
+ try:
+ await self._cdp_client_root.stop()
+ except Exception as e:
+ self.logger.debug(f'Error stopping old CDP client during reconnect: {e}')
+ self._cdp_client_root = None
+
+ # 2. Clear SessionManager (all sessions are stale)
+ if self.session_manager:
+ try:
+ await self.session_manager.clear()
+ except Exception as e:
+ self.logger.debug(f'Error clearing SessionManager during reconnect: {e}')
+ self.session_manager = None
+
+ self.agent_focus_target_id = None
+
+ # 3. Create new CDPClient with the same cdp_url
+ headers = dict(getattr(self.browser_profile, 'headers', None) or {})
+ if not self.is_local:
+ from browser_use.utils import get_browser_use_version
+
+ headers.setdefault('User-Agent', f'browser-use/{get_browser_use_version()}')
+ self._cdp_client_root = CDPClient(
+ self.cdp_url,
+ additional_headers=headers or None,
+ max_ws_frame_size=200 * 1024 * 1024,
+ )
+ await self._cdp_client_root.start()
+
+ # 4. Re-initialize SessionManager
+ from browser_use.browser.session_manager import SessionManager
+
+ self.session_manager = SessionManager(self)
+ await self.session_manager.start_monitoring()
+
+ # 5. Re-enable autoAttach
+ await self._cdp_client_root.send.Target.setAutoAttach(
+ params={'autoAttach': True, 'waitForDebuggerOnStart': False, 'flatten': True}
+ )
+
+ # 6. Re-discover page targets and restore focus
+ page_targets = self.session_manager.get_all_page_targets()
+
+ # Prefer the old focus target if it still exists
+ restored = False
+ if old_focus_target_id:
+ for target in page_targets:
+ if target.target_id == old_focus_target_id:
+ await self.get_or_create_cdp_session(old_focus_target_id, focus=True)
+ restored = True
+ self.logger.debug(f'🔄 Restored agent focus to previous target {old_focus_target_id[:8]}...')
+ break
+
+ if not restored:
+ if page_targets:
+ fallback_id = page_targets[0].target_id
+ await self.get_or_create_cdp_session(fallback_id, focus=True)
+ self.logger.debug(f'🔄 Agent focus set to fallback target {fallback_id[:8]}...')
+ else:
+ # No pages exist — create one
+ new_target = await self._cdp_client_root.send.Target.createTarget(params={'url': 'about:blank'})
+ target_id = new_target['targetId']
+ await self.get_or_create_cdp_session(target_id, focus=True)
+ self.logger.debug(f'🔄 Created new blank page during reconnect: {target_id[:8]}...')
+
+ # 7. Re-enable proxy auth if configured
+ await self._setup_proxy_auth()
+
+ # 8. Attach the WS drop detection callback to the new client
+ self._attach_ws_drop_callback()
+
+ async def _auto_reconnect(self, max_attempts: int = 3) -> None:
+ """Attempt to reconnect with exponential backoff.
+
+ Dispatches BrowserReconnectingEvent before each attempt and
+ BrowserReconnectedEvent on success.
+ """
+ async with self._reconnect_lock:
+ if self._reconnecting:
+ return # already in progress from another caller
+ self._reconnecting = True
+ self._reconnect_event.clear()
+
+ start_time = time.time()
+ delays = [1.0, 2.0, 4.0]
+
+ try:
+ for attempt in range(1, max_attempts + 1):
+ self.event_bus.dispatch(
+ BrowserReconnectingEvent(
+ cdp_url=self.cdp_url or '',
+ attempt=attempt,
+ max_attempts=max_attempts,
+ )
+ )
+ self.logger.warning(f'🔄 WebSocket reconnection attempt {attempt}/{max_attempts}...')
+
+ try:
+ await asyncio.wait_for(self.reconnect(), timeout=15.0)
+ # Success
+ downtime = time.time() - start_time
+ self.event_bus.dispatch(
+ BrowserReconnectedEvent(
+ cdp_url=self.cdp_url or '',
+ attempt=attempt,
+ downtime_seconds=downtime,
+ )
+ )
+ self.logger.info(f'🔄 WebSocket reconnected after {downtime:.1f}s (attempt {attempt})')
+ return
+ except Exception as e:
+ self.logger.warning(f'🔄 Reconnection attempt {attempt} failed: {type(e).__name__}: {e}')
+ if attempt < max_attempts:
+ delay = delays[attempt - 1] if attempt - 1 < len(delays) else delays[-1]
+ await asyncio.sleep(delay)
+
+ # All attempts exhausted
+ self.logger.error(f'🔄 All {max_attempts} reconnection attempts failed')
+ self.event_bus.dispatch(
+ BrowserErrorEvent(
+ error_type='ReconnectionFailed',
+ message=f'Failed to reconnect after {max_attempts} attempts ({time.time() - start_time:.1f}s)',
+ details={'cdp_url': self.cdp_url or '', 'max_attempts': max_attempts},
+ )
+ )
+ finally:
+ self._reconnecting = False
+ self._reconnect_event.set() # wake up all waiters regardless of outcome
+
+ def _attach_ws_drop_callback(self) -> None:
+ """Attach a done callback to the CDPClient's message handler task to detect WS drops."""
+ if not self._cdp_client_root or not hasattr(self._cdp_client_root, '_message_handler_task'):
+ return
+
+ task = self._cdp_client_root._message_handler_task
+ if task is None or task.done():
+ return
+
+ def _on_message_handler_done(fut: asyncio.Future) -> None:
+ # Guard: skip if intentionally stopped, already reconnecting, or no cdp_url
+ if self._intentional_stop or self._reconnecting or not self.cdp_url:
+ return
+
+ # The message handler task exiting means the WS connection dropped
+ exc = fut.exception() if not fut.cancelled() else None
+ self.logger.warning(
+ f'🔌 CDP WebSocket message handler exited unexpectedly'
+ f'{f": {type(exc).__name__}: {exc}" if exc else " (connection closed)"}'
+ )
+
+ # Fire auto-reconnect as an asyncio task
+ try:
+ loop = asyncio.get_running_loop()
+ self._reconnect_task = loop.create_task(self._auto_reconnect())
+ except RuntimeError:
+ # No running event loop — can't reconnect
+ self.logger.error('🔌 No event loop available for auto-reconnect')
+
+ task.add_done_callback(_on_message_handler_done)
+
async def get_tabs(self) -> list[TabInfo]:
"""Get information about all open tabs using cached target data."""
tabs = []
@@ -2126,6 +2517,62 @@ class BrowserSession(BaseModel):
and element.attributes.get('type', '').lower() == 'file'
)
+ def find_file_input_near_element(
+ self,
+ node: 'EnhancedDOMTreeNode',
+ max_height: int = 3,
+ max_descendant_depth: int = 3,
+ ) -> 'EnhancedDOMTreeNode | None':
+ """Find the closest file input to the given element.
+
+ Walks up the DOM tree (up to max_height levels), checking the node itself,
+ its descendants (up to max_descendant_depth deep), and siblings at each level.
+
+ Args:
+ node: Starting DOM element
+ max_height: Maximum levels to walk up the parent chain
+ max_descendant_depth: Maximum depth to search descendants
+
+ Returns:
+ The nearest file input element, or None if not found
+ """
+ from browser_use.dom.views import EnhancedDOMTreeNode
+
+ def _find_in_descendants(n: EnhancedDOMTreeNode, depth: int) -> EnhancedDOMTreeNode | None:
+ if depth < 0:
+ return None
+ if self.is_file_input(n):
+ return n
+ for child in n.children_nodes or []:
+ result = _find_in_descendants(child, depth - 1)
+ if result:
+ return result
+ return None
+
+ current: EnhancedDOMTreeNode | None = node
+ for _ in range(max_height + 1):
+ if current is None:
+ break
+ # Check the current node itself
+ if self.is_file_input(current):
+ return current
+ # Check all descendants of the current node
+ result = _find_in_descendants(current, max_descendant_depth)
+ if result:
+ return result
+ # Check all siblings and their descendants
+ if current.parent_node:
+ for sibling in current.parent_node.children_nodes or []:
+ if sibling is current:
+ continue
+ if self.is_file_input(sibling):
+ return sibling
+ result = _find_in_descendants(sibling, max_descendant_depth)
+ if result:
+ return result
+ current = current.parent_node
+ return None
+
async def get_selector_map(self) -> dict[int, EnhancedDOMTreeNode]:
"""Get the current selector map from cached state or DOM watchdog.
@@ -2177,45 +2624,46 @@ class BrowserSession(BaseModel):
async def remove_highlights(self) -> None:
"""Remove highlights from the page using CDP."""
- if not self.browser_profile.highlight_elements:
+ if not self.browser_profile.highlight_elements and not self.browser_profile.dom_highlight_elements:
return
try:
- # Get cached session
- cdp_session = await self.get_or_create_cdp_session()
+ async with asyncio.timeout(3.0):
+ # Get cached session
+ cdp_session = await self.get_or_create_cdp_session()
- # Remove highlights via JavaScript - be thorough
- script = """
- (function() {
- // Remove all browser-use highlight elements
- const highlights = document.querySelectorAll('[data-browser-use-highlight]');
- console.log('Removing', highlights.length, 'browser-use highlight elements');
- highlights.forEach(el => el.remove());
+ # Remove highlights via JavaScript - be thorough
+ script = """
+ (function() {
+ // Remove all browser-use highlight elements
+ const highlights = document.querySelectorAll('[data-browser-use-highlight]');
+ console.log('Removing', highlights.length, 'browser-use highlight elements');
+ highlights.forEach(el => el.remove());
- // Also remove by ID in case selector missed anything
- const highlightContainer = document.getElementById('browser-use-debug-highlights');
- if (highlightContainer) {
- console.log('Removing highlight container by ID');
- highlightContainer.remove();
- }
+ // Also remove by ID in case selector missed anything
+ const highlightContainer = document.getElementById('browser-use-debug-highlights');
+ if (highlightContainer) {
+ console.log('Removing highlight container by ID');
+ highlightContainer.remove();
+ }
- // Final cleanup - remove any orphaned tooltips
- const orphanedTooltips = document.querySelectorAll('[data-browser-use-highlight="tooltip"]');
- orphanedTooltips.forEach(el => el.remove());
+ // Final cleanup - remove any orphaned tooltips
+ const orphanedTooltips = document.querySelectorAll('[data-browser-use-highlight="tooltip"]');
+ orphanedTooltips.forEach(el => el.remove());
- return { removed: highlights.length };
- })();
- """
- result = await cdp_session.cdp_client.send.Runtime.evaluate(
- params={'expression': script, 'returnByValue': True}, session_id=cdp_session.session_id
- )
+ return { removed: highlights.length };
+ })();
+ """
+ result = await cdp_session.cdp_client.send.Runtime.evaluate(
+ params={'expression': script, 'returnByValue': True}, session_id=cdp_session.session_id
+ )
- # Log the result for debugging
- if result and 'result' in result and 'value' in result['result']:
- removed_count = result['result']['value'].get('removed', 0)
- self.logger.debug(f'Successfully removed {removed_count} highlight elements')
- else:
- self.logger.debug('Highlight removal completed')
+ # Log the result for debugging
+ if result and 'result' in result and 'value' in result['result']:
+ removed_count = result['result']['value'].get('removed', 0)
+ self.logger.debug(f'Successfully removed {removed_count} highlight elements')
+ else:
+ self.logger.debug('Highlight removal completed')
except Exception as e:
self.logger.warning(f'Failed to remove highlights: {e}')
@@ -2861,16 +3309,16 @@ class BrowserSession(BaseModel):
async def _cdp_create_new_page(self, url: str = 'about:blank', background: bool = False, new_window: bool = False) -> str:
"""Create a new page/tab using CDP Target.createTarget. Returns target ID."""
+ # Only include newWindow when True, letting Chrome auto-create window as needed
+ params = CreateTargetParameters(url=url, background=background)
+ if new_window:
+ params['newWindow'] = True
# Use the root CDP client to create tabs at the browser level
if self._cdp_client_root:
- result = await self._cdp_client_root.send.Target.createTarget(
- params={'url': url, 'newWindow': new_window, 'background': background}
- )
+ result = await self._cdp_client_root.send.Target.createTarget(params=params)
else:
# Fallback to using cdp_client if root is not available
- result = await self.cdp_client.send.Target.createTarget(
- params={'url': url, 'newWindow': new_window, 'background': background}
- )
+ result = await self.cdp_client.send.Target.createTarget(params=params)
return result['targetId']
async def _cdp_close_page(self, target_id: TargetID) -> None:
@@ -2902,15 +3350,6 @@ class BrowserSession(BaseModel):
cdp_session = await self.get_or_create_cdp_session()
await cdp_session.cdp_client.send.Storage.clearCookies(session_id=cdp_session.session_id)
- async def _cdp_set_extra_headers(self, headers: dict[str, str]) -> None:
- """Set extra HTTP headers using CDP Network.setExtraHTTPHeaders."""
- if not self.agent_focus_target_id:
- return
-
- cdp_session = await self.get_or_create_cdp_session()
- # await cdp_session.cdp_client.send.Network.setExtraHTTPHeaders(params={'headers': headers}, session_id=cdp_session.session_id)
- raise NotImplementedError('Not implemented yet')
-
async def _cdp_grant_permissions(self, permissions: list[str], origin: str | None = None) -> None:
"""Grant permissions using CDP Browser.grantPermissions."""
params = {'permissions': permissions}
@@ -3135,6 +3574,11 @@ class BrowserSession(BaseModel):
if target_type in ('iframe', 'webview') and include_iframes:
type_allowed = True
+ # Chrome often reports empty URLs for cross-origin iframe targets (OOPIFs)
+ # initially via attachedToTarget, but they are still valid and accessible via CDP.
+ # Allow them through so get_all_frames() can resolve their frame trees.
+ if not url:
+ url_allowed = True
return url_allowed and type_allowed
@@ -3185,7 +3629,10 @@ class BrowserSession(BaseModel):
continue # Skip if no session available
else:
# Get cached session for this target (don't change focus - iterating frames)
- cdp_session = await self.get_or_create_cdp_session(target_id, focus=False)
+ try:
+ cdp_session = await self.get_or_create_cdp_session(target_id, focus=False)
+ except ValueError:
+ continue # Target may have detached between discovery and session creation
if cdp_session:
target_sessions[target_id] = cdp_session.session_id
diff --git a/browser_use/browser/session_manager.py b/browser_use/browser/session_manager.py
index 91ccdadab..b380dabed 100644
--- a/browser_use/browser/session_manager.py
+++ b/browser_use/browser/session_manager.py
@@ -401,6 +401,8 @@ class SessionManager:
if '-32001' not in error_str and 'Session with given id not found' not in error_str:
self.logger.debug(f'[SessionManager] Auto-attach failed for {target_type}: {e}')
+ from browser_use.browser.session import Target
+
async with self._lock:
# Track this session for the target
if target_id not in self._target_sessions:
@@ -409,23 +411,22 @@ class SessionManager:
self._target_sessions[target_id].add(session_id)
self._session_to_target[session_id] = target_id
- # Create or update Target (source of truth for url/title)
- if target_id not in self._targets:
- from browser_use.browser.session import Target
-
- target = Target(
- target_id=target_id,
- target_type=target_type,
- url=target_info.get('url', 'about:blank'),
- title=target_info.get('title', 'Unknown title'),
- )
- self._targets[target_id] = target
- self.logger.debug(f'[SessionManager] Created target {target_id[:8]}... (type={target_type})')
- else:
- # Update existing target info
- existing_target = self._targets[target_id]
- existing_target.url = target_info.get('url', existing_target.url)
- existing_target.title = target_info.get('title', existing_target.title)
+ # Create or update Target inside the same lock so that get_target() is never
+ # called in the window between _target_sessions being set and _targets being set.
+ if target_id not in self._targets:
+ target = Target(
+ target_id=target_id,
+ target_type=target_type,
+ url=target_info.get('url', 'about:blank'),
+ title=target_info.get('title', 'Unknown title'),
+ )
+ self._targets[target_id] = target
+ self.logger.debug(f'[SessionManager] Created target {target_id[:8]}... (type={target_type})')
+ else:
+ # Update existing target info
+ existing_target = self._targets[target_id]
+ existing_target.url = target_info.get('url', existing_target.url)
+ existing_target.title = target_info.get('title', existing_target.title)
# Create CDPSession (communication channel)
from browser_use.browser.session import CDPSession
@@ -441,6 +442,21 @@ class SessionManager:
# Add to sessions dict
self._sessions[session_id] = cdp_session
+ # If proxy auth is configured, enable Fetch auth handling on this session
+ # Avoids overwriting Target.attachedToTarget handlers elsewhere
+ try:
+ proxy_cfg = self.browser_session.browser_profile.proxy
+ username = proxy_cfg.username if proxy_cfg else None
+ password = proxy_cfg.password if proxy_cfg else None
+ if username and password:
+ await cdp_session.cdp_client.send.Fetch.enable(
+ params={'handleAuthRequests': True},
+ session_id=cdp_session.session_id,
+ )
+ self.logger.debug(f'[SessionManager] Fetch.enable(handleAuthRequests=True) on session {session_id[:8]}...')
+ except Exception as e:
+ self.logger.debug(f'[SessionManager] Fetch.enable on attached session failed: {type(e).__name__}: {e}')
+
self.logger.debug(
f'[SessionManager] Created session {session_id[:8]}... for target {target_id[:8]}... '
f'(total sessions: {len(self._sessions)})'
diff --git a/browser_use/browser/watchdog_base.py b/browser_use/browser/watchdog_base.py
index b7569fbd2..622e6bc78 100644
--- a/browser_use/browser/watchdog_base.py
+++ b/browser_use/browser/watchdog_base.py
@@ -1,5 +1,6 @@
"""Base watchdog class for browser monitoring components."""
+import asyncio
import inspect
import time
from collections.abc import Iterable
@@ -73,10 +74,54 @@ class BaseWatchdog(BaseModel):
watchdog_instance = getattr(handler, '__self__', None)
watchdog_class_name = watchdog_instance.__class__.__name__ if watchdog_instance else 'Unknown'
+ # Events that should always run even when CDP is disconnected (lifecycle management)
+ LIFECYCLE_EVENT_NAMES = frozenset(
+ {
+ 'BrowserStartEvent',
+ 'BrowserStopEvent',
+ 'BrowserStoppedEvent',
+ 'BrowserLaunchEvent',
+ 'BrowserErrorEvent',
+ 'BrowserKillEvent',
+ 'BrowserReconnectingEvent',
+ 'BrowserReconnectedEvent',
+ }
+ )
+
# Create a wrapper function with unique name to avoid duplicate handler warnings
# Capture handler by value to avoid closure issues
def make_unique_handler(actual_handler):
async def unique_handler(event):
+ # Circuit breaker: skip handler if CDP WebSocket is dead
+ # (prevents handlers from hanging on broken connections until timeout)
+ # Lifecycle events are exempt — they manage browser start/stop
+ if event.event_type not in LIFECYCLE_EVENT_NAMES and not browser_session.is_cdp_connected:
+ # If reconnection is in progress, wait for it instead of silently skipping
+ if browser_session.is_reconnecting:
+ wait_timeout = browser_session.RECONNECT_WAIT_TIMEOUT
+ browser_session.logger.debug(
+ f'🚌 [{watchdog_class_name}.{actual_handler.__name__}] ⏳ Waiting for reconnection ({wait_timeout}s)...'
+ )
+ try:
+ await asyncio.wait_for(browser_session._reconnect_event.wait(), timeout=wait_timeout)
+ except TimeoutError:
+ raise ConnectionError(
+ f'[{watchdog_class_name}.{actual_handler.__name__}] '
+ f'Reconnection wait timed out after {wait_timeout}s'
+ )
+ # After wait: check if reconnection actually succeeded
+ if not browser_session.is_cdp_connected:
+ raise ConnectionError(
+ f'[{watchdog_class_name}.{actual_handler.__name__}] Reconnection failed — CDP still not connected'
+ )
+ # Reconnection succeeded — fall through to execute handler normally
+ else:
+ # Not reconnecting — intentional stop, backward compat silent skip
+ browser_session.logger.debug(
+ f'🚌 [{watchdog_class_name}.{actual_handler.__name__}] ⚡ Skipped — CDP not connected'
+ )
+ return None
+
# just for debug logging, not used for anything else
parent_event = event_bus.event_history.get(event.event_parent_id) if event.event_parent_id else None
grandparent_event = (
diff --git a/browser_use/browser/watchdogs/aboutblank_watchdog.py b/browser_use/browser/watchdogs/aboutblank_watchdog.py
index e38d148f5..f6a7a740d 100644
--- a/browser_use/browser/watchdogs/aboutblank_watchdog.py
+++ b/browser_use/browser/watchdogs/aboutblank_watchdog.py
@@ -59,11 +59,14 @@ class AboutBlankWatchdog(BaseWatchdog):
async def on_TabClosedEvent(self, event: TabClosedEvent) -> None:
"""Check tabs when a tab is closed and proactively create about:blank if needed."""
- # logger.debug('[AboutBlankWatchdog] Tab closing, checking if we need to create about:blank tab')
-
# Don't create new tabs if browser is shutting down
if self._stopping:
- # logger.debug('[AboutBlankWatchdog] Browser is stopping, not creating new tabs')
+ return
+
+ # Don't attempt CDP operations if the WebSocket is dead — dispatching
+ # NavigateToUrlEvent on a broken connection will hang until timeout
+ if not self.browser_session.is_cdp_connected:
+ self.logger.debug('[AboutBlankWatchdog] CDP not connected, skipping tab recovery')
return
# Check if we're about to close the last tab (event happens BEFORE tab closes)
@@ -89,6 +92,9 @@ class AboutBlankWatchdog(BaseWatchdog):
async def _check_and_ensure_about_blank_tab(self) -> None:
"""Check current tabs and ensure exactly one about:blank tab with animation exists."""
try:
+ if not self.browser_session.is_cdp_connected:
+ return
+
# For quick checks, just get page targets without titles to reduce noise
page_targets = await self.browser_session._cdp_get_all_pages()
diff --git a/browser_use/browser/watchdogs/captcha_watchdog.py b/browser_use/browser/watchdogs/captcha_watchdog.py
new file mode 100644
index 000000000..bde1a727b
--- /dev/null
+++ b/browser_use/browser/watchdogs/captcha_watchdog.py
@@ -0,0 +1,207 @@
+"""Captcha solver watchdog — monitors captcha events from the browser proxy.
+
+Listens for BrowserUse.captchaSolverStarted/Finished CDP events and exposes a
+wait_if_captcha_solving() method that the agent step loop uses to block until
+a captcha is resolved (with a configurable timeout).
+
+NOTE: Only a single captcha solve is tracked at a time. If multiple captchas
+overlap (e.g. rapid successive navigations), only the latest one is tracked and
+earlier in-flight waits may return prematurely.
+"""
+
+import asyncio
+from dataclasses import dataclass
+from typing import Any, ClassVar, Literal
+
+from bubus import BaseEvent
+from cdp_use.cdp.browseruse.events import CaptchaSolverFinishedEvent as CDPCaptchaSolverFinishedEvent
+from cdp_use.cdp.browseruse.events import CaptchaSolverStartedEvent as CDPCaptchaSolverStartedEvent
+from pydantic import PrivateAttr
+
+from browser_use.browser.events import (
+ BrowserConnectedEvent,
+ BrowserStoppedEvent,
+ CaptchaSolverFinishedEvent,
+ CaptchaSolverStartedEvent,
+ _get_timeout,
+)
+from browser_use.browser.watchdog_base import BaseWatchdog
+
+CaptchaResultType = Literal['success', 'failed', 'timeout', 'unknown']
+
+
+@dataclass
+class CaptchaWaitResult:
+ """Result returned by wait_if_captcha_solving() when the agent had to wait."""
+
+ waited: bool
+ vendor: str
+ url: str
+ duration_ms: int
+ result: CaptchaResultType
+
+
+class CaptchaWatchdog(BaseWatchdog):
+ """Monitors captcha solver events from the browser proxy.
+
+ When the proxy detects a CAPTCHA and starts solving it, a CDP event
+ ``BrowserUse.captchaSolverStarted`` is sent over the WebSocket. This
+ watchdog catches that event and blocks the agent's step loop (via
+ ``wait_if_captcha_solving``) until ``BrowserUse.captchaSolverFinished``
+ arrives or the configurable timeout expires.
+ """
+
+ # Event contracts
+ LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [
+ BrowserConnectedEvent,
+ BrowserStoppedEvent,
+ ]
+ EMITS: ClassVar[list[type[BaseEvent]]] = [
+ CaptchaSolverStartedEvent,
+ CaptchaSolverFinishedEvent,
+ ]
+
+ # --- private state ---
+ _captcha_solving: bool = PrivateAttr(default=False)
+ _captcha_solved_event: asyncio.Event = PrivateAttr(default_factory=asyncio.Event)
+ _captcha_info: dict[str, Any] = PrivateAttr(default_factory=dict)
+ _captcha_result: CaptchaResultType = PrivateAttr(default='unknown')
+ _captcha_duration_ms: int = PrivateAttr(default=0)
+ _cdp_handlers_registered: bool = PrivateAttr(default=False)
+
+ def model_post_init(self, __context: Any) -> None:
+ # Start in "not blocked" state so callers never wait when there is no captcha.
+ self._captcha_solved_event.set()
+
+ # ------------------------------------------------------------------
+ # Event handlers
+ # ------------------------------------------------------------------
+
+ async def on_BrowserConnectedEvent(self, event: BrowserConnectedEvent) -> None:
+ """Register CDP event handlers for BrowserUse captcha solver events."""
+ if self._cdp_handlers_registered:
+ self.logger.debug('CaptchaWatchdog: CDP handlers already registered, skipping')
+ return
+
+ cdp_client = self.browser_session.cdp_client
+
+ def _on_captcha_started(event_data: CDPCaptchaSolverStartedEvent, session_id: str | None) -> None:
+ try:
+ self._captcha_solving = True
+ self._captcha_result = 'unknown'
+ self._captcha_duration_ms = 0
+ self._captcha_info = {
+ 'vendor': event_data.get('vendor', 'unknown'),
+ 'url': event_data.get('url', ''),
+ 'targetId': event_data.get('targetId', ''),
+ 'startedAt': event_data.get('startedAt', 0),
+ }
+ # Block any waiter
+ self._captcha_solved_event.clear()
+
+ vendor = self._captcha_info['vendor']
+ url = self._captcha_info['url']
+ self.logger.info(f'🔒 Captcha solving started: {vendor} on {url}')
+
+ self.event_bus.dispatch(
+ CaptchaSolverStartedEvent(
+ target_id=event_data.get('targetId', ''),
+ vendor=vendor,
+ url=url,
+ started_at=event_data.get('startedAt', 0),
+ )
+ )
+ except Exception:
+ self.logger.exception('Error handling captchaSolverStarted CDP event')
+ # Ensure consistent state: unblock any waiter
+ self._captcha_solving = False
+ self._captcha_solved_event.set()
+
+ def _on_captcha_finished(event_data: CDPCaptchaSolverFinishedEvent, session_id: str | None) -> None:
+ try:
+ success = event_data.get('success', False)
+ self._captcha_solving = False
+ self._captcha_duration_ms = event_data.get('durationMs', 0)
+ self._captcha_result = 'success' if success else 'failed'
+
+ vendor = event_data.get('vendor', self._captcha_info.get('vendor', 'unknown'))
+ url = event_data.get('url', self._captcha_info.get('url', ''))
+ duration_s = self._captcha_duration_ms / 1000
+
+ self.logger.info(f'🔓 Captcha solving finished: {self._captcha_result} — {vendor} on {url} ({duration_s:.1f}s)')
+
+ # Unblock any waiter
+ self._captcha_solved_event.set()
+
+ self.event_bus.dispatch(
+ CaptchaSolverFinishedEvent(
+ target_id=event_data.get('targetId', ''),
+ vendor=vendor,
+ url=url,
+ duration_ms=self._captcha_duration_ms,
+ finished_at=event_data.get('finishedAt', 0),
+ success=success,
+ )
+ )
+ except Exception:
+ self.logger.exception('Error handling captchaSolverFinished CDP event')
+ # Ensure consistent state: unblock any waiter
+ self._captcha_solving = False
+ self._captcha_solved_event.set()
+
+ cdp_client.register.BrowserUse.captchaSolverStarted(_on_captcha_started)
+ cdp_client.register.BrowserUse.captchaSolverFinished(_on_captcha_finished)
+ self._cdp_handlers_registered = True
+ self.logger.debug('🔒 CaptchaWatchdog: registered CDP event handlers for BrowserUse captcha events')
+
+ async def on_BrowserStoppedEvent(self, event: BrowserStoppedEvent) -> None:
+ """Clear captcha state when the browser disconnects so nothing hangs."""
+ self._captcha_solving = False
+ self._captcha_result = 'unknown'
+ self._captcha_duration_ms = 0
+ self._captcha_info = {}
+ self._captcha_solved_event.set()
+ self._cdp_handlers_registered = False
+
+ # ------------------------------------------------------------------
+ # Public API
+ # ------------------------------------------------------------------
+
+ async def wait_if_captcha_solving(self, timeout: float | None = None) -> CaptchaWaitResult | None:
+ """Wait if a captcha is currently being solved.
+
+ Returns:
+ ``None`` if no captcha was in progress.
+ A ``CaptchaWaitResult`` with the outcome otherwise.
+ """
+ if not self._captcha_solving:
+ return None
+
+ if timeout is None:
+ timeout = _get_timeout('TIMEOUT_CaptchaSolverWait', 120.0)
+ assert timeout is not None
+ vendor = self._captcha_info.get('vendor', 'unknown')
+ url = self._captcha_info.get('url', '')
+ self.logger.info(f'⏳ Waiting for {vendor} captcha to be solved on {url} (timeout={timeout}s)...')
+
+ try:
+ await asyncio.wait_for(self._captcha_solved_event.wait(), timeout=timeout)
+ return CaptchaWaitResult(
+ waited=True,
+ vendor=vendor,
+ url=url,
+ duration_ms=self._captcha_duration_ms,
+ result=self._captcha_result,
+ )
+ except TimeoutError:
+ # Timed out — unblock and report
+ self._captcha_solving = False
+ self._captcha_solved_event.set()
+ self.logger.warning(f'⏰ Captcha wait timed out after {timeout}s for {vendor} on {url}')
+ return CaptchaWaitResult(
+ waited=True,
+ vendor=vendor,
+ url=url,
+ duration_ms=int(timeout * 1000),
+ result='timeout',
+ )
diff --git a/browser_use/browser/watchdogs/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py
index d2eb1c96e..8686a7e4b 100644
--- a/browser_use/browser/watchdogs/default_action_watchdog.py
+++ b/browser_use/browser/watchdogs/default_action_watchdog.py
@@ -518,6 +518,11 @@ class DefaultActionWatchdog(BaseWatchdog):
raise BrowserError(error_msg)
try:
+
+ def invalidate_dom_cache() -> None:
+ if self.browser_session._dom_watchdog:
+ self.browser_session._dom_watchdog.clear_cache()
+
# Convert direction and amount to pixels
# Positive pixels = scroll down, negative = scroll up
pixels = event.amount if event.direction == 'down' else -event.amount
@@ -547,6 +552,7 @@ class DefaultActionWatchdog(BaseWatchdog):
# Wait a bit for the scroll to settle and DOM to update
await asyncio.sleep(0.2)
+ invalidate_dom_cache()
return None
# Perform target-level scroll
@@ -554,6 +560,7 @@ class DefaultActionWatchdog(BaseWatchdog):
# Note: We don't clear cached state here - let multi_act handle DOM change detection
# by explicitly rebuilding and comparing when needed
+ invalidate_dom_cache()
# Log success
self.logger.debug(f'📜 Scrolled {event.direction} by {event.amount} pixels')
@@ -612,10 +619,48 @@ class DefaultActionWatchdog(BaseWatchdog):
// Simple containment-based clickability logic
- const isClickable = this === elementAtPoint ||
+ let isClickable = this === elementAtPoint ||
this.contains(elementAtPoint) ||
elementAtPoint.contains(this);
+ // Check label-input associations when containment check fails
+ if (!isClickable) {
+ const target = this;
+ const atPoint = elementAtPoint;
+
+ // Case 1: target is , atPoint is its associated (or child of that label)
+ if (target.tagName === 'INPUT' && target.id) {
+ const escapedId = CSS.escape(target.id);
+ const assocLabel = document.querySelector('label[for="' + escapedId + '"]');
+ if (assocLabel && (assocLabel === atPoint || assocLabel.contains(atPoint))) {
+ isClickable = true;
+ }
+ }
+
+ // Case 2: target is <input>, atPoint is inside a <label> ancestor that wraps the target
+ if (!isClickable && target.tagName === 'INPUT') {
+ let ancestor = atPoint;
+ for (let i = 0; i < 3 && ancestor; i++) {
+ if (ancestor.tagName === 'LABEL' && ancestor.contains(target)) {
+ isClickable = true;
+ break;
+ }
+ ancestor = ancestor.parentElement;
+ }
+ }
+
+ // Case 3: target is <label>, atPoint is the associated <input>
+ if (!isClickable && target.tagName === 'LABEL') {
+ if (target.htmlFor && atPoint.tagName === 'INPUT' && atPoint.id === target.htmlFor) {
+ isClickable = true;
+ }
+ // Also check if atPoint is an input inside the label
+ if (!isClickable && atPoint.tagName === 'INPUT' && target.contains(atPoint)) {
+ isClickable = true;
+ }
+ }
+ }
+
return {
targetInfo: getElementInfo(this),
elementAtPointInfo: getElementInfo(elementAtPoint),
@@ -686,6 +731,32 @@ class DefaultActionWatchdog(BaseWatchdog):
# Get element bounds
backend_node_id = element_node.backend_node_id
+ # For checkbox/radio: capture pre-click state to verify toggle worked
+ is_toggle_element = tag_name == 'input' and element_type in ('checkbox', 'radio')
+ pre_click_checked: bool | None = None
+ checkbox_object_id: str | None = None
+ if is_toggle_element and backend_node_id:
+ try:
+ resolve_res = await cdp_session.cdp_client.send.DOM.resolveNode(
+ params={'backendNodeId': backend_node_id}, session_id=session_id
+ )
+ obj_info = resolve_res.get('object', {})
+ checkbox_object_id = obj_info.get('objectId') if obj_info else None
+ if not checkbox_object_id:
+ raise Exception('Failed to resolve checkbox element objectId')
+ state_res = await cdp_session.cdp_client.send.Runtime.callFunctionOn(
+ params={
+ 'functionDeclaration': 'function() { return this.checked; }',
+ 'objectId': checkbox_object_id,
+ 'returnByValue': True,
+ },
+ session_id=session_id,
+ )
+ pre_click_checked = state_res.get('result', {}).get('value')
+ self.logger.debug(f'Checkbox pre-click state: checked={pre_click_checked}')
+ except Exception as e:
+ self.logger.debug(f'Could not capture pre-click checkbox state: {e}')
+
# Get viewport dimensions for visibility checks
layout_metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=session_id)
viewport_width = layout_metrics['layoutViewport']['clientWidth']
@@ -883,6 +954,43 @@ class DefaultActionWatchdog(BaseWatchdog):
self.logger.debug('🖱️ Clicked successfully using x,y coordinates')
+ # For checkbox/radio: verify state toggled, fall back to JS element.click() if not
+ if is_toggle_element and pre_click_checked is not None and checkbox_object_id:
+ try:
+ await asyncio.sleep(0.05)
+ state_res = await cdp_session.cdp_client.send.Runtime.callFunctionOn(
+ params={
+ 'functionDeclaration': 'function() { return this.checked; }',
+ 'objectId': checkbox_object_id,
+ 'returnByValue': True,
+ },
+ session_id=session_id,
+ )
+ post_click_checked = state_res.get('result', {}).get('value')
+ if post_click_checked == pre_click_checked:
+ # CDP mouse events didn't toggle the checkbox — try JS element.click()
+ self.logger.debug(
+ f'Checkbox state unchanged after CDP click (checked={pre_click_checked}), using JS fallback'
+ )
+ await cdp_session.cdp_client.send.Runtime.callFunctionOn(
+ params={'functionDeclaration': 'function() { this.click(); }', 'objectId': checkbox_object_id},
+ session_id=session_id,
+ )
+ await asyncio.sleep(0.05)
+ final_res = await cdp_session.cdp_client.send.Runtime.callFunctionOn(
+ params={
+ 'functionDeclaration': 'function() { return this.checked; }',
+ 'objectId': checkbox_object_id,
+ 'returnByValue': True,
+ },
+ session_id=session_id,
+ )
+ post_click_checked = final_res.get('result', {}).get('value')
+ self.logger.debug(f'Checkbox post-click state: checked={post_click_checked}')
+ return {'click_x': center_x, 'click_y': center_y, 'checked': post_click_checked}
+ except Exception as e:
+ self.logger.debug(f'Checkbox state verification failed (non-critical): {e}')
+
# Return coordinates as dict for metadata
return {'click_x': center_x, 'click_y': center_y}
@@ -1294,10 +1402,8 @@ class DefaultActionWatchdog(BaseWatchdog):
return True
else:
self.logger.debug(f'⚠️ JavaScript clear partially failed, field still contains: "{final_text}"')
- return False
else:
self.logger.debug(f'❌ JavaScript clear failed: {clear_info.get("error", "Unknown error")}')
- return False
except Exception as e:
self.logger.debug(f'JavaScript clear failed with exception: {e}')
diff --git a/browser_use/browser/watchdogs/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py
index 6cadcac8a..62daa0397 100644
--- a/browser_use/browser/watchdogs/dom_watchdog.py
+++ b/browser_use/browser/watchdogs/dom_watchdog.py
@@ -264,12 +264,16 @@ class DOMWatchdog(BaseWatchdog):
not_a_meaningful_website = page_url.lower().split(':', 1)[0] not in ('http', 'https')
# Check for pending network requests BEFORE waiting (so we can see what's loading)
+ # Timeout after 2s — on slow CI machines or heavy pages, this call can hang
+ # for 15s+ eating into the 30s BrowserStateRequestEvent budget.
pending_requests_before_wait = []
if not not_a_meaningful_website:
try:
- pending_requests_before_wait = await self._get_pending_network_requests()
+ pending_requests_before_wait = await asyncio.wait_for(self._get_pending_network_requests(), timeout=2.0)
if pending_requests_before_wait:
self.logger.debug(f'🔍 Found {len(pending_requests_before_wait)} pending requests before stability wait')
+ except TimeoutError:
+ self.logger.debug('Pending network request check timed out (2s), skipping')
except Exception as e:
self.logger.debug(f'Failed to get pending requests before wait: {e}')
pending_requests = pending_requests_before_wait
diff --git a/browser_use/browser/watchdogs/downloads_watchdog.py b/browser_use/browser/watchdogs/downloads_watchdog.py
index 4baf3a75d..ecdc39994 100644
--- a/browser_use/browser/watchdogs/downloads_watchdog.py
+++ b/browser_use/browser/watchdogs/downloads_watchdog.py
@@ -62,8 +62,8 @@ class DownloadsWatchdog(BaseWatchdog):
_download_cdp_session: Any = PrivateAttr(default=None) # Store CDP session reference
_cdp_event_tasks: set[asyncio.Task] = PrivateAttr(default_factory=set) # Track CDP event handler tasks
_cdp_downloads_info: dict[str, dict[str, Any]] = PrivateAttr(default_factory=dict) # Map guid -> info
- _use_js_fetch_for_local: bool = PrivateAttr(default=False) # Guard JS fetch path for local regular downloads
_session_pdf_urls: dict[str, str] = PrivateAttr(default_factory=dict) # URL -> path for PDFs downloaded this session
+ _initial_downloads_snapshot: set[str] = PrivateAttr(default_factory=set) # Files present when watchdog started
_network_monitored_targets: set[str] = PrivateAttr(default_factory=set) # Track targets with network monitoring enabled
_detected_downloads: set[str] = PrivateAttr(default_factory=set) # Track detected download URLs to avoid duplicates
_network_callback_registered: bool = PrivateAttr(default=False) # Track if global network callback is registered
@@ -120,6 +120,15 @@ class DownloadsWatchdog(BaseWatchdog):
expanded_path.mkdir(parents=True, exist_ok=True)
self.logger.debug(f'[DownloadsWatchdog] Ensured downloads directory exists: {expanded_path}')
+ # Capture initial files to detect new downloads reliably
+ if expanded_path.exists():
+ for f in expanded_path.iterdir():
+ if f.is_file() and not f.name.startswith('.'):
+ self._initial_downloads_snapshot.add(f.name)
+ self.logger.debug(
+ f'[DownloadsWatchdog] Captured initial downloads: {len(self._initial_downloads_snapshot)} files'
+ )
+
async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:
"""Monitor new tabs for downloads."""
# logger.info(f'[DownloadsWatchdog] TabCreatedEvent received for tab {event.target_id[-4:]}: {event.url}')
@@ -192,6 +201,7 @@ class DownloadsWatchdog(BaseWatchdog):
self._session_pdf_urls.clear()
self._network_monitored_targets.clear()
self._detected_downloads.clear()
+ self._initial_downloads_snapshot.clear()
self._network_callback_registered = False
async def on_NavigationCompleteEvent(self, event: NavigationCompleteEvent) -> None:
@@ -326,10 +336,31 @@ class DownloadsWatchdog(BaseWatchdog):
except (KeyError, AttributeError):
pass
else:
- # No local file path provided, local polling in _handle_cdp_download will handle it
- self.logger.debug(
- '[DownloadsWatchdog] No filePath in progress event (local); polling will handle detection'
- )
+ # No filePath provided - detect by comparing with initial snapshot
+ self.logger.debug('[DownloadsWatchdog] No filePath in progress event; detecting via filesystem')
+ downloads_path = self.browser_session.browser_profile.downloads_path
+ if downloads_path:
+ downloads_dir = Path(downloads_path).expanduser().resolve()
+ if downloads_dir.exists():
+ for f in downloads_dir.iterdir():
+ if (
+ f.is_file()
+ and not f.name.startswith('.')
+ and f.name not in self._initial_downloads_snapshot
+ ):
+ # Check file has content before processing
+ if f.stat().st_size > 4:
+ # Found a new file! Add to snapshot immediately to prevent duplicate detection
+ self._initial_downloads_snapshot.add(f.name)
+ self.logger.debug(f'[DownloadsWatchdog] Detected new download: {f.name}')
+ self._track_download(str(f))
+ # Mark as handled
+ try:
+ if guid in self._cdp_downloads_info:
+ self._cdp_downloads_info[guid]['handled'] = True
+ except (KeyError, AttributeError):
+ pass
+ break
else:
# Remote browser: do not touch local filesystem. Fallback to downloadPath+suggestedFilename
info = self._cdp_downloads_info.get(guid, {})
@@ -456,17 +487,24 @@ class DownloadsWatchdog(BaseWatchdog):
response = event.get('response', {})
url = response.get('url', '')
content_type = response.get('mimeType', '').lower()
- headers = response.get('headers', {})
+ headers = {
+ k.lower(): v for k, v in response.get('headers', {}).items()
+ } # Normalize for case-insensitive lookup
+ request_type = event.get('type', '')
# Skip non-HTTP URLs (data:, about:, chrome-extension:, etc.)
if not url.startswith('http'):
return
+ # Skip fetch/XHR - real browsers don't download PDFs from programmatic requests
+ if request_type in ('Fetch', 'XHR'):
+ return
+
# Check if it's a PDF
is_pdf = 'application/pdf' in content_type
# Check if it's marked as download via Content-Disposition header
- content_disposition = headers.get('content-disposition', '').lower()
+ content_disposition = str(headers.get('content-disposition', '')).lower()
is_download_attachment = 'attachment' in content_disposition
# Filter out image/video/audio files even if marked as attachment
@@ -518,6 +556,14 @@ class DownloadsWatchdog(BaseWatchdog):
if not (is_pdf or is_download_attachment):
return
+ # If already downloaded this URL and file still exists, do nothing
+ existing_path = self._session_pdf_urls.get(url)
+ if existing_path:
+ if os.path.exists(existing_path):
+ return
+ # Stale cache entry, allow re-download
+ del self._session_pdf_urls[url]
+
# Check if we've already processed this URL in this session
if url in self._detected_downloads:
self.logger.debug(f'[DownloadsWatchdog] Already detected download: {url[:80]}...')
@@ -543,6 +589,7 @@ class DownloadsWatchdog(BaseWatchdog):
# Trigger download asynchronously in background (don't block event handler)
async def download_in_background():
+ # Don't permanently block re-processing this URL if download fails
try:
download_path = await self.download_file_from_url(
url=url,
@@ -557,6 +604,9 @@ class DownloadsWatchdog(BaseWatchdog):
self.logger.warning(f'[DownloadsWatchdog] ⚠️ Failed to download: {url[:80]}...')
except Exception as e:
self.logger.error(f'[DownloadsWatchdog] Error downloading in background: {type(e).__name__}: {e}')
+ finally:
+ # Allow future detections of the same URL
+ self._detected_downloads.discard(url)
# Create background task
task = create_task_with_error_handling(
@@ -611,8 +661,13 @@ class DownloadsWatchdog(BaseWatchdog):
# Check if already downloaded in this session
if url in self._session_pdf_urls:
existing_path = self._session_pdf_urls[url]
- self.logger.debug(f'[DownloadsWatchdog] File already downloaded in session: {existing_path}')
- return existing_path
+ if os.path.exists(existing_path):
+ self.logger.debug(f'[DownloadsWatchdog] File already downloaded in session: {existing_path}')
+ return existing_path
+
+ # Stale cache entry: the file was removed/cleaned up after we cached it.
+ self.logger.debug(f'[DownloadsWatchdog] Cached download path no longer exists, re-downloading: {existing_path}')
+ del self._session_pdf_urls[url]
try:
# Get or create CDP session for this target
@@ -814,107 +869,6 @@ class DownloadsWatchdog(BaseWatchdog):
# We just need to wait for it to appear in the downloads directory
expected_path = downloads_dir / suggested_filename
- # Debug: List current directory contents
- self.logger.debug(f'[DownloadsWatchdog] Downloads directory: {downloads_dir}')
- if downloads_dir.exists():
- files_before = list(downloads_dir.iterdir())
- self.logger.debug(f'[DownloadsWatchdog] Files before download: {[f.name for f in files_before]}')
-
- # Try manual JavaScript fetch as a fallback for local browsers (disabled for regular local downloads)
- if self.browser_session.is_local and self._use_js_fetch_for_local:
- self.logger.debug(f'[DownloadsWatchdog] Attempting JS fetch fallback for {download_url}')
-
- unique_filename = None
- file_size = None
- download_result = None
- try:
- # Escape the URL for JavaScript
- import json
-
- escaped_url = json.dumps(download_url)
-
- # Get the proper session for the frame that initiated the download
- cdp_session = await self.browser_session.cdp_client_for_frame(event.get('frameId'))
- assert cdp_session
-
- result = await cdp_session.cdp_client.send.Runtime.evaluate(
- params={
- 'expression': f"""
- (async () => {{
- try {{
- const response = await fetch({escaped_url});
- if (!response.ok) {{
- throw new Error(`HTTP error! status: ${{response.status}}`);
- }}
- const blob = await response.blob();
- const arrayBuffer = await blob.arrayBuffer();
- const uint8Array = new Uint8Array(arrayBuffer);
- return {{
- data: Array.from(uint8Array),
- size: uint8Array.length,
- contentType: response.headers.get('content-type') || 'application/octet-stream'
- }};
- }} catch (error) {{
- throw new Error(`Fetch failed: ${{error.message}}`);
- }}
- }})()
- """,
- 'awaitPromise': True,
- 'returnByValue': True,
- },
- session_id=cdp_session.session_id,
- )
- download_result = result.get('result', {}).get('value')
-
- if download_result and download_result.get('data'):
- # Save the file
- file_data = bytes(download_result['data'])
- file_size = len(file_data)
-
- # Ensure unique filename
- unique_filename = await self._get_unique_filename(str(downloads_dir), suggested_filename)
- final_path = downloads_dir / unique_filename
-
- # Write the file
- import anyio
-
- async with await anyio.open_file(final_path, 'wb') as f:
- await f.write(file_data)
-
- self.logger.debug(f'[DownloadsWatchdog] ✅ Downloaded and saved file: {final_path} ({file_size} bytes)')
- expected_path = final_path
- # Emit download event immediately
- file_ext = expected_path.suffix.lower().lstrip('.')
- file_type = file_ext if file_ext else None
- self.event_bus.dispatch(
- FileDownloadedEvent(
- guid=guid,
- url=download_url,
- path=str(expected_path),
- file_name=unique_filename or expected_path.name,
- file_size=file_size or 0,
- file_type=file_type,
- mime_type=(download_result.get('contentType') if download_result else None),
- from_cache=False,
- auto_download=False,
- )
- )
- # Mark as handled to prevent duplicate dispatch from progress/polling paths
- try:
- if guid in self._cdp_downloads_info:
- self._cdp_downloads_info[guid]['handled'] = True
- except (KeyError, AttributeError):
- pass
- self.logger.debug(
- f'[DownloadsWatchdog] ✅ File download completed via CDP: {suggested_filename} ({file_size} bytes) saved to {expected_path}'
- )
- return
- else:
- self.logger.error('[DownloadsWatchdog] ❌ No data received from fetch')
-
- except Exception as fetch_error:
- self.logger.error(f'[DownloadsWatchdog] ❌ Failed to download file via fetch: {fetch_error}')
-
# For remote browsers, don't poll local filesystem; downloadProgress handler will emit the event
if not self.browser_session.is_local:
return
@@ -925,24 +879,23 @@ class DownloadsWatchdog(BaseWatchdog):
# Poll the downloads directory for new files
self.logger.debug(f'[DownloadsWatchdog] Checking if browser auto-download saved the file for us: {suggested_filename}')
- # Get initial list of files in downloads directory
- initial_files = set()
- if Path(downloads_dir).exists():
- for f in Path(downloads_dir).iterdir():
- if f.is_file() and not f.name.startswith('.'):
- initial_files.add(f.name)
-
# Poll for new files
max_wait = 20 # seconds
start_time = asyncio.get_event_loop().time()
- while asyncio.get_event_loop().time() - start_time < max_wait:
+ while asyncio.get_event_loop().time() - start_time < max_wait: # noqa: ASYNC110
await asyncio.sleep(5.0) # Check every 5 seconds
if Path(downloads_dir).exists():
for file_path in Path(downloads_dir).iterdir():
# Skip hidden files and files that were already there
- if file_path.is_file() and not file_path.name.startswith('.') and file_path.name not in initial_files:
+ if (
+ file_path.is_file()
+ and not file_path.name.startswith('.')
+ and file_path.name not in self._initial_downloads_snapshot
+ ):
+ # Add to snapshot immediately to prevent duplicate detection
+ self._initial_downloads_snapshot.add(file_path.name)
# Check if file has content (> 4 bytes)
try:
file_size = file_path.stat().st_size
@@ -971,13 +924,13 @@ class DownloadsWatchdog(BaseWatchdog):
file_type=file_type,
)
)
- # Mark as handled after dispatch
- try:
- if guid in self._cdp_downloads_info:
- self._cdp_downloads_info[guid]['handled'] = True
- except (KeyError, AttributeError):
- pass
- return
+ # Mark as handled after dispatch
+ try:
+ if guid in self._cdp_downloads_info:
+ self._cdp_downloads_info[guid]['handled'] = True
+ except (KeyError, AttributeError):
+ pass
+ return
except Exception as e:
self.logger.debug(f'[DownloadsWatchdog] Error checking file {file_path}: {e}')
diff --git a/browser_use/browser/watchdogs/har_recording_watchdog.py b/browser_use/browser/watchdogs/har_recording_watchdog.py
index f2a82ae37..cf6d6331c 100644
--- a/browser_use/browser/watchdogs/har_recording_watchdog.py
+++ b/browser_use/browser/watchdogs/har_recording_watchdog.py
@@ -665,7 +665,7 @@ class HarRecordingWatchdog(BaseWatchdog):
tmp_path = self._har_path.with_suffix(self._har_path.suffix + '.tmp')
# Write as bytes explicitly to avoid any text/binary mode confusion in different environments
- tmp_path.write_bytes(json.dumps(har_obj, indent=2).encode('utf-8'))
+ tmp_path.write_bytes(json.dumps(har_obj, indent=2, ensure_ascii=False).encode('utf-8'))
tmp_path.replace(self._har_path)
def _format_page_started_datetime(self, timestamp: float | None) -> str:
diff --git a/browser_use/browser/watchdogs/local_browser_watchdog.py b/browser_use/browser/watchdogs/local_browser_watchdog.py
index 296bda998..de56f7cb1 100644
--- a/browser_use/browser/watchdogs/local_browser_watchdog.py
+++ b/browser_use/browser/watchdogs/local_browser_watchdog.py
@@ -1,5 +1,7 @@
"""Local browser watchdog for managing browser subprocess lifecycle."""
+from __future__ import annotations
+
import asyncio
import os
import shutil
@@ -21,7 +23,7 @@ from browser_use.browser.watchdog_base import BaseWatchdog
from browser_use.observability import observe_debug
if TYPE_CHECKING:
- pass
+ from browser_use.browser.profile import BrowserChannel
class LocalBrowserWatchdog(BaseWatchdog):
@@ -124,8 +126,8 @@ class LocalBrowserWatchdog(BaseWatchdog):
self.logger.debug(f'[LocalBrowserWatchdog] 📦 Using custom local browser executable_path= {browser_path}')
else:
# self.logger.debug('[LocalBrowserWatchdog] 🔍 Looking for local browser binary path...')
- # Try fallback paths first (system browsers preferred)
- browser_path = self._find_installed_browser_path()
+ # Try fallback paths first (Playwright's Chromium preferred by default)
+ browser_path = self._find_installed_browser_path(channel=profile.channel)
if not browser_path:
self.logger.error(
'[LocalBrowserWatchdog] ⚠️ No local browser binary found, installing browser using playwright subprocess...'
@@ -215,14 +217,18 @@ class LocalBrowserWatchdog(BaseWatchdog):
raise RuntimeError(f'Failed to launch browser after {max_retries} attempts')
@staticmethod
- def _find_installed_browser_path() -> str | None:
+ def _find_installed_browser_path(channel: BrowserChannel | None = None) -> str | None:
"""Try to find browser executable from common fallback locations.
+ If a channel is specified, paths for that browser are searched first.
+ Falls back to all known browser paths if the channel-specific search fails.
+
Prioritizes:
- 1. System Chrome Stable
- 1. Playwright chromium
- 2. Other system native browsers (Chromium -> Chrome Canary/Dev -> Brave)
- 3. Playwright headless-shell fallback
+ 1. Channel-specific paths (if channel is set to a non-default value)
+ 2. Playwright bundled Chromium (when no channel or default channel specified)
+ 3. System Chrome stable
+ 4. Other system native browsers (Chromium -> Chrome Canary/Dev -> Brave -> Edge)
+ 5. Playwright headless-shell fallback
Returns:
Path to browser executable or None if not found
@@ -231,60 +237,90 @@ class LocalBrowserWatchdog(BaseWatchdog):
import platform
from pathlib import Path
+ from browser_use.browser.profile import BROWSERUSE_DEFAULT_CHANNEL, BrowserChannel
+
system = platform.system()
- patterns = []
# Get playwright browsers path from environment variable if set
playwright_path = os.environ.get('PLAYWRIGHT_BROWSERS_PATH')
+ # Build tagged pattern lists per OS: (browser_group, path)
+ # browser_group is used to match against the requested channel
if system == 'Darwin': # macOS
if not playwright_path:
playwright_path = '~/Library/Caches/ms-playwright'
- patterns = [
- '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
- f'{playwright_path}/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
- '/Applications/Chromium.app/Contents/MacOS/Chromium',
- '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
- '/Applications/Brave Browser.app/Contents/MacOS/Brave Browser',
- f'{playwright_path}/chromium_headless_shell-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
+ all_patterns = [
+ ('chrome', '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'),
+ ('chromium', f'{playwright_path}/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium'),
+ ('chromium', '/Applications/Chromium.app/Contents/MacOS/Chromium'),
+ ('chrome-canary', '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'),
+ ('brave', '/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'),
+ ('msedge', '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'),
+ ('chromium', f'{playwright_path}/chromium_headless_shell-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium'),
]
elif system == 'Linux':
if not playwright_path:
playwright_path = '~/.cache/ms-playwright'
- patterns = [
- '/usr/bin/google-chrome-stable',
- '/usr/bin/google-chrome',
- '/usr/local/bin/google-chrome',
- f'{playwright_path}/chromium-*/chrome-linux*/chrome',
- '/usr/bin/chromium',
- '/usr/bin/chromium-browser',
- '/usr/local/bin/chromium',
- '/snap/bin/chromium',
- '/usr/bin/google-chrome-beta',
- '/usr/bin/google-chrome-dev',
- '/usr/bin/brave-browser',
- f'{playwright_path}/chromium_headless_shell-*/chrome-linux*/chrome',
+ all_patterns = [
+ ('chrome', '/usr/bin/google-chrome-stable'),
+ ('chrome', '/usr/bin/google-chrome'),
+ ('chrome', '/usr/local/bin/google-chrome'),
+ ('chromium', f'{playwright_path}/chromium-*/chrome-linux*/chrome'),
+ ('chromium', '/usr/bin/chromium'),
+ ('chromium', '/usr/bin/chromium-browser'),
+ ('chromium', '/usr/local/bin/chromium'),
+ ('chromium', '/snap/bin/chromium'),
+ ('chrome-beta', '/usr/bin/google-chrome-beta'),
+ ('chrome-dev', '/usr/bin/google-chrome-dev'),
+ ('brave', '/usr/bin/brave-browser'),
+ ('msedge', '/usr/bin/microsoft-edge-stable'),
+ ('msedge', '/usr/bin/microsoft-edge'),
+ ('chromium', f'{playwright_path}/chromium_headless_shell-*/chrome-linux*/chrome'),
]
elif system == 'Windows':
if not playwright_path:
playwright_path = r'%LOCALAPPDATA%\ms-playwright'
- patterns = [
- r'C:\Program Files\Google\Chrome\Application\chrome.exe',
- r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe',
- r'%LOCALAPPDATA%\Google\Chrome\Application\chrome.exe',
- r'%PROGRAMFILES%\Google\Chrome\Application\chrome.exe',
- r'%PROGRAMFILES(X86)%\Google\Chrome\Application\chrome.exe',
- f'{playwright_path}\\chromium-*\\chrome-win\\chrome.exe',
- r'C:\Program Files\Chromium\Application\chrome.exe',
- r'C:\Program Files (x86)\Chromium\Application\chrome.exe',
- r'%LOCALAPPDATA%\Chromium\Application\chrome.exe',
- r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe',
- r'C:\Program Files (x86)\BraveSoftware\Brave-Browser\Application\brave.exe',
- r'C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe',
- r'C:\Program Files\Microsoft\Edge\Application\msedge.exe',
- r'%LOCALAPPDATA%\Microsoft\Edge\Application\msedge.exe',
- f'{playwright_path}\\chromium_headless_shell-*\\chrome-win\\chrome.exe',
+ all_patterns = [
+ ('chrome', r'C:\Program Files\Google\Chrome\Application\chrome.exe'),
+ ('chrome', r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'),
+ ('chrome', r'%LOCALAPPDATA%\Google\Chrome\Application\chrome.exe'),
+ ('chrome', r'%PROGRAMFILES%\Google\Chrome\Application\chrome.exe'),
+ ('chrome', r'%PROGRAMFILES(X86)%\Google\Chrome\Application\chrome.exe'),
+ ('chromium', f'{playwright_path}\\chromium-*\\chrome-win\\chrome.exe'),
+ ('chromium', r'C:\Program Files\Chromium\Application\chrome.exe'),
+ ('chromium', r'C:\Program Files (x86)\Chromium\Application\chrome.exe'),
+ ('chromium', r'%LOCALAPPDATA%\Chromium\Application\chrome.exe'),
+ ('brave', r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe'),
+ ('brave', r'C:\Program Files (x86)\BraveSoftware\Brave-Browser\Application\brave.exe'),
+ ('msedge', r'C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe'),
+ ('msedge', r'C:\Program Files\Microsoft\Edge\Application\msedge.exe'),
+ ('msedge', r'%LOCALAPPDATA%\Microsoft\Edge\Application\msedge.exe'),
+ ('chromium', f'{playwright_path}\\chromium_headless_shell-*\\chrome-win\\chrome.exe'),
]
+ else:
+ all_patterns = []
+
+ # Map channel enum values to browser group tags
+ _channel_to_group: dict[BrowserChannel, str] = {
+ BrowserChannel.CHROME: 'chrome',
+ BrowserChannel.CHROME_BETA: 'chrome-beta',
+ BrowserChannel.CHROME_DEV: 'chrome-dev',
+ BrowserChannel.CHROME_CANARY: 'chrome-canary',
+ BrowserChannel.CHROMIUM: 'chromium',
+ BrowserChannel.MSEDGE: 'msedge',
+ BrowserChannel.MSEDGE_BETA: 'msedge',
+ BrowserChannel.MSEDGE_DEV: 'msedge',
+ BrowserChannel.MSEDGE_CANARY: 'msedge',
+ }
+
+ # Prioritize the target browser group, then fall back to the rest.
+ if channel and channel != BROWSERUSE_DEFAULT_CHANNEL and channel in _channel_to_group:
+ target_group = _channel_to_group[channel]
+ else:
+ target_group = _channel_to_group[BROWSERUSE_DEFAULT_CHANNEL]
+ prioritized = [p for g, p in all_patterns if g == target_group]
+ rest = [p for g, p in all_patterns if g != target_group]
+ patterns = prioritized + rest
for pattern in patterns:
# Expand user home directory
@@ -326,7 +362,7 @@ class LocalBrowserWatchdog(BaseWatchdog):
import platform
# Build command - only use --with-deps on Linux (it fails on Windows/macOS)
- cmd = ['uvx', 'playwright', 'install', 'chrome']
+ cmd = ['uvx', 'playwright', 'install', 'chromium']
if platform.system() == 'Linux':
cmd.append('--with-deps')
@@ -344,7 +380,7 @@ class LocalBrowserWatchdog(BaseWatchdog):
if browser_path:
return browser_path
self.logger.error(f'[LocalBrowserWatchdog] ❌ Playwright local browser installation error: \n{stdout}\n{stderr}')
- raise RuntimeError('No local browser path found after: uvx playwright install chrome')
+ raise RuntimeError('No local browser path found after: uvx playwright install chromium')
except TimeoutError:
# Kill the subprocess if it times out
process.kill()
diff --git a/browser_use/browser/watchdogs/screenshot_watchdog.py b/browser_use/browser/watchdogs/screenshot_watchdog.py
index e41db4a18..713444ece 100644
--- a/browser_use/browser/watchdogs/screenshot_watchdog.py
+++ b/browser_use/browser/watchdogs/screenshot_watchdog.py
@@ -52,8 +52,26 @@ class ScreenshotWatchdog(BaseWatchdog):
cdp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=True)
+ # Remove highlights BEFORE taking the screenshot so they don't appear in the image.
+ # Done here (not in finally) so CancelledError is never swallowed — any await in a
+ # finally block can suppress external task cancellation.
+ # remove_highlights() has its own asyncio.timeout(3.0) internally so it won't block.
+ try:
+ await self.browser_session.remove_highlights()
+ except Exception:
+ pass
+
# Prepare screenshot parameters
- params = CaptureScreenshotParameters(format='png', captureBeyondViewport=False)
+ params_dict: dict[str, Any] = {'format': 'png', 'captureBeyondViewport': event.full_page}
+ if event.clip:
+ params_dict['clip'] = {
+ 'x': event.clip['x'],
+ 'y': event.clip['y'],
+ 'width': event.clip['width'],
+ 'height': event.clip['height'],
+ 'scale': 1,
+ }
+ params = CaptureScreenshotParameters(**params_dict)
# Take screenshot using CDP
self.logger.debug(f'[ScreenshotWatchdog] Taking screenshot with params: {params}')
@@ -68,9 +86,3 @@ class ScreenshotWatchdog(BaseWatchdog):
except Exception as e:
self.logger.error(f'[ScreenshotWatchdog] Screenshot failed: {e}')
raise
- finally:
- # Try to remove highlights even on failure
- try:
- await self.browser_session.remove_highlights()
- except Exception:
- pass
diff --git a/browser_use/browser/watchdogs/security_watchdog.py b/browser_use/browser/watchdogs/security_watchdog.py
index f95a26260..176f2c5a5 100644
--- a/browser_use/browser/watchdogs/security_watchdog.py
+++ b/browser_use/browser/watchdogs/security_watchdog.py
@@ -68,7 +68,6 @@ class SecurityWatchdog(BaseWatchdog):
await session.cdp_client.send.Page.navigate(params={'url': 'about:blank'}, session_id=session.session_id)
self.logger.info(f'⛔️ Navigated to about:blank after blocked URL: {event.url}')
except Exception as e:
- pass
self.logger.error(f'⛔️ Failed to navigate to about:blank: {type(e).__name__} {e}')
async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:
diff --git a/browser_use/browser/watchdogs/storage_state_watchdog.py b/browser_use/browser/watchdogs/storage_state_watchdog.py
index bf400e665..736235129 100644
--- a/browser_use/browser/watchdogs/storage_state_watchdog.py
+++ b/browser_use/browser/watchdogs/storage_state_watchdog.py
@@ -202,7 +202,7 @@ class StorageStateWatchdog(BaseWatchdog):
# Write atomically
temp_path = json_path.with_suffix('.json.tmp')
- temp_path.write_text(json.dumps(merged_state, indent=4))
+ temp_path.write_text(json.dumps(merged_state, indent=4, ensure_ascii=False), encoding='utf-8')
# Backup existing file
if json_path.exists():
@@ -249,25 +249,60 @@ class StorageStateWatchdog(BaseWatchdog):
# Apply cookies if present
if 'cookies' in storage and storage['cookies']:
- await self.browser_session._cdp_set_cookies(storage['cookies'])
+ # Playwright exports session cookies with expires=0/-1. CDP treats expires=0 as expired.
+ # Normalize session cookies by omitting expires
+ normalized_cookies: list[Cookie] = []
+ for cookie in storage['cookies']:
+ if not isinstance(cookie, dict):
+ normalized_cookies.append(cookie) # type: ignore[arg-type]
+ continue
+ c = dict(cookie)
+ expires = c.get('expires')
+ if expires in (0, 0.0, -1, -1.0):
+ c.pop('expires', None)
+ normalized_cookies.append(Cookie(**c))
+
+ await self.browser_session._cdp_set_cookies(normalized_cookies)
self._last_cookie_state = storage['cookies'].copy()
self.logger.debug(f'[StorageStateWatchdog] Added {len(storage["cookies"])} cookies from storage state')
# Apply origins (localStorage/sessionStorage) if present
if 'origins' in storage and storage['origins']:
for origin in storage['origins']:
- if 'localStorage' in origin:
+ origin_value = origin.get('origin')
+ if not origin_value:
+ continue
+
+ # Scope storage restoration to its origin to avoid cross-site pollution.
+ if origin.get('localStorage'):
+ lines = []
for item in origin['localStorage']:
- script = f"""
- window.localStorage.setItem({json.dumps(item['name'])}, {json.dumps(item['value'])});
- """
- await self.browser_session._cdp_add_init_script(script)
- if 'sessionStorage' in origin:
+ lines.append(f'window.localStorage.setItem({json.dumps(item["name"])}, {json.dumps(item["value"])});')
+ script = (
+ '(function(){\n'
+ f' if (window.location && window.location.origin !== {json.dumps(origin_value)}) return;\n'
+ ' try {\n'
+ f' {" ".join(lines)}\n'
+ ' } catch (e) {}\n'
+ '})();'
+ )
+ await self.browser_session._cdp_add_init_script(script)
+
+ if origin.get('sessionStorage'):
+ lines = []
for item in origin['sessionStorage']:
- script = f"""
- window.sessionStorage.setItem({json.dumps(item['name'])}, {json.dumps(item['value'])});
- """
- await self.browser_session._cdp_add_init_script(script)
+ lines.append(
+ f'window.sessionStorage.setItem({json.dumps(item["name"])}, {json.dumps(item["value"])});'
+ )
+ script = (
+ '(function(){\n'
+ f' if (window.location && window.location.origin !== {json.dumps(origin_value)}) return;\n'
+ ' try {\n'
+ f' {" ".join(lines)}\n'
+ ' } catch (e) {}\n'
+ '})();'
+ )
+ await self.browser_session._cdp_add_init_script(script)
self.logger.debug(
f'[StorageStateWatchdog] Applied localStorage/sessionStorage from {len(storage["origins"])} origins'
)
diff --git a/browser_use/cli.py b/browser_use/cli.py
index 5b32f1779..631239146 100644
--- a/browser_use/cli.py
+++ b/browser_use/cli.py
@@ -129,7 +129,7 @@ if '--template' in sys.argv:
click.echo(' uv pip install browser-use')
click.echo(' 2. Set up your API key in .env file or environment:')
click.echo(' BROWSER_USE_API_KEY=your-key')
- click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key)')
+ click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=cli)')
click.echo(' 3. Run your script:')
click.echo(f' python {output_path.name}')
except Exception as e:
@@ -178,9 +178,12 @@ except ImportError:
try:
import readline
+ _add_history = getattr(readline, 'add_history', None)
+ if _add_history is None:
+ raise ImportError('readline missing add_history')
READLINE_AVAILABLE = True
except ImportError:
- # readline not available on Windows by default
+ _add_history = None
READLINE_AVAILABLE = False
@@ -294,8 +297,8 @@ def save_user_config(config: dict[str, Any]) -> None:
# Save to separate history file
history_file = CONFIG.BROWSER_USE_CONFIG_DIR / 'command_history.json'
- with open(history_file, 'w') as f:
- json.dump(history, f, indent=2)
+ with open(history_file, 'w', encoding='utf-8') as f:
+ json.dump(history, f, indent=2, ensure_ascii=False)
def update_config_with_click_args(config: dict[str, Any], ctx: click.Context) -> dict[str, Any]:
@@ -341,12 +344,11 @@ def update_config_with_click_args(config: dict[str, Any], ctx: click.Context) ->
def setup_readline_history(history: list[str]) -> None:
"""Set up readline with command history."""
- if not READLINE_AVAILABLE:
+ if not _add_history:
return
- # Add history items to readline
for item in history:
- readline.add_history(item)
+ _add_history(item)
def get_llm(config: dict[str, Any]):
@@ -694,8 +696,6 @@ class BrowserUseApp(App):
'trafilatura.htmlprocessing',
'trafilatura',
'groq',
- 'portalocker',
- 'portalocker.utils',
]:
third_party = logging.getLogger(logger_name)
third_party.setLevel(logging.ERROR)
@@ -720,9 +720,9 @@ class BrowserUseApp(App):
# Step 2: Set up input history
logger.debug('Setting up readline history...')
try:
- if READLINE_AVAILABLE and self.task_history:
+ if READLINE_AVAILABLE and self.task_history and _add_history:
for item in self.task_history:
- readline.add_history(item)
+ _add_history(item)
logger.debug(f'Added {len(self.task_history)} items to readline history')
else:
logger.debug('No readline history to set up')
@@ -1129,7 +1129,7 @@ class BrowserUseApp(App):
# Exit the application
self.exit()
- print('\nTry running tasks on our cloud: https://browser-use.com')
+ print('\nTry running tasks on our cloud: https://browser-use.com?utm_source=oss&utm_medium=cli')
def compose(self) -> ComposeResult:
"""Create the UI layout."""
@@ -1144,7 +1144,11 @@ class BrowserUseApp(App):
with Container(id='links-panel'):
with HorizontalGroup(classes='link-row'):
yield Static('Run at scale on cloud: [blink]☁️[/] ', markup=True, classes='link-label')
- yield Link('https://browser-use.com', url='https://browser-use.com', classes='link-white link-url')
+ yield Link(
+ 'https://browser-use.com',
+ url='https://browser-use.com?utm_source=oss&utm_medium=cli',
+ classes='link-white link-url',
+ )
yield Static('') # Empty line
@@ -2224,7 +2228,7 @@ def _run_template_generation(template: str, output: str | None, force: bool):
click.echo(' uv pip install browser-use')
click.echo(' 2. Set up your API key in .env file or environment:')
click.echo(' BROWSER_USE_API_KEY=your-key')
- click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key)')
+ click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=cli)')
click.echo(' 3. Run your script:')
click.echo(f' python {output_path.name}')
else:
@@ -2353,7 +2357,7 @@ def init(
click.echo(' uv pip install browser-use')
click.echo(' 2. Set up your API key in .env file or environment:')
click.echo(' BROWSER_USE_API_KEY=your-key')
- click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key)')
+ click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=cli)')
click.echo(' 3. Run your script:')
click.echo(f' python {output_path.name}')
else:
diff --git a/browser_use/code_use/README.md b/browser_use/code_use/README.md
deleted file mode 100644
index 862c3fbe8..000000000
--- a/browser_use/code_use/README.md
+++ /dev/null
@@ -1,84 +0,0 @@
-# Code-Use Mode
-
-Code-Use Mode is a Notebook-like code execution system for browser automation. Instead of the agent choosing from a predefined set of actions, the LLM writes Python code that gets executed in a persistent namespace with all browser control functions available.
-
-## Problem Solved
-
-**Code-Use Mode solves this** by giving the agent a Python execution environment where it can:
-- Store extracted data in variables
-- Loop through pages programmatically
-- Combine results from multiple extractions
-- Process and filter data before saving
-- Use conditional logic to decide what to do next
-- Output more tokens than the LLM writes
-
-### Namespace
-The namespace is initialized with:
-
-**Browser Control Functions:**
-- `navigate(url)` - Navigate to a URL
-- `click(index)` - Click an element
-- `input(index, text)` - Type text
-- `scroll(down, pages)` - Scroll the page
-- `upload_file(path)` - Upload a file
-- `evaluate(code, variables={})` - Execute JavaScript
-- `done(text, success, files_to_display=[])` - Mark task complete
-
-**Custom evaluate() Function:**
-```python
-# Returns values directly, not wrapped in ActionResult
-result = await evaluate('''
-(function(){
- return Array.from(document.querySelectorAll('.product')).map(p => ({
- name: p.querySelector('.name').textContent,
- price: p.querySelector('.price').textContent
- }))
-})()
-''')
-# result is now a list of dicts, ready to use!
-```
-
-**Utilities:**
-The agent can just utilize packages like `requests`, `pandas`, `numpy`, `matplotlib`, `BeautifulSoup`, `tabulate`, `csv`, ...
-
-The agent will write code like:
-
-### Step 1: Navigate
-```python
-# Navigate to first page
-await navigate(url='https://example.com/products?page=1')
-```
-### Step 2 analyse our DOM state and write code to extract the data we need.
-
-```js extract_products
-(function(){
- return Array.from(document.querySelectorAll('.product')).map(p => ({
- name: p.querySelector('.name')?.textContent || '',
- price: p.querySelector('.price')?.textContent || '',
- rating: p.querySelector('.rating')?.textContent || ''
- }))
-})()
-```
-
-```python
-# Extract products using JavaScript
-all_products = []
-for page in range(1, 6):
- if page > 1:
- await navigate(url=f'https://example.com/products?page={page}')
-
- products = await evaluate(extract_products)
- all_products.extend(products)
- print(f'Page {page}: Found {len(products)} products')
-```
-
-### Step 3: Analyse output & save the data to a file
-```python
-# Save to file
-import json
-with open('products.json', 'w') as f:
- json.dump(all_products, f, indent=2)
-
-print(f'Total: {len(all_products)} products saved to products.json')
-await done(text='Extracted all products', success=True, files_to_display=['products.json'])
-```
diff --git a/browser_use/code_use/__init__.py b/browser_use/code_use/__init__.py
deleted file mode 100644
index 9f304b30f..000000000
--- a/browser_use/code_use/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-"""Code-use mode - Jupyter notebook-like code execution for browser automation."""
-
-from browser_use.code_use.namespace import create_namespace
-from browser_use.code_use.notebook_export import export_to_ipynb, session_to_python_script
-from browser_use.code_use.service import CodeAgent
-from browser_use.code_use.views import CodeCell, ExecutionStatus, NotebookSession
-
-__all__ = [
- 'CodeAgent',
- 'create_namespace',
- 'export_to_ipynb',
- 'session_to_python_script',
- 'CodeCell',
- 'ExecutionStatus',
- 'NotebookSession',
-]
diff --git a/browser_use/code_use/formatting.py b/browser_use/code_use/formatting.py
deleted file mode 100644
index d5e50d919..000000000
--- a/browser_use/code_use/formatting.py
+++ /dev/null
@@ -1,190 +0,0 @@
-"""Browser state formatting helpers for code-use agent."""
-
-import logging
-from typing import Any
-
-from browser_use.browser.session import BrowserSession
-from browser_use.browser.views import BrowserStateSummary
-
-logger = logging.getLogger(__name__)
-
-
-async def format_browser_state_for_llm(
- state: BrowserStateSummary,
- namespace: dict[str, Any],
- browser_session: BrowserSession,
-) -> str:
- """
- Format browser state summary for LLM consumption in code-use mode.
-
- Args:
- state: Browser state summary from browser_session.get_browser_state_summary()
- namespace: The code execution namespace (for showing available variables)
- browser_session: Browser session for additional checks (jQuery, etc.)
-
- Returns:
- Formatted browser state text for LLM
- """
- assert state.dom_state is not None
- dom_state = state.dom_state
-
- # Use eval_representation (compact serializer for code agents)
- dom_html = dom_state.eval_representation()
- if dom_html == '':
- dom_html = 'Empty DOM tree (you might have to wait for the page to load)'
-
- # Format with URL and title header
- lines = ['## Browser State']
- lines.append(f'**URL:** {state.url}')
- lines.append(f'**Title:** {state.title}')
- lines.append('')
-
- # Add tabs info if multiple tabs exist
- if len(state.tabs) > 1:
- lines.append('**Tabs:**')
- current_target_candidates = []
- # Find tabs that match current URL and title
- for tab in state.tabs:
- if tab.url == state.url and tab.title == state.title:
- current_target_candidates.append(tab.target_id)
- current_target_id = current_target_candidates[0] if len(current_target_candidates) == 1 else None
-
- for tab in state.tabs:
- is_current = ' (current)' if tab.target_id == current_target_id else ''
- lines.append(f' - Tab {tab.target_id[-4:]}: {tab.url} - {tab.title[:30]}{is_current}')
- lines.append('')
-
- # Add page scroll info if available
- if state.page_info:
- pi = state.page_info
- pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0
- pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
- total_pages = pi.page_height / pi.viewport_height if pi.viewport_height > 0 else 0
-
- scroll_info = f'**Page:** {pages_above:.1f} pages above, {pages_below:.1f} pages below'
- if total_pages > 1.2: # Only mention total if significantly > 1 page
- scroll_info += f', {total_pages:.1f} total pages'
- lines.append(scroll_info)
- lines.append('')
-
- # Add network loading info if there are pending requests
- if state.pending_network_requests:
- # Remove duplicates by URL (keep first occurrence with earliest duration)
- seen_urls = set()
- unique_requests = []
- for req in state.pending_network_requests:
- if req.url not in seen_urls:
- seen_urls.add(req.url)
- unique_requests.append(req)
-
- lines.append(f'**⏳ Loading:** {len(unique_requests)} network requests still loading')
- # Show up to 20 unique requests with truncated URLs (30 chars max)
- for req in unique_requests[:20]:
- duration_sec = req.loading_duration_ms / 1000
- url_display = req.url if len(req.url) <= 30 else req.url[:27] + '...'
- logger.info(f' - [{duration_sec:.1f}s] {url_display}')
- lines.append(f' - [{duration_sec:.1f}s] {url_display}')
- if len(unique_requests) > 20:
- lines.append(f' - ... and {len(unique_requests) - 20} more')
- lines.append('**Tip:** Content may still be loading. Consider waiting with `await asyncio.sleep(1)` if data is missing.')
- lines.append('')
-
- # Add available variables and functions BEFORE DOM structure
- # Show useful utilities (json, asyncio, etc.) and user-defined vars, but hide system objects
- skip_vars = {
- 'browser',
- 'file_system', # System objects
- 'np',
- 'pd',
- 'plt',
- 'numpy',
- 'pandas',
- 'matplotlib',
- 'requests',
- 'BeautifulSoup',
- 'bs4',
- 'pypdf',
- 'PdfReader',
- 'wait',
- }
-
- # Highlight code block variables separately from regular variables
- code_block_vars = []
- regular_vars = []
- tracked_code_blocks = namespace.get('_code_block_vars', set())
- for name in namespace.keys():
- # Skip private vars and system objects/actions
- if not name.startswith('_') and name not in skip_vars:
- if name in tracked_code_blocks:
- code_block_vars.append(name)
- else:
- regular_vars.append(name)
-
- # Sort for consistent display
- available_vars_sorted = sorted(regular_vars)
- code_block_vars_sorted = sorted(code_block_vars)
-
- # Build available line with code blocks and variables
- parts = []
- if code_block_vars_sorted:
- # Show detailed info for code block variables
- code_block_details = []
- for var_name in code_block_vars_sorted:
- value = namespace.get(var_name)
- if value is not None:
- type_name = type(value).__name__
- value_str = str(value) if not isinstance(value, str) else value
-
- # Check if it's a function (starts with "(function" or "(async function")
- is_function = value_str.strip().startswith('(function') or value_str.strip().startswith('(async function')
-
- if is_function:
- # For functions, only show name and type
- detail = f'{var_name}({type_name})'
- else:
- # For non-functions, show first and last 20 chars
- first_20 = value_str[:20].replace('\n', '\\n').replace('\t', '\\t')
- last_20 = value_str[-20:].replace('\n', '\\n').replace('\t', '\\t') if len(value_str) > 20 else ''
-
- if last_20 and first_20 != last_20:
- detail = f'{var_name}({type_name}): "{first_20}...{last_20}"'
- else:
- detail = f'{var_name}({type_name}): "{first_20}"'
- code_block_details.append(detail)
-
- parts.append(f'**Code block variables:** {" | ".join(code_block_details)}')
- if available_vars_sorted:
- parts.append(f'**Variables:** {", ".join(available_vars_sorted)}')
-
- lines.append(f'**Available:** {" | ".join(parts)}')
- lines.append('')
-
- # Add DOM structure
- lines.append('**DOM Structure:**')
-
- # Add scroll position hints for DOM
- if state.page_info:
- pi = state.page_info
- pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0
- pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
-
- if pages_above > 0:
- dom_html = f'... {pages_above:.1f} pages above \n{dom_html}'
- else:
- dom_html = '[Start of page]\n' + dom_html
-
- if pages_below <= 0:
- dom_html += '\n[End of page]'
-
- # Truncate DOM if too long and notify LLM
- max_dom_length = 60000
- if len(dom_html) > max_dom_length:
- lines.append(dom_html[:max_dom_length])
- lines.append(
- f'\n[DOM truncated after {max_dom_length} characters. Full page contains {len(dom_html)} characters total. Use evaluate to explore more.]'
- )
- else:
- lines.append(dom_html)
-
- browser_state_text = '\n'.join(lines)
- return browser_state_text
diff --git a/browser_use/code_use/namespace.py b/browser_use/code_use/namespace.py
deleted file mode 100644
index 5f1f4a260..000000000
--- a/browser_use/code_use/namespace.py
+++ /dev/null
@@ -1,665 +0,0 @@
-"""Namespace initialization for code-use mode.
-
-This module creates a namespace with all browser tools available as functions,
-similar to a Jupyter notebook environment.
-"""
-
-import asyncio
-import csv
-import datetime
-import json
-import logging
-import re
-from pathlib import Path
-from typing import Any
-
-import requests
-
-from browser_use.browser import BrowserSession
-from browser_use.filesystem.file_system import FileSystem
-from browser_use.llm.base import BaseChatModel
-from browser_use.tools.service import CodeAgentTools, Tools
-
-logger = logging.getLogger(__name__)
-
-# Try to import optional data science libraries
-try:
- import numpy as np # type: ignore
-
- NUMPY_AVAILABLE = True
-except ImportError:
- NUMPY_AVAILABLE = False
-
-try:
- import pandas as pd # type: ignore
-
- PANDAS_AVAILABLE = True
-except ImportError:
- PANDAS_AVAILABLE = False
-
-try:
- import matplotlib.pyplot as plt # type: ignore
-
- MATPLOTLIB_AVAILABLE = True
-except ImportError:
- MATPLOTLIB_AVAILABLE = False
-
-try:
- from bs4 import BeautifulSoup # type: ignore
-
- BS4_AVAILABLE = True
-except ImportError:
- BS4_AVAILABLE = False
-
-try:
- from pypdf import PdfReader # type: ignore
-
- PYPDF_AVAILABLE = True
-except ImportError:
- PYPDF_AVAILABLE = False
-
-try:
- from tabulate import tabulate # type: ignore
-
- TABULATE_AVAILABLE = True
-except ImportError:
- TABULATE_AVAILABLE = False
-
-
-def _strip_js_comments(js_code: str) -> str:
- """
- Remove JavaScript comments before CDP evaluation.
- CDP's Runtime.evaluate doesn't handle comments in all contexts.
-
- Args:
- js_code: JavaScript code potentially containing comments
-
- Returns:
- JavaScript code with comments stripped
- """
- # Remove multi-line comments (/* ... */)
- js_code = re.sub(r'/\*.*?\*/', '', js_code, flags=re.DOTALL)
-
- # Remove single-line comments - only lines that START with // (after whitespace)
- # This avoids breaking XPath strings, URLs, regex patterns, etc.
- js_code = re.sub(r'^\s*//.*$', '', js_code, flags=re.MULTILINE)
-
- return js_code
-
-
-class EvaluateError(Exception):
- """Special exception raised by evaluate() to stop Python execution immediately."""
-
- pass
-
-
-async def validate_task_completion(
- task: str,
- output: str | None,
- llm: BaseChatModel,
-) -> tuple[bool, str]:
- """
- Validate if task is truly complete by asking LLM without system prompt or history.
-
- Args:
- task: The original task description
- output: The output from the done() call
- llm: The LLM to use for validation
-
- Returns:
- Tuple of (is_complete, reasoning)
- """
- from browser_use.llm.messages import UserMessage
-
- # Build validation prompt
- validation_prompt = f"""You are a task completion validator. Analyze if the agent has truly completed the user's task.
-
-**Original Task:**
-{task}
-
-**Agent's Output:**
-{output[:100000] if output else '(No output provided)'}
-
-**Your Task:**
-Determine if the agent has successfully completed the user's task. Consider:
-1. Has the agent delivered what the user requested?
-2. If data extraction was requested, is there actual data?
-3. If the task is impossible (e.g., localhost website, login required but no credentials), is it truly impossible?
-4. Could the agent continue and make meaningful progress?
-
-**Response Format:**
-Reasoning: [Your analysis of whether the task is complete]
-Verdict: [YES or NO]
-
-YES = Task is complete OR truly impossible to complete
-NO = Agent should continue working"""
-
- try:
- # Call LLM with just the validation prompt (no system prompt, no history)
- response = await llm.ainvoke([UserMessage(content=validation_prompt)])
- response_text = response.completion
-
- # Parse the response
- reasoning = ''
- verdict = 'NO'
-
- # Extract reasoning and verdict
- lines = response_text.split('\n')
- for line in lines:
- if line.strip().lower().startswith('reasoning:'):
- reasoning = line.split(':', 1)[1].strip()
- elif line.strip().lower().startswith('verdict:'):
- verdict_text = line.split(':', 1)[1].strip().upper()
- if 'YES' in verdict_text:
- verdict = 'YES'
- elif 'NO' in verdict_text:
- verdict = 'NO'
-
- # If we couldn't parse, try to find YES/NO in the response
- if not reasoning:
- reasoning = response_text
-
- is_complete = verdict == 'YES'
-
- logger.info(f'Task validation: {verdict}')
- logger.debug(f'Validation reasoning: {reasoning}')
-
- return is_complete, reasoning
-
- except Exception as e:
- logger.warning(f'Failed to validate task completion: {e}')
- # On error, assume the agent knows what they're doing
- return True, f'Validation failed: {e}'
-
-
-async def evaluate(code: str, browser_session: BrowserSession) -> Any:
- """
- Execute JavaScript code in the browser and return the result.
-
- Args:
- code: JavaScript code to execute (must be wrapped in IIFE)
-
- Returns:
- The result of the JavaScript execution
-
- Raises:
- EvaluateError: If JavaScript execution fails. This stops Python execution immediately.
-
- Example:
- result = await evaluate('''
- (function(){
- return Array.from(document.querySelectorAll('.product')).map(p => ({
- name: p.querySelector('.name').textContent,
- price: p.querySelector('.price').textContent
- }))
- })()
- ''')
- """
- # Strip JavaScript comments before CDP evaluation (CDP doesn't support them in all contexts)
- code = _strip_js_comments(code)
-
- cdp_session = await browser_session.get_or_create_cdp_session()
-
- try:
- # Execute JavaScript with proper error handling
- result = await cdp_session.cdp_client.send.Runtime.evaluate(
- params={'expression': code, 'returnByValue': True, 'awaitPromise': True},
- session_id=cdp_session.session_id,
- )
-
- # Check for JavaScript execution errors
- if result.get('exceptionDetails'):
- exception = result['exceptionDetails']
- error_text = exception.get('text', 'Unknown error')
-
- # Try to get more details from the exception
- error_details = []
- if 'exception' in exception:
- exc_obj = exception['exception']
- if 'description' in exc_obj:
- error_details.append(exc_obj['description'])
- elif 'value' in exc_obj:
- error_details.append(str(exc_obj['value']))
-
- # Build comprehensive error message with full CDP context
- error_msg = f'JavaScript execution error: {error_text}'
- if error_details:
- error_msg += f'\nDetails: {" | ".join(error_details)}'
-
- # Raise special exception that will stop Python execution immediately
- raise EvaluateError(error_msg)
-
- # Get the result data
- result_data = result.get('result', {})
-
- # Get the actual value
- value = result_data.get('value')
-
- # Return the value directly
- if value is None:
- return None if 'value' in result_data else 'undefined'
- elif isinstance(value, (dict, list)):
- # Complex objects - already deserialized by returnByValue
- return value
- else:
- # Primitive values
- return value
-
- except EvaluateError:
- # Re-raise EvaluateError as-is to stop Python execution
- raise
- except Exception as e:
- # Wrap other exceptions in EvaluateError
- raise EvaluateError(f'Failed to execute JavaScript: {type(e).__name__}: {e}') from e
-
-
-def create_namespace(
- browser_session: BrowserSession,
- tools: Tools | None = None,
- page_extraction_llm: BaseChatModel | None = None,
- file_system: FileSystem | None = None,
- available_file_paths: list[str] | None = None,
- sensitive_data: dict[str, str | dict[str, str]] | None = None,
-) -> dict[str, Any]:
- """
- Create a namespace with all browser tools available as functions.
-
- This function creates a dictionary of functions that can be used to interact
- with the browser, similar to a Jupyter notebook environment.
-
- Args:
- browser_session: The browser session to use
- tools: Optional Tools instance (will create default if not provided)
- page_extraction_llm: Optional LLM for page extraction
- file_system: Optional file system for file operations
- available_file_paths: Optional list of available file paths
- sensitive_data: Optional sensitive data dictionary
-
- Returns:
- Dictionary containing all available functions and objects
-
- Example:
- namespace = create_namespace(browser_session)
- await namespace['navigate'](url='https://google.com')
- result = await namespace['evaluate']('document.title')
- """
- if tools is None:
- # Use CodeAgentTools with default exclusions optimized for code-use mode
- # For code-use, we keep: navigate, evaluate, wait, done
- # and exclude: most browser interaction, file system actions (use Python instead)
- tools = CodeAgentTools()
-
- if available_file_paths is None:
- available_file_paths = []
-
- namespace: dict[str, Any] = {
- # Core objects
- 'browser': browser_session,
- 'file_system': file_system,
- # Standard library modules (always available)
- 'json': json,
- 'asyncio': asyncio,
- 'Path': Path,
- 'csv': csv,
- 're': re,
- 'datetime': datetime,
- 'requests': requests,
- }
-
- # Add optional data science libraries if available
- if NUMPY_AVAILABLE:
- namespace['np'] = np
- namespace['numpy'] = np
- if PANDAS_AVAILABLE:
- namespace['pd'] = pd
- namespace['pandas'] = pd
- if MATPLOTLIB_AVAILABLE:
- namespace['plt'] = plt
- namespace['matplotlib'] = plt
- if BS4_AVAILABLE:
- namespace['BeautifulSoup'] = BeautifulSoup
- namespace['bs4'] = BeautifulSoup
- if PYPDF_AVAILABLE:
- namespace['PdfReader'] = PdfReader
- namespace['pypdf'] = PdfReader
- if TABULATE_AVAILABLE:
- namespace['tabulate'] = tabulate
-
- # Track failed evaluate() calls to detect repeated failed approaches
- if '_evaluate_failures' not in namespace:
- namespace['_evaluate_failures'] = []
-
- # Add custom evaluate function that returns values directly
- async def evaluate_wrapper(
- code: str | None = None, variables: dict[str, Any] | None = None, *_args: Any, **kwargs: Any
- ) -> Any:
- # Handle both positional and keyword argument styles
- if code is None:
- # Check if code was passed as keyword arg
- code = kwargs.get('code', kwargs.get('js_code', kwargs.get('expression', '')))
- # Extract variables if passed as kwarg
- if variables is None:
- variables = kwargs.get('variables')
-
- if not code:
- raise ValueError('No JavaScript code provided to evaluate()')
-
- # Inject variables if provided
- if variables:
- vars_json = json.dumps(variables)
- stripped = code.strip()
-
- # Check if code is already a function expression expecting params
- # Pattern: (function(params) { ... }) or (async function(params) { ... })
- if re.match(r'\((?:async\s+)?function\s*\(\s*\w+\s*\)', stripped):
- # Already expects params, wrap to call it with our variables
- code = f'(function(){{ const params = {vars_json}; return {stripped}(params); }})()'
- else:
- # Not a parameterized function, inject params in scope
- # Check if already wrapped in IIFE (including arrow function IIFEs)
- is_wrapped = (
- (stripped.startswith('(function()') and '})()' in stripped[-10:])
- or (stripped.startswith('(async function()') and '})()' in stripped[-10:])
- or (stripped.startswith('(() =>') and ')()' in stripped[-10:])
- or (stripped.startswith('(async () =>') and ')()' in stripped[-10:])
- )
- if is_wrapped:
- # Already wrapped, inject params at the start
- # Try to match regular function IIFE
- match = re.match(r'(\((?:async\s+)?function\s*\(\s*\)\s*\{)', stripped)
- if match:
- prefix = match.group(1)
- rest = stripped[len(prefix) :]
- code = f'{prefix} const params = {vars_json}; {rest}'
- else:
- # Try to match arrow function IIFE
- # Patterns: (() => expr)() or (() => { ... })() or (async () => ...)()
- arrow_match = re.match(r'(\((?:async\s+)?\(\s*\)\s*=>\s*\{)', stripped)
- if arrow_match:
- # Arrow function with block body: (() => { ... })()
- prefix = arrow_match.group(1)
- rest = stripped[len(prefix) :]
- code = f'{prefix} const params = {vars_json}; {rest}'
- else:
- # Arrow function with expression body or fallback: wrap in outer function
- code = f'(function(){{ const params = {vars_json}; return {stripped}; }})()'
- else:
- # Not wrapped, wrap with params
- code = f'(function(){{ const params = {vars_json}; {code} }})()'
- # Skip auto-wrap below
- return await evaluate(code, browser_session)
-
- # Auto-wrap in IIFE if not already wrapped (and no variables were injected)
- if not variables:
- stripped = code.strip()
- # Check for regular function IIFEs, async function IIFEs, and arrow function IIFEs
- is_wrapped = (
- (stripped.startswith('(function()') and '})()' in stripped[-10:])
- or (stripped.startswith('(async function()') and '})()' in stripped[-10:])
- or (stripped.startswith('(() =>') and ')()' in stripped[-10:])
- or (stripped.startswith('(async () =>') and ')()' in stripped[-10:])
- )
- if not is_wrapped:
- code = f'(function(){{{code}}})()'
-
- # Execute and track failures
- try:
- result = await evaluate(code, browser_session)
-
- # Print result structure for debugging
- if isinstance(result, list) and result and isinstance(result[0], dict):
- result_preview = f'list of dicts - len={len(result)}, example 1:\n'
- sample_result = result[0]
- for key, value in list(sample_result.items())[:10]:
- value_str = str(value)[:10] if not isinstance(value, (int, float, bool, type(None))) else str(value)
- result_preview += f' {key}: {value_str}...\n'
- if len(sample_result) > 10:
- result_preview += f' ... {len(sample_result) - 10} more keys'
- print(result_preview)
-
- elif isinstance(result, list):
- if len(result) == 0:
- print('type=list, len=0')
- else:
- result_preview = str(result)[:100]
- print(f'type=list, len={len(result)}, preview={result_preview}...')
- elif isinstance(result, dict):
- result_preview = f'type=dict, len={len(result)}, sample keys:\n'
- for key, value in list(result.items())[:10]:
- value_str = str(value)[:10] if not isinstance(value, (int, float, bool, type(None))) else str(value)
- result_preview += f' {key}: {value_str}...\n'
- if len(result) > 10:
- result_preview += f' ... {len(result) - 10} more keys'
- print(result_preview)
-
- else:
- print(f'type={type(result).__name__}, value={repr(result)[:50]}')
-
- return result
- except Exception as e:
- # Track errors for pattern detection
- namespace['_evaluate_failures'].append({'error': str(e), 'type': 'exception'})
- raise
-
- namespace['evaluate'] = evaluate_wrapper
-
- # Add get_selector_from_index helper for code_use mode
- async def get_selector_from_index_wrapper(index: int) -> str:
- """
- Get the CSS selector for an element by its interactive index.
-
- This allows you to use the element's index from the browser state to get
- its CSS selector for use in JavaScript evaluate() calls.
-
- Args:
- index: The interactive index from the browser state (e.g., [123])
-
- Returns:
- str: CSS selector that can be used in JavaScript
-
- Example:
- selector = await get_selector_from_index(123)
- await evaluate(f'''
- (function(){{
- const el = document.querySelector({json.dumps(selector)});
- if (el) el.click();
- }})()
- ''')
- """
- from browser_use.dom.utils import generate_css_selector_for_element
-
- # Get element by index from browser session
- node = await browser_session.get_element_by_index(index)
- if node is None:
- msg = f'Element index {index} not available - page may have changed. Try refreshing browser state.'
- logger.warning(f'⚠️ {msg}')
- raise RuntimeError(msg)
-
- # Check if element is in shadow DOM
- shadow_hosts = []
- current = node.parent_node
- while current:
- if current.shadow_root_type is not None:
- # This is a shadow host
- host_tag = current.tag_name.lower()
- host_id = current.attributes.get('id', '') if current.attributes else ''
- host_desc = f'{host_tag}#{host_id}' if host_id else host_tag
- shadow_hosts.insert(0, host_desc)
- current = current.parent_node
-
- # Check if in iframe
- in_iframe = False
- current = node.parent_node
- while current:
- if current.tag_name.lower() == 'iframe':
- in_iframe = True
- break
- current = current.parent_node
-
- # Use the robust selector generation function (now handles special chars in IDs)
- selector = generate_css_selector_for_element(node)
-
- # Log shadow DOM/iframe info if detected
- if shadow_hosts:
- shadow_path = ' > '.join(shadow_hosts)
- logger.info(f'Element [{index}] is inside Shadow DOM. Path: {shadow_path}')
- logger.info(f' Selector: {selector}')
- logger.info(
- f' To access: document.querySelector("{shadow_hosts[0].split("#")[0]}").shadowRoot.querySelector("{selector}")'
- )
- if in_iframe:
- logger.info(f"Element [{index}] is inside an iframe. Regular querySelector won't work.")
-
- if selector:
- return selector
-
- # Fallback: just use tag name if available
- if node.tag_name:
- return node.tag_name.lower()
-
- raise ValueError(f'Could not generate selector for element index {index}')
-
- namespace['get_selector_from_index'] = get_selector_from_index_wrapper
-
- # Inject all tools as functions into the namespace
- # Skip 'evaluate' since we have a custom implementation above
- for action_name, action in tools.registry.registry.actions.items():
- if action_name == 'evaluate':
- continue # Skip - use custom evaluate that returns Python objects directly
- param_model = action.param_model
- action_function = action.function
-
- # Create a closure to capture the current action_name, param_model, and action_function
- def make_action_wrapper(act_name, par_model, act_func):
- async def action_wrapper(*args, **kwargs):
- # Convert positional args to kwargs based on param model fields
- if args:
- # Get the field names from the pydantic model
- field_names = list(par_model.model_fields.keys())
- for i, arg in enumerate(args):
- if i < len(field_names):
- kwargs[field_names[i]] = arg
-
- # Create params from kwargs
- try:
- params = par_model(**kwargs)
- except Exception as e:
- raise ValueError(f'Invalid parameters for {act_name}: {e}') from e
-
- # Special validation for done() - enforce minimal code cell
- if act_name == 'done':
- consecutive_failures = namespace.get('_consecutive_errors')
- if consecutive_failures and consecutive_failures > 3:
- pass
-
- else:
- # Check if there are multiple Python blocks in this response
- all_blocks = namespace.get('_all_code_blocks', {})
- python_blocks = [k for k in sorted(all_blocks.keys()) if k.startswith('python_')]
-
- if len(python_blocks) > 1:
- msg = (
- 'done() should be the ONLY code block in the response.\n'
- 'You have multiple Python blocks in this response. Consider calling done() in a separate response '
- 'Now verify the last output and if it satisfies the task, call done(), else continue working.'
- )
- print(msg)
-
- # Get the current cell code from namespace (injected by service.py before execution)
- current_code = namespace.get('_current_cell_code')
- if current_code and isinstance(current_code, str):
- # Count non-empty, non-comment lines
- lines = [line.strip() for line in current_code.strip().split('\n')]
- code_lines = [line for line in lines if line and not line.startswith('#')]
-
- # Check if the line above await done() contains an if block
- done_line_index = -1
- for i, line in enumerate(reversed(code_lines)):
- if 'await done()' in line or 'await done(' in line:
- done_line_index = len(code_lines) - 1 - i
- break
-
- has_if_above = False
- has_else_above = False
- has_elif_above = False
- if done_line_index > 0:
- line_above = code_lines[done_line_index - 1]
- has_if_above = line_above.strip().startswith('if ') and line_above.strip().endswith(':')
- has_else_above = line_above.strip().startswith('else:')
- has_elif_above = line_above.strip().startswith('elif ')
- if has_if_above or has_else_above or has_elif_above:
- msg = (
- 'done() should be called individually after verifying the result from any logic.\n'
- 'Consider validating your output first, THEN call done() in a final step without if/else/elif blocks only if the task is truly complete.'
- )
- logger.error(msg)
- print(msg)
- raise RuntimeError(msg)
-
- # Build special context
- special_context = {
- 'browser_session': browser_session,
- 'page_extraction_llm': page_extraction_llm,
- 'available_file_paths': available_file_paths,
- 'has_sensitive_data': False, # Can be handled separately if needed
- 'file_system': file_system,
- }
-
- # Execute the action
- result = await act_func(params=params, **special_context)
-
- # For code-use mode, we want to return the result directly
- # not wrapped in ActionResult
- if hasattr(result, 'extracted_content'):
- # Special handling for done action - mark task as complete
- if act_name == 'done' and hasattr(result, 'is_done') and result.is_done:
- namespace['_task_done'] = True
- # Store the extracted content as the final result
- if result.extracted_content:
- namespace['_task_result'] = result.extracted_content
- # Store the self-reported success status
- if hasattr(result, 'success'):
- namespace['_task_success'] = result.success
-
- # If there's extracted content, return it
- if result.extracted_content:
- return result.extracted_content
- # If there's an error, raise it
- if result.error:
- raise RuntimeError(result.error)
- # Otherwise return None
- return None
- return result
-
- return action_wrapper
-
- # Rename 'input' to 'input_text' to avoid shadowing Python's built-in input()
- namespace_action_name = 'input_text' if action_name == 'input' else action_name
-
- # Add the wrapper to the namespace
- namespace[namespace_action_name] = make_action_wrapper(action_name, param_model, action_function)
-
- return namespace
-
-
-def get_namespace_documentation(namespace: dict[str, Any]) -> str:
- """
- Generate documentation for all available functions in the namespace.
-
- Args:
- namespace: The namespace dictionary
-
- Returns:
- Markdown-formatted documentation string
- """
- docs = ['# Available Functions\n']
-
- # Document each function
- for name, obj in sorted(namespace.items()):
- if callable(obj) and not name.startswith('_'):
- # Get function signature and docstring
- if hasattr(obj, '__doc__') and obj.__doc__:
- docs.append(f'## {name}\n')
- docs.append(f'{obj.__doc__}\n')
-
- return '\n'.join(docs)
diff --git a/browser_use/code_use/notebook_export.py b/browser_use/code_use/notebook_export.py
deleted file mode 100644
index b3defaed0..000000000
--- a/browser_use/code_use/notebook_export.py
+++ /dev/null
@@ -1,276 +0,0 @@
-"""Export code-use session to Jupyter notebook format."""
-
-import json
-import re
-from pathlib import Path
-
-from browser_use.code_use.service import CodeAgent
-
-from .views import CellType, NotebookExport
-
-
-def export_to_ipynb(agent: CodeAgent, output_path: str | Path) -> Path:
- """
- Export a NotebookSession to a Jupyter notebook (.ipynb) file.
- Now includes JavaScript code blocks that were stored in the namespace.
-
- Args:
- session: The NotebookSession to export
- output_path: Path where to save the notebook file
- agent: Optional CodeAgent instance to access namespace for JavaScript blocks
-
- Returns:
- Path to the saved notebook file
-
- Example:
- ```python
- session = await agent.run()
- notebook_path = export_to_ipynb(agent, 'my_automation.ipynb')
- print(f'Notebook saved to {notebook_path}')
- ```
- """
- output_path = Path(output_path)
-
- # Create notebook structure
- notebook = NotebookExport(
- metadata={
- 'kernelspec': {'display_name': 'Python 3', 'language': 'python', 'name': 'python3'},
- 'language_info': {
- 'name': 'python',
- 'version': '3.11.0',
- 'mimetype': 'text/x-python',
- 'codemirror_mode': {'name': 'ipython', 'version': 3},
- 'pygments_lexer': 'ipython3',
- 'nbconvert_exporter': 'python',
- 'file_extension': '.py',
- },
- }
- )
-
- # Add setup cell at the beginning with proper type hints
- setup_code = """import asyncio
-import json
-from typing import Any
-from browser_use import BrowserSession
-from browser_use.code_use import create_namespace
-
-# Initialize browser and namespace
-browser = BrowserSession()
-await browser.start()
-
-# Create namespace with all browser control functions
-namespace: dict[str, Any] = create_namespace(browser)
-
-# Import all functions into the current namespace
-globals().update(namespace)
-
-# Type hints for better IDE support (these are now available globally)
-# navigate, click, input, evaluate, search, extract, scroll, done, etc.
-
-print("Browser-use environment initialized!")
-print("Available functions: navigate, click, input, evaluate, search, extract, done, etc.")"""
-
- setup_cell = {
- 'cell_type': 'code',
- 'metadata': {},
- 'source': setup_code.split('\n'),
- 'execution_count': None,
- 'outputs': [],
- }
- notebook.cells.append(setup_cell)
-
- # Add JavaScript code blocks as variables FIRST
- if hasattr(agent, 'namespace') and agent.namespace:
- # Look for JavaScript variables in the namespace
- code_block_vars = agent.namespace.get('_code_block_vars', set())
-
- for var_name in sorted(code_block_vars):
- var_value = agent.namespace.get(var_name)
- if isinstance(var_value, str) and var_value.strip():
- # Check if this looks like JavaScript code
- # Look for common JS patterns
- js_patterns = [
- r'function\s+\w+\s*\(',
- r'\(\s*function\s*\(\)',
- r'=>\s*{',
- r'document\.',
- r'Array\.from\(',
- r'\.querySelector',
- r'\.textContent',
- r'\.innerHTML',
- r'return\s+',
- r'console\.log',
- r'window\.',
- r'\.map\(',
- r'\.filter\(',
- r'\.forEach\(',
- ]
-
- is_js = any(re.search(pattern, var_value, re.IGNORECASE) for pattern in js_patterns)
-
- if is_js:
- # Create a code cell with the JavaScript variable
- js_cell = {
- 'cell_type': 'code',
- 'metadata': {},
- 'source': [f'# JavaScript Code Block: {var_name}\n', f'{var_name} = """{var_value}"""'],
- 'execution_count': None,
- 'outputs': [],
- }
- notebook.cells.append(js_cell)
-
- # Convert cells
- python_cell_count = 0
- for cell in agent.session.cells:
- notebook_cell: dict = {
- 'cell_type': cell.cell_type.value,
- 'metadata': {},
- 'source': cell.source.splitlines(keepends=True),
- }
-
- if cell.cell_type == CellType.CODE:
- python_cell_count += 1
- notebook_cell['execution_count'] = cell.execution_count
- notebook_cell['outputs'] = []
-
- # Add output if available
- if cell.output:
- notebook_cell['outputs'].append(
- {
- 'output_type': 'stream',
- 'name': 'stdout',
- 'text': cell.output.split('\n'),
- }
- )
-
- # Add error if available
- if cell.error:
- notebook_cell['outputs'].append(
- {
- 'output_type': 'error',
- 'ename': 'Error',
- 'evalue': cell.error.split('\n')[0] if cell.error else '',
- 'traceback': cell.error.split('\n') if cell.error else [],
- }
- )
-
- # Add browser state as a separate output
- if cell.browser_state:
- notebook_cell['outputs'].append(
- {
- 'output_type': 'stream',
- 'name': 'stdout',
- 'text': [f'Browser State:\n{cell.browser_state}'],
- }
- )
-
- notebook.cells.append(notebook_cell)
-
- # Write to file
- output_path.parent.mkdir(parents=True, exist_ok=True)
- with open(output_path, 'w', encoding='utf-8') as f:
- json.dump(notebook.model_dump(), f, indent=2, ensure_ascii=False)
-
- return output_path
-
-
-def session_to_python_script(agent: CodeAgent) -> str:
- """
- Convert a CodeAgent session to a Python script.
- Now includes JavaScript code blocks that were stored in the namespace.
-
- Args:
- agent: The CodeAgent instance to convert
-
- Returns:
- Python script as a string
-
- Example:
- ```python
- await agent.run()
- script = session_to_python_script(agent)
- print(script)
- ```
- """
- lines = []
-
- lines.append('# Generated from browser-use code-use session\n')
- lines.append('import asyncio\n')
- lines.append('import json\n')
- lines.append('from browser_use import BrowserSession\n')
- lines.append('from browser_use.code_use import create_namespace\n\n')
-
- lines.append('async def main():\n')
- lines.append('\t# Initialize browser and namespace\n')
- lines.append('\tbrowser = BrowserSession()\n')
- lines.append('\tawait browser.start()\n\n')
- lines.append('\t# Create namespace with all browser control functions\n')
- lines.append('\tnamespace = create_namespace(browser)\n\n')
- lines.append('\t# Extract functions from namespace for direct access\n')
- lines.append('\tnavigate = namespace["navigate"]\n')
- lines.append('\tclick = namespace["click"]\n')
- lines.append('\tinput_text = namespace["input"]\n')
- lines.append('\tevaluate = namespace["evaluate"]\n')
- lines.append('\tsearch = namespace["search"]\n')
- lines.append('\textract = namespace["extract"]\n')
- lines.append('\tscroll = namespace["scroll"]\n')
- lines.append('\tdone = namespace["done"]\n')
- lines.append('\tgo_back = namespace["go_back"]\n')
- lines.append('\twait = namespace["wait"]\n')
- lines.append('\tscreenshot = namespace["screenshot"]\n')
- lines.append('\tfind_text = namespace["find_text"]\n')
- lines.append('\tswitch_tab = namespace["switch"]\n')
- lines.append('\tclose_tab = namespace["close"]\n')
- lines.append('\tdropdown_options = namespace["dropdown_options"]\n')
- lines.append('\tselect_dropdown = namespace["select_dropdown"]\n')
- lines.append('\tupload_file = namespace["upload_file"]\n')
- lines.append('\tsend_keys = namespace["send_keys"]\n\n')
-
- # Add JavaScript code blocks as variables FIRST
- if hasattr(agent, 'namespace') and agent.namespace:
- code_block_vars = agent.namespace.get('_code_block_vars', set())
-
- for var_name in sorted(code_block_vars):
- var_value = agent.namespace.get(var_name)
- if isinstance(var_value, str) and var_value.strip():
- # Check if this looks like JavaScript code
- js_patterns = [
- r'function\s+\w+\s*\(',
- r'\(\s*function\s*\(\)',
- r'=>\s*{',
- r'document\.',
- r'Array\.from\(',
- r'\.querySelector',
- r'\.textContent',
- r'\.innerHTML',
- r'return\s+',
- r'console\.log',
- r'window\.',
- r'\.map\(',
- r'\.filter\(',
- r'\.forEach\(',
- ]
-
- is_js = any(re.search(pattern, var_value, re.IGNORECASE) for pattern in js_patterns)
-
- if is_js:
- lines.append(f'\t# JavaScript Code Block: {var_name}\n')
- lines.append(f'\t{var_name} = """{var_value}"""\n\n')
-
- for i, cell in enumerate(agent.session.cells):
- if cell.cell_type == CellType.CODE:
- lines.append(f'\t# Cell {i + 1}\n')
-
- # Indent each line of source
- source_lines = cell.source.split('\n')
- for line in source_lines:
- if line.strip(): # Only add non-empty lines
- lines.append(f'\t{line}\n')
-
- lines.append('\n')
-
- lines.append('\tawait browser.stop()\n\n')
- lines.append("if __name__ == '__main__':\n")
- lines.append('\tasyncio.run(main())\n')
-
- return ''.join(lines)
diff --git a/browser_use/code_use/service.py b/browser_use/code_use/service.py
deleted file mode 100644
index e4f5b54d3..000000000
--- a/browser_use/code_use/service.py
+++ /dev/null
@@ -1,1436 +0,0 @@
-"""Code-use agent service - Jupyter notebook-like code execution for browser automation."""
-
-import asyncio
-import datetime
-import html
-import json
-import logging
-import re
-import traceback
-from pathlib import Path
-from typing import Any
-
-from uuid_extensions import uuid7str
-
-from browser_use.browser import BrowserSession
-from browser_use.browser.profile import BrowserProfile
-from browser_use.dom.service import DomService
-from browser_use.filesystem.file_system import FileSystem
-from browser_use.llm.base import BaseChatModel
-from browser_use.llm.messages import (
- AssistantMessage,
- BaseMessage,
- ContentPartImageParam,
- ContentPartTextParam,
- ImageURL,
- UserMessage,
-)
-from browser_use.screenshots.service import ScreenshotService
-from browser_use.telemetry.service import ProductTelemetry
-from browser_use.telemetry.views import AgentTelemetryEvent
-from browser_use.tokens.service import TokenCost
-from browser_use.tokens.views import UsageSummary
-from browser_use.tools.service import CodeAgentTools, Tools
-from browser_use.utils import get_browser_use_version
-
-from .formatting import format_browser_state_for_llm
-from .namespace import EvaluateError, create_namespace
-from .utils import detect_token_limit_issue, extract_code_blocks, extract_url_from_task, truncate_message_content
-from .views import (
- CellType,
- CodeAgentHistory,
- CodeAgentHistoryList,
- CodeAgentModelOutput,
- CodeAgentResult,
- CodeAgentState,
- CodeAgentStepMetadata,
- ExecutionStatus,
- NotebookSession,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class CodeAgent:
- """
- Agent that executes Python code in a notebook-like environment for browser automation.
-
- This agent provides a Jupyter notebook-like interface where the LLM writes Python code
- that gets executed in a persistent namespace with browser control functions available.
- """
-
- def __init__(
- self,
- task: str,
- # Optional parameters
- llm: BaseChatModel | None = None,
- browser_session: BrowserSession | None = None,
- browser: BrowserSession | None = None, # Alias for browser_session
- tools: Tools | None = None,
- controller: Tools | None = None, # Alias for tools
- # Agent settings
- page_extraction_llm: BaseChatModel | None = None,
- file_system: FileSystem | None = None,
- available_file_paths: list[str] | None = None,
- sensitive_data: dict[str, str | dict[str, str]] | None = None,
- max_steps: int = 100,
- max_failures: int = 8,
- max_validations: int = 0,
- use_vision: bool = True,
- calculate_cost: bool = False,
- demo_mode: bool | None = None,
- **kwargs,
- ):
- """
- Initialize the code-use agent.
-
- Args:
- task: The task description for the agent
- browser_session: Optional browser session (will be created if not provided) [DEPRECATED: use browser]
- browser: Optional browser session (cleaner API)
- tools: Optional Tools instance (will create default if not provided)
- controller: Optional Tools instance
- page_extraction_llm: Optional LLM for page extraction
- file_system: Optional file system for file operations
- available_file_paths: Optional list of available file paths
- sensitive_data: Optional sensitive data dictionary
- max_steps: Maximum number of execution steps
- max_failures: Maximum consecutive errors before termination (default: 8)
- max_validations: Maximum number of times to run the validator agent (default: 0)
- use_vision: Whether to include screenshots in LLM messages (default: True)
- calculate_cost: Whether to calculate token costs (default: False)
- demo_mode: Enable the in-browser demo panel for live logging (default: False)
- llm: Optional ChatBrowserUse LLM instance (will create default if not provided)
- **kwargs: Additional keyword arguments for compatibility (ignored)
- """
- # Log and ignore unknown kwargs for compatibility
- if kwargs:
- logger.debug(f'Ignoring additional kwargs for CodeAgent compatibility: {list(kwargs.keys())}')
-
- if llm is None:
- try:
- from browser_use import ChatBrowserUse
-
- llm = ChatBrowserUse()
- logger.debug('CodeAgent using ChatBrowserUse')
- except Exception as e:
- raise RuntimeError(f'Failed to initialize CodeAgent LLM: {e}')
-
- if 'ChatBrowserUse' not in llm.__class__.__name__:
- raise ValueError('This agent works only with ChatBrowserUse.')
-
- # Handle browser vs browser_session parameter (browser takes precedence)
- if browser and browser_session:
- raise ValueError('Cannot specify both "browser" and "browser_session" parameters. Use "browser" for the cleaner API.')
- browser_session = browser or browser_session
-
- # Handle controller vs tools parameter (controller takes precedence)
- if controller and tools:
- raise ValueError('Cannot specify both "controller" and "tools" parameters. Use "controller" for the cleaner API.')
- tools = controller or tools
-
- # Store browser_profile for creating browser session if needed
- self._demo_mode_enabled = False
- if browser_session is None:
- profile_kwargs: dict[str, Any] = {}
- if demo_mode is not None:
- profile_kwargs['demo_mode'] = demo_mode
- self._browser_profile_for_init = BrowserProfile(**profile_kwargs)
- else:
- self._browser_profile_for_init = None
-
- self.task = task
- self.llm = llm
- self.browser_session = browser_session
- if self.browser_session:
- if demo_mode is not None and self.browser_session.browser_profile.demo_mode != demo_mode:
- self.browser_session.browser_profile = self.browser_session.browser_profile.model_copy(
- update={'demo_mode': demo_mode}
- )
- self._demo_mode_enabled = bool(self.browser_session.browser_profile.demo_mode)
- self.tools = tools or CodeAgentTools()
- self.page_extraction_llm = page_extraction_llm
- self.file_system = file_system if file_system is not None else FileSystem(base_dir='./')
- self.available_file_paths = available_file_paths or []
- self.sensitive_data = sensitive_data
- self.max_steps = max_steps
- self.max_failures = max_failures
- self.max_validations = max_validations
- self.use_vision = use_vision
-
- self.session = NotebookSession()
- self.namespace: dict[str, Any] = {}
- self._llm_messages: list[BaseMessage] = [] # Internal LLM conversation history
- self.complete_history: list[CodeAgentHistory] = [] # Type-safe history with model_output and result
- self.dom_service: DomService | None = None
- self._last_browser_state_text: str | None = None # Track last browser state text
- self._last_screenshot: str | None = None # Track last screenshot (base64)
- self._consecutive_errors = 0 # Track consecutive errors for auto-termination
- self._validation_count = 0 # Track number of validator runs
- self._last_llm_usage: Any | None = None # Track last LLM call usage stats
- self._step_start_time = 0.0 # Track step start time for duration calculation
- self.usage_summary: UsageSummary | None = None # Track usage summary across run for history property
- self._sample_output_added = False # Track whether preview cell already created
-
- # Initialize screenshot service for eval tracking
- self.id = uuid7str()
- timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
- base_tmp = Path('/tmp')
- self.agent_directory = base_tmp / f'browser_use_code_agent_{self.id}_{timestamp}'
- self.screenshot_service = ScreenshotService(agent_directory=self.agent_directory)
-
- # Initialize token cost service for usage tracking
- self.token_cost_service = TokenCost(include_cost=calculate_cost)
- self.token_cost_service.register_llm(llm)
- if page_extraction_llm:
- self.token_cost_service.register_llm(page_extraction_llm)
-
- # Set version and source for telemetry
- self.version = get_browser_use_version()
- try:
- package_root = Path(__file__).parent.parent.parent
- repo_files = ['.git', 'README.md', 'docs', 'examples']
- if all(Path(package_root / file).exists() for file in repo_files):
- self.source = 'git'
- else:
- self.source = 'pip'
- except Exception:
- self.source = 'unknown'
-
- # Telemetry
- self.telemetry = ProductTelemetry()
-
- async def run(self, max_steps: int | None = None) -> NotebookSession:
- """
- Run the agent to complete the task.
-
- Args:
- max_steps: Optional override for maximum number of steps (uses __init__ value if not provided)
-
- Returns:
- The notebook session with all executed cells
- """
- # Use override if provided, otherwise use value from __init__
- steps_to_run = max_steps if max_steps is not None else self.max_steps
- self.max_steps = steps_to_run
- # Start browser if not provided
- if self.browser_session is None:
- assert self._browser_profile_for_init is not None
- self.browser_session = BrowserSession(browser_profile=self._browser_profile_for_init)
- await self.browser_session.start()
-
- if self.browser_session:
- self._demo_mode_enabled = bool(self.browser_session.browser_profile.demo_mode)
- if self._demo_mode_enabled and getattr(self.browser_session.browser_profile, 'headless', False):
- logger.warning('Demo mode is enabled but the browser is headless=True; set headless=False to view the panel.')
- if self._demo_mode_enabled:
- await self._demo_mode_log(f'Started CodeAgent task: {self.task}', 'info', {'tag': 'task'})
-
- # Initialize DOM service with cross-origin iframe support enabled
- self.dom_service = DomService(
- browser_session=self.browser_session,
- cross_origin_iframes=True, # Enable for code-use agent to access forms in iframes
- )
-
- # Create namespace with all tools
- self.namespace = create_namespace(
- browser_session=self.browser_session,
- tools=self.tools,
- page_extraction_llm=self.page_extraction_llm,
- file_system=self.file_system,
- available_file_paths=self.available_file_paths,
- sensitive_data=self.sensitive_data,
- )
-
- # Initialize conversation with task
- self._llm_messages.append(UserMessage(content=f'Task: {self.task}'))
-
- # Track agent run error for telemetry
- agent_run_error: str | None = None
- should_delay_close = False
-
- # Extract URL from task and navigate if found
- initial_url = extract_url_from_task(self.task)
- if initial_url:
- try:
- logger.info(f'Extracted URL from task, navigating to: {initial_url}')
- # Use the navigate action from namespace
- await self.namespace['navigate'](initial_url)
- # Wait for page load
- await asyncio.sleep(2)
-
- # Record this navigation as a cell in the notebook
- nav_code = f"await navigate('{initial_url}')"
- cell = self.session.add_cell(source=nav_code)
- cell.status = ExecutionStatus.SUCCESS
- cell.execution_count = self.session.increment_execution_count()
- cell.output = f'Navigated to {initial_url}'
-
- # Get browser state after navigation for the cell
- if self.dom_service:
- try:
- browser_state_text, _ = await self._get_browser_state()
- cell.browser_state = browser_state_text
- except Exception as state_error:
- logger.debug(f'Failed to capture browser state for initial navigation cell: {state_error}')
-
- except Exception as e:
- logger.warning(f'Failed to navigate to extracted URL {initial_url}: {e}')
- # Record failed navigation as error cell
- nav_code = f"await navigate('{initial_url}')"
- cell = self.session.add_cell(source=nav_code)
- cell.status = ExecutionStatus.ERROR
- cell.execution_count = self.session.increment_execution_count()
- cell.error = str(e)
-
- # Get initial browser state before first LLM call
- if self.browser_session and self.dom_service:
- try:
- browser_state_text, screenshot = await self._get_browser_state()
- self._last_browser_state_text = browser_state_text
- self._last_screenshot = screenshot
- except Exception as e:
- logger.warning(f'Failed to get initial browser state: {e}')
-
- # Main execution loop
- for step in range(self.max_steps):
- logger.info(f'\n\n\n\n\n\n\nStep {step + 1}/{self.max_steps}')
- await self._demo_mode_log(f'Starting step {step + 1}/{self.max_steps}', 'info', {'step': step + 1})
-
- # Start timing this step
- self._step_start_time = datetime.datetime.now().timestamp()
-
- # Check if we're approaching the step limit or error limit and inject warning
- steps_remaining = self.max_steps - step - 1
- errors_remaining = self.max_failures - self._consecutive_errors
-
- should_warn = (
- steps_remaining <= 1 # Last step or next to last
- or errors_remaining <= 1 # One more error will terminate
- or (steps_remaining <= 2 and self._consecutive_errors >= 2) # Close to both limits
- )
-
- if should_warn:
- warning_message = (
- f'\n\n⚠️ CRITICAL WARNING: You are approaching execution limits!\n'
- f'- Steps remaining: {steps_remaining + 1}\n'
- f'- Consecutive errors: {self._consecutive_errors}/{self.max_failures}\n\n'
- f'YOU MUST call done() in your NEXT response, even if the task is incomplete:\n'
- f"- Set success=False if you couldn't complete the task\n"
- f'- Return EVERYTHING you found so far (partial data is better than nothing)\n'
- f"- Include any variables you've stored (products, all_data, etc.)\n"
- f"- Explain what worked and what didn't\n\n"
- f'Without done(), the user will receive NOTHING.'
- )
- self._llm_messages.append(UserMessage(content=warning_message))
-
- try:
- # Fetch fresh browser state right before LLM call (only if not already set)
- if not self._last_browser_state_text and self.browser_session and self.dom_service:
- try:
- logger.debug('🔍 Fetching browser state before LLM call...')
- browser_state_text, screenshot = await self._get_browser_state()
- self._last_browser_state_text = browser_state_text
- self._last_screenshot = screenshot
-
- # # Log browser state
- # if len(browser_state_text) > 2000:
- # logger.info(
- # f'Browser state (before LLM):\n{browser_state_text[:2000]}...\n[Truncated, full state {len(browser_state_text)} chars sent to LLM]'
- # )
- # else:
- # logger.info(f'Browser state (before LLM):\n{browser_state_text}')
- except Exception as e:
- logger.warning(f'Failed to get browser state before LLM call: {e}')
-
- # Get code from LLM (this also adds to self._llm_messages)
- try:
- code, full_llm_response = await self._get_code_from_llm(step_number=step + 1)
- except Exception as llm_error:
- # LLM call failed - count as consecutive error and retry
- self._consecutive_errors += 1
- logger.warning(
- f'LLM call failed (consecutive errors: {self._consecutive_errors}/{self.max_failures}), retrying: {llm_error}'
- )
- await self._demo_mode_log(
- f'LLM call failed: {llm_error}',
- 'error',
- {'step': step + 1},
- )
-
- # Check if we've hit the consecutive error limit
- if self._consecutive_errors >= self.max_failures:
- logger.error(f'Terminating: {self.max_failures} consecutive LLM failures')
- break
-
- await asyncio.sleep(1) # Brief pause before retry
- continue
-
- if not code or code.strip() == '':
- # If task is already done, empty code is fine (LLM explaining completion)
- if self._is_task_done():
- logger.info('Task already marked as done, LLM provided explanation without code')
- # Add the text response to history as a non-code step
- await self._add_step_to_complete_history(
- model_output_code='',
- full_llm_response=full_llm_response,
- output=full_llm_response, # Treat the explanation as output
- error=None,
- screenshot_path=await self._capture_screenshot(step + 1),
- )
- break # Exit the loop since task is done
-
- logger.warning('LLM returned empty code')
- self._consecutive_errors += 1
-
- # new state
- if self.browser_session and self.dom_service:
- try:
- browser_state_text, screenshot = await self._get_browser_state()
- self._last_browser_state_text = browser_state_text
- self._last_screenshot = screenshot
- except Exception as e:
- logger.warning(f'Failed to get new browser state: {e}')
- continue
-
- # Execute code blocks sequentially if multiple python blocks exist
- # This allows JS/bash blocks to be injected into namespace before Python code uses them
- all_blocks = self.namespace.get('_all_code_blocks', {})
- python_blocks = [k for k in sorted(all_blocks.keys()) if k.startswith('python_')]
-
- if len(python_blocks) > 1:
- # Multiple Python blocks - execute each sequentially
- output = None
- error = None
-
- for i, block_key in enumerate(python_blocks):
- logger.info(f'Executing Python block {i + 1}/{len(python_blocks)}')
- block_code = all_blocks[block_key]
- block_output, block_error, _ = await self._execute_code(block_code)
-
- # Accumulate outputs
- if block_output:
- output = (output or '') + block_output
- if block_error:
- error = block_error
- # Stop on first error
- break
- else:
- # Single Python block - execute normally
- output, error, _ = await self._execute_code(code)
-
- # Track consecutive errors
- if error:
- self._consecutive_errors += 1
- logger.warning(f'Consecutive errors: {self._consecutive_errors}/{self.max_failures}')
-
- # Check if we've hit the consecutive error limit
- if self._consecutive_errors >= self.max_failures:
- logger.error(
- f'Terminating: {self.max_failures} consecutive errors reached. The agent is unable to make progress.'
- )
- await self._demo_mode_log(
- f'Terminating after {self.max_failures} consecutive errors without progress.',
- 'error',
- {'step': step + 1},
- )
- # Add termination message to complete history before breaking
- await self._add_step_to_complete_history(
- model_output_code=code,
- full_llm_response=f'[Terminated after {self.max_failures} consecutive errors]',
- output=None,
- error=f'Auto-terminated: {self.max_failures} consecutive errors without progress',
- screenshot_path=None,
- )
- break
- else:
- # Reset consecutive error counter on success
- self._consecutive_errors = 0
-
- # Check if task is done - validate completion first if not at limits
- if self._is_task_done():
- # Get the final result from namespace (from done() call)
- final_result: str | None = self.namespace.get('_task_result') # type: ignore[assignment]
-
- # Check if we should validate (not at step/error limits and under max validations)
- steps_remaining = self.max_steps - step - 1
- should_validate = (
- self._validation_count < self.max_validations # Haven't exceeded max validations
- and steps_remaining >= 4 # At least 4 steps away from limit
- and self._consecutive_errors < 3 # Not close to error limit (8 consecutive)
- )
-
- if should_validate:
- self._validation_count += 1
- logger.info('Validating task completion with LLM...')
- from .namespace import validate_task_completion
-
- is_complete, reasoning = await validate_task_completion(
- task=self.task,
- output=final_result,
- llm=self.llm,
- )
-
- if not is_complete:
- # Task not truly complete - inject feedback and continue
- logger.warning('Validator: Task not complete, continuing...')
- validation_feedback = (
- f'\n\n⚠️ VALIDATOR FEEDBACK:\n'
- f'Your done() call was rejected. The task is NOT complete yet.\n\n'
- f'Validation reasoning:\n{reasoning}\n\n'
- f'You must continue working on the task. Analyze what is missing and complete it.\n'
- f'Do NOT call done() again until the task is truly finished.'
- )
-
- # Clear the done flag so execution continues
- self.namespace['_task_done'] = False
- self.namespace.pop('_task_result', None)
- self.namespace.pop('_task_success', None)
-
- # Add validation feedback to LLM messages
- self._llm_messages.append(UserMessage(content=validation_feedback))
-
- # Don't override output - let execution continue normally
- else:
- logger.info('Validator: Task complete')
- # Override output with done message for final step
- if final_result:
- output = final_result
- else:
- # At limits - skip validation and accept done()
- if self._validation_count >= self.max_validations:
- logger.info(
- f'Reached max validations ({self.max_validations}) - skipping validation and accepting done()'
- )
- else:
- logger.info('At step/error limits - skipping validation')
- if final_result:
- output = final_result
-
- if output:
- # Check if this is the final done() output
- if self._is_task_done():
- # Show done() output more prominently
- logger.info(
- f'✓ Task completed - Final output from done():\n{output[:300] if len(output) > 300 else output}'
- )
- # Also show files_to_display if they exist in namespace
- attachments: list[str] | None = self.namespace.get('_task_attachments') # type: ignore[assignment]
- if attachments:
- logger.info(f'Files displayed: {", ".join(attachments)}')
- else:
- logger.info(f'Code output:\n{output}')
-
- # Browser state is now only logged when fetched before LLM call (not after execution)
-
- # Take screenshot for eval tracking
- screenshot_path = await self._capture_screenshot(step + 1)
-
- # Add step to complete_history for eval system
- await self._add_step_to_complete_history(
- model_output_code=code,
- full_llm_response=full_llm_response,
- output=output,
- error=error,
- screenshot_path=screenshot_path,
- )
-
- # Check if task is done (after validation)
- if self._is_task_done():
- # Get the final result from namespace
- final_result: str | None = self.namespace.get('_task_result', output) # type: ignore[assignment]
- logger.info('Task completed successfully')
- if final_result:
- logger.info(f'Final result: {final_result}')
- self._add_sample_output_cell(final_result)
- if self._demo_mode_enabled:
- await self._demo_mode_log(
- f'Final Result: {final_result or "Task completed"}',
- 'success',
- {'tag': 'task'},
- )
- should_delay_close = True
- break
- # If validation rejected done(), continue to next iteration
- # The feedback message has already been added to _llm_messages
-
- # Add result to LLM messages for next iteration (without browser state)
- result_message = self._format_execution_result(code, output, error, current_step=step + 1)
- truncated_result = truncate_message_content(result_message)
- self._llm_messages.append(UserMessage(content=truncated_result))
-
- except Exception as e:
- logger.error(f'Error in step {step + 1}: {e}')
- traceback.print_exc()
- break
- else:
- # Loop completed without break - max_steps reached
- logger.warning(f'Maximum steps ({self.max_steps}) reached without task completion')
- await self._demo_mode_log(
- f'Maximum steps ({self.max_steps}) reached without completing the task.',
- 'error',
- {'tag': 'task'},
- )
-
- # If task is not done, capture the last step's output as partial result
- if not self._is_task_done() and self.complete_history:
- # Get the last step's output/error and use it as final extracted_content
- last_step = self.complete_history[-1]
- last_result = last_step.result[0] if last_step.result else None
- last_output = last_result.extracted_content if last_result else None
- last_error = last_result.error if last_result else None
-
- # Build a partial result message from the last step
- partial_result_parts = []
- partial_result_parts.append(f'Task incomplete - reached step limit ({self.max_steps} steps).')
- partial_result_parts.append('Last step output:')
-
- if last_output:
- partial_result_parts.append(f'\nOutput: {last_output}')
- if last_error:
- partial_result_parts.append(f'\nError: {last_error}')
-
- # Add any accumulated variables that might contain useful data
- data_vars = []
- for var_name in sorted(self.namespace.keys()):
- if not var_name.startswith('_') and var_name not in {'json', 'asyncio', 'csv', 're', 'datetime', 'Path'}:
- var_value = self.namespace[var_name]
- # Check if it's a list or dict that might contain collected data
- if isinstance(var_value, (list, dict)) and var_value:
- data_vars.append(f' - {var_name}: {type(var_value).__name__} with {len(var_value)} items')
-
- if data_vars:
- partial_result_parts.append('\nVariables in namespace that may contain partial data:')
- partial_result_parts.extend(data_vars)
-
- partial_result = '\n'.join(partial_result_parts)
-
- # Update the last step's extracted_content with this partial result
- if last_result:
- last_result.extracted_content = partial_result
- last_result.is_done = False
- last_result.success = False
-
- logger.info(f'\nPartial result captured from last step:\n{partial_result}')
- if self._demo_mode_enabled:
- await self._demo_mode_log(f'Partial result:\n{partial_result}', 'error', {'tag': 'task'})
-
- # Log final summary if task was completed
- if self._is_task_done():
- logger.info('\n' + '=' * 60)
- logger.info('TASK COMPLETED SUCCESSFULLY')
- logger.info('=' * 60)
- final_result: str | None = self.namespace.get('_task_result') # type: ignore[assignment]
- if final_result:
- logger.info(f'\nFinal Output:\n{final_result}')
- self._add_sample_output_cell(final_result)
-
- attachments: list[str] | None = self.namespace.get('_task_attachments') # type: ignore[assignment]
- if attachments:
- logger.info(f'\nFiles Attached:\n{chr(10).join(attachments)}')
- logger.info('=' * 60 + '\n')
- if self._demo_mode_enabled and not should_delay_close:
- await self._demo_mode_log(
- f'Final Result: {final_result or "Task completed"}',
- 'success',
- {'tag': 'task'},
- )
- should_delay_close = True
-
- # Auto-close browser if keep_alive is False
- if should_delay_close and self._demo_mode_enabled:
- await asyncio.sleep(30)
- await self.close()
-
- # Store usage summary for history property
- self.usage_summary = await self.token_cost_service.get_usage_summary()
-
- # Log token usage summary
- await self.token_cost_service.log_usage_summary()
-
- # Log telemetry event
- try:
- self._log_agent_event(max_steps=self.max_steps, agent_run_error=agent_run_error)
- except Exception as log_e:
- logger.error(f'Failed to log telemetry event: {log_e}', exc_info=True)
-
- # Store history data in session for history property
- self.session._complete_history = self.complete_history
- self.session._usage_summary = self.usage_summary
-
- return self.session
-
- async def _get_code_from_llm(self, step_number: int | None = None) -> tuple[str, str]:
- """Get Python code from the LLM.
-
- Returns:
- Tuple of (extracted_code, full_llm_response)
- """
- # Prepare messages for this request
- # Include browser state as separate message if available (not accumulated in history)
- messages_to_send = self._llm_messages.copy()
-
- if self._last_browser_state_text:
- # Create message with optional screenshot
- if self.use_vision and self._last_screenshot:
- # Build content with text + screenshot
- content_parts: list[ContentPartTextParam | ContentPartImageParam] = [
- ContentPartTextParam(text=self._last_browser_state_text)
- ]
-
- # Add screenshot
- content_parts.append(
- ContentPartImageParam(
- image_url=ImageURL(
- url=f'data:image/png;base64,{self._last_screenshot}',
- media_type='image/png',
- detail='auto',
- ),
- )
- )
-
- messages_to_send.append(UserMessage(content=content_parts))
- else:
- # Text only
- messages_to_send.append(UserMessage(content=self._last_browser_state_text))
-
- # Clear browser state after including it so it's only in this request
- self._last_browser_state_text = None
- self._last_screenshot = None
-
- # Call LLM with message history (including temporary browser state message)
- response = await self.llm.ainvoke(messages_to_send)
-
- # Store usage stats from this LLM call
- self._last_llm_usage = response.usage
-
- # Log the LLM's raw output for debugging
- logger.info(f'LLM Response:\n{response.completion}')
- await self._demo_mode_log(
- f'LLM Response:\n{response.completion}',
- 'thought',
- {'step': step_number} if step_number else None,
- )
-
- # Check for token limit or repetition issues
- max_tokens = getattr(self.llm, 'max_tokens', None)
- completion_tokens = response.usage.completion_tokens if response.usage else None
- is_problematic, issue_message = detect_token_limit_issue(
- completion=response.completion,
- completion_tokens=completion_tokens,
- max_tokens=max_tokens,
- stop_reason=response.stop_reason,
- )
-
- if is_problematic:
- logger.warning(f'Token limit issue detected: {issue_message}')
- # Don't add the bad response to history
- # Instead, inject a system message prompting recovery
- recovery_prompt = (
- f'Your previous response hit a token limit or became repetitive: {issue_message}\n\n'
- 'Please write a SHORT plan (2 sentences) for what to do next, then execute ONE simple action.'
- )
- self._llm_messages.append(UserMessage(content=recovery_prompt))
- # Return a controlled error message instead of corrupted code
- return '', f'[Token limit error: {issue_message}]'
-
- # Store the full response
- full_response = response.completion
-
- # Extract code blocks from response
- # Support multiple code block types: python, js, bash, markdown
- code_blocks = extract_code_blocks(response.completion)
-
- # Inject non-python blocks into namespace as variables
- # Track which variables are code blocks for browser state display
- if '_code_block_vars' not in self.namespace:
- self.namespace['_code_block_vars'] = set()
-
- for block_type, block_content in code_blocks.items():
- if not block_type.startswith('python'):
- # Store js, bash, markdown blocks (and named variants) as variables in namespace
- self.namespace[block_type] = block_content
- self.namespace['_code_block_vars'].add(block_type)
- print(f'→ Code block variable: {block_type} (str, {len(block_content)} chars)')
- logger.debug(f'Injected {block_type} block into namespace ({len(block_content)} chars)')
-
- # Store all code blocks for sequential execution
- self.namespace['_all_code_blocks'] = code_blocks
-
- # Get Python code if it exists
- # If no python block exists and no other code blocks exist, return empty string to skip execution
- # This prevents treating plain text explanations as code
- code = code_blocks.get('python', response.completion)
-
- # Add to LLM messages (truncate for history to save context)
- truncated_completion = truncate_message_content(response.completion)
- self._llm_messages.append(AssistantMessage(content=truncated_completion))
-
- return code, full_response
-
- def _print_variable_info(self, var_name: str, value: Any) -> None:
- """Print compact info about a variable assignment."""
- # Skip built-in modules and known imports
- skip_names = {
- 'json',
- 'asyncio',
- 'csv',
- 're',
- 'datetime',
- 'Path',
- 'pd',
- 'np',
- 'plt',
- 'requests',
- 'BeautifulSoup',
- 'PdfReader',
- 'browser',
- 'file_system',
- }
- if var_name in skip_names:
- return
-
- # Skip code block variables (already printed)
- if '_code_block_vars' in self.namespace and var_name in self.namespace.get('_code_block_vars', set()):
- return
-
- # Print compact variable info
- if isinstance(value, (list, dict)):
- preview = str(value)[:100]
- print(f'→ Variable: {var_name} ({type(value).__name__}, len={len(value)}, preview={preview}...)')
- elif isinstance(value, str) and len(value) > 50:
- print(f'→ Variable: {var_name} (str, {len(value)} chars, preview={value[:50]}...)')
- elif callable(value):
- print(f'→ Variable: {var_name} (function)')
- else:
- print(f'→ Variable: {var_name} ({type(value).__name__}, value={repr(value)[:50]})')
-
- async def _execute_code(self, code: str) -> tuple[str | None, str | None, str | None]:
- """
- Execute Python code in the namespace.
-
- Args:
- code: The Python code to execute
-
- Returns:
- Tuple of (output, error, browser_state)
- """
- # Create new cell
- cell = self.session.add_cell(source=code)
- cell.status = ExecutionStatus.RUNNING
- cell.execution_count = self.session.increment_execution_count()
-
- output = None
- error = None
- browser_state = None
-
- try:
- # Capture output
- import ast
- import io
- import sys
-
- old_stdout = sys.stdout
- sys.stdout = io.StringIO()
-
- try:
- # Add asyncio to namespace if not already there
- if 'asyncio' not in self.namespace:
- self.namespace['asyncio'] = asyncio
-
- # Store the current code in namespace for done() validation
- self.namespace['_current_cell_code'] = code
- # Store consecutive errors count for done() validation
- self.namespace['_consecutive_errors'] = self._consecutive_errors
-
- # Check if code contains await expressions - if so, wrap in async function
- # This mimics how Jupyter/IPython handles top-level await
- try:
- tree = ast.parse(code, mode='exec')
- has_await = any(isinstance(node, (ast.Await, ast.AsyncWith, ast.AsyncFor)) for node in ast.walk(tree))
- except SyntaxError:
- # If parse fails, let exec handle the error
- has_await = False
-
- if has_await:
- # When code has await, we must wrap in async function
- # To make variables persist naturally (like Jupyter without needing 'global'):
- # 1. Extract all assigned variable names from the code
- # 2. Inject 'global' declarations for variables that already exist in namespace
- # 3. Extract user's explicit global declarations and pre-define those vars
- # 4. Return locals() so we can update namespace with new variables
-
- # Find all variable names being assigned + user's explicit globals
- try:
- assigned_names = set()
- user_global_names = set()
-
- for node in ast.walk(tree):
- if isinstance(node, ast.Assign):
- for target in node.targets:
- if isinstance(target, ast.Name):
- assigned_names.add(target.id)
- elif isinstance(node, ast.AugAssign) and isinstance(node.target, ast.Name):
- assigned_names.add(node.target.id)
- elif isinstance(node, (ast.AnnAssign, ast.NamedExpr)):
- if hasattr(node, 'target') and isinstance(node.target, ast.Name):
- assigned_names.add(node.target.id)
- elif isinstance(node, ast.Global):
- # Track user's explicit global declarations
- user_global_names.update(node.names)
-
- # Pre-define any user-declared globals that don't exist yet
- # This prevents NameError when user writes "global foo" before "foo = ..."
- for name in user_global_names:
- if name not in self.namespace:
- self.namespace[name] = None
-
- # Filter to only existing namespace vars (like Jupyter does)
- # Include both: assigned vars that exist + user's explicit globals
- existing_vars = {name for name in (assigned_names | user_global_names) if name in self.namespace}
- except Exception as e:
- existing_vars = set()
-
- # Build global declaration if needed
- global_decl = ''
- has_global_decl = False
- if existing_vars:
- vars_str = ', '.join(sorted(existing_vars))
- global_decl = f' global {vars_str}\n'
- has_global_decl = True
-
- indented_code = '\n'.join(' ' + line if line.strip() else line for line in code.split('\n'))
- wrapped_code = f"""async def __code_exec__():
-{global_decl}{indented_code}
- # Return locals so we can update the namespace
- return locals()
-
-__code_exec_coro__ = __code_exec__()
-"""
- # Store whether we added a global declaration (needed for error line mapping)
- self.namespace['_has_global_decl'] = has_global_decl
-
- # Compile and execute wrapper at module level
- compiled_code = compile(wrapped_code, '', 'exec')
- exec(compiled_code, self.namespace, self.namespace)
-
- # Get and await the coroutine, then update namespace with new/modified variables
- coro = self.namespace.get('__code_exec_coro__')
- if coro:
- result_locals = await coro
- # Update namespace with all variables from the function's locals
- # This makes variable assignments persist across cells
- if result_locals:
- for key, value in result_locals.items():
- if not key.startswith('_'):
- self.namespace[key] = value
- # Variable info is tracked in "Available" section, no need for verbose inline output
-
- # Clean up temporary variables
- self.namespace.pop('__code_exec_coro__', None)
- self.namespace.pop('__code_exec__', None)
- else:
- # No await - execute directly at module level for natural variable scoping
- # This means x = x + 10 will work without needing 'global x'
-
- # Track variables before execution
- vars_before = set(self.namespace.keys())
-
- compiled_code = compile(code, '', 'exec')
- exec(compiled_code, self.namespace, self.namespace)
-
- # Track newly created/modified variables (info shown in "Available" section)
- vars_after = set(self.namespace.keys())
- new_vars = vars_after - vars_before
-
- # Get output
- output_value = sys.stdout.getvalue()
- if output_value:
- output = output_value
-
- finally:
- sys.stdout = old_stdout
-
- # Wait 2 seconds for page to stabilize after code execution
- await asyncio.sleep(0.5)
-
- # Note: Browser state is now fetched right before LLM call instead of after each execution
- # This reduces unnecessary state fetches for operations that don't affect the browser
-
- cell.status = ExecutionStatus.SUCCESS
- cell.output = output
- cell.browser_state = None # Will be captured in next iteration before LLM call
-
- except Exception as e:
- # Handle EvaluateError specially - JavaScript execution failed
- if isinstance(e, EvaluateError):
- error = str(e)
- cell.status = ExecutionStatus.ERROR
- cell.error = error
- logger.error(f'Code execution error: {error}')
-
- await asyncio.sleep(1)
-
- # Browser state will be fetched before next LLM call
- # Return immediately - do not continue executing code
- return output, error, None
-
- # Handle NameError specially - check for code block variable confusion
- if isinstance(e, NameError):
- error_msg = str(e)
- cell.status = ExecutionStatus.ERROR
- cell.error = error
-
- # Browser state will be fetched before next LLM call
- await asyncio.sleep(0.5)
- return output, error, None
-
- # For syntax errors and common parsing errors, show just the error message
- # without the full traceback to keep output clean
- if isinstance(e, SyntaxError):
- error_msg = e.msg if e.msg else str(e)
- error = f'{type(e).__name__}: {error_msg}'
-
- # Detect common f-string issues with JSON/JavaScript code
- if 'unterminated' in error_msg.lower() and 'string' in error_msg.lower() and code:
- # Check if code contains f-strings with potential JSON/JS content
- has_fstring = bool(re.search(r'\bf["\']', code))
- has_json_pattern = bool(re.search(r'json\.dumps|"[^"]*\{[^"]*\}[^"]*"|\'[^\']*\{[^\']*\}[^\']*\'', code))
- has_js_pattern = bool(re.search(r'evaluate\(|await evaluate', code))
-
- if has_fstring and (has_json_pattern or has_js_pattern):
- error += (
- '\n\n💡 TIP: Detected f-string with JSON/JavaScript code containing {}.\n'
- ' Use separate ```js or ```markdown blocks instead of f-strings to avoid escaping issues.\n'
- ' If your code block needs ``` inside it, wrap with 4+ backticks: ````markdown code`\n'
- )
-
- # Detect and provide helpful hints for common string literal errors
- if 'unterminated' in error_msg.lower() and 'string' in error_msg.lower():
- # Detect what type of string literal is unterminated
- is_triple = 'triple-quoted' in error_msg.lower()
- msg_lower = error_msg.lower()
-
- # Detect prefix type from error message
- if 'f-string' in msg_lower and 'raw' in msg_lower:
- prefix = 'rf or fr'
- desc = 'raw f-string'
- elif 'f-string' in msg_lower:
- prefix = 'f'
- desc = 'f-string'
- elif 'raw' in msg_lower and 'bytes' in msg_lower:
- prefix = 'rb or br'
- desc = 'raw bytes'
- elif 'raw' in msg_lower:
- prefix = 'r'
- desc = 'raw string'
- elif 'bytes' in msg_lower:
- prefix = 'b'
- desc = 'bytes'
- else:
- prefix = ''
- desc = 'string'
-
- # Build hint based on triple-quoted vs single/double quoted
- if is_triple:
- if prefix:
- hint = f"Hint: Unterminated {prefix}'''...''' or {prefix}\"\"\"...\"\" ({desc}). Check for missing closing quotes or unescaped quotes inside."
- else:
- hint = "Hint: Unterminated '''...''' or \"\"\"...\"\" detected. Check for missing closing quotes or unescaped quotes inside."
- hint += '\n If you need ``` inside your string, use a ````markdown varname` code block with 4+ backticks instead.'
- else:
- if prefix:
- hint = f'Hint: Unterminated {prefix}\'...\' or {prefix}"..." ({desc}). Check for missing closing quote or unescaped quotes inside.'
- else:
- hint = 'Hint: Unterminated \'...\' or "..." detected. Check for missing closing quote or unescaped quotes inside the string.'
- error += f'\n{hint}'
-
- # Show the problematic line from the code
- if e.text:
- error += f'\n{e.text}'
- elif e.lineno and code:
- # If e.text is empty, extract the line from the code
- lines = code.split('\n')
- if 0 < e.lineno <= len(lines):
- error += f'\n{lines[e.lineno - 1]}'
-
- else:
- # For other errors, try to extract useful information
- error_str = str(e)
- error = f'{type(e).__name__}: {error_str}' if error_str else f'{type(e).__name__} occurred'
-
- # For RuntimeError or other exceptions, try to extract traceback info
- # to show which line in the user's code actually failed
- if hasattr(e, '__traceback__'):
- # Walk the traceback to find the frame with '' filename
- tb = e.__traceback__
- user_code_lineno = None
- while tb is not None:
- frame = tb.tb_frame
- if frame.f_code.co_filename == '':
- # Found the frame executing user code
- # Get the line number from the traceback
- user_code_lineno = tb.tb_lineno
- break
- tb = tb.tb_next
-
- cell.status = ExecutionStatus.ERROR
- cell.error = error
- logger.error(f'Code execution error: {error}')
-
- await asyncio.sleep(1)
-
- # Browser state will be fetched before next LLM call
-
- return output, error, None
-
- async def _get_browser_state(self) -> tuple[str, str | None]:
- """Get the current browser state as text with ultra-minimal DOM structure for code agents.
-
- Returns:
- Tuple of (browser_state_text, screenshot_base64)
- """
- if not self.browser_session or not self.dom_service:
- return 'Browser state not available', None
-
- try:
- # Get full browser state including screenshot if use_vision is enabled
- include_screenshot = True
- state = await self.browser_session.get_browser_state_summary(include_screenshot=include_screenshot)
-
- # Format browser state with namespace context
- browser_state_text = await format_browser_state_for_llm(
- state=state, namespace=self.namespace, browser_session=self.browser_session
- )
-
- screenshot = state.screenshot if include_screenshot else None
- return browser_state_text, screenshot
-
- except Exception as e:
- logger.error(f'Failed to get browser state: {e}')
- return f'Error getting browser state: {e}', None
-
- def _format_execution_result(self, code: str, output: str | None, error: str | None, current_step: int | None = None) -> str:
- """Format the execution result for the LLM (without browser state)."""
- result = []
-
- # Add step progress header if step number provided
- if current_step is not None:
- progress_header = f'Step {current_step}/{self.max_steps} executed'
- # Add consecutive failure tracking if there are errors
- if error and self._consecutive_errors > 0:
- progress_header += f' | Consecutive failures: {self._consecutive_errors}/{self.max_failures}'
- result.append(progress_header)
-
- if error:
- result.append(f'Error: {error}')
-
- if output:
- # Truncate output if too long
- if len(output) > 10000:
- output = output[:9950] + '\n[Truncated after 10000 characters]'
- result.append(f'Output: {output}')
- if len(result) == 0:
- result.append('Executed')
- return '\n'.join(result)
-
- def _is_task_done(self) -> bool:
- """Check if the task is marked as done in the namespace."""
- # Check if 'done' was called by looking for a special marker in namespace
- return self.namespace.get('_task_done', False)
-
- async def _capture_screenshot(self, step_number: int) -> str | None:
- """Capture and store screenshot for eval tracking."""
- if not self.browser_session:
- return None
-
- try:
- # Get browser state summary which includes screenshot
- state = await self.browser_session.get_browser_state_summary(include_screenshot=True)
- if state and state.screenshot:
- # Store screenshot using screenshot service
- screenshot_path = await self.screenshot_service.store_screenshot(state.screenshot, step_number)
- return str(screenshot_path) if screenshot_path else None
- except Exception as e:
- logger.warning(f'Failed to capture screenshot for step {step_number}: {e}')
- return None
-
- async def _add_step_to_complete_history(
- self,
- model_output_code: str,
- full_llm_response: str,
- output: str | None,
- error: str | None,
- screenshot_path: str | None,
- ) -> None:
- """Add a step to complete_history using type-safe models."""
- # Get current browser URL and title for state
- url: str | None = None
- title: str | None = None
- if self.browser_session:
- try:
- url = await self.browser_session.get_current_page_url()
- # Get title from browser
- cdp_session = await self.browser_session.get_or_create_cdp_session()
- result = await cdp_session.cdp_client.send.Runtime.evaluate(
- params={'expression': 'document.title', 'returnByValue': True},
- session_id=cdp_session.session_id,
- )
- title = result.get('result', {}).get('value')
- except Exception as e:
- logger.debug(f'Failed to get browser URL/title for history: {e}')
-
- # Check if this is a done result
- is_done = self._is_task_done()
-
- # Get self-reported success from done() call if task is done
- self_reported_success: bool | None = None
- if is_done:
- task_success = self.namespace.get('_task_success')
- self_reported_success = task_success if isinstance(task_success, bool) else None
-
- # Create result entry using typed model
- result_entry = CodeAgentResult(
- extracted_content=output if output else None,
- error=error if error else None,
- is_done=is_done,
- success=self_reported_success,
- )
-
- # Create state entry using typed model
- state_entry = CodeAgentState(url=url, title=title, screenshot_path=screenshot_path)
-
- # Create metadata entry using typed model
- step_end_time = datetime.datetime.now().timestamp()
- metadata_entry = CodeAgentStepMetadata(
- input_tokens=self._last_llm_usage.prompt_tokens if self._last_llm_usage else None,
- output_tokens=self._last_llm_usage.completion_tokens if self._last_llm_usage else None,
- step_start_time=self._step_start_time,
- step_end_time=step_end_time,
- )
-
- # Create model output entry using typed model (if there's code to track)
- model_output_entry: CodeAgentModelOutput | None = None
- if model_output_code or full_llm_response:
- model_output_entry = CodeAgentModelOutput(
- model_output=model_output_code if model_output_code else '',
- full_response=full_llm_response if full_llm_response else '',
- )
-
- # Create history entry using typed model
- history_entry = CodeAgentHistory(
- model_output=model_output_entry,
- result=[result_entry],
- state=state_entry,
- metadata=metadata_entry,
- screenshot_path=screenshot_path, # Keep for backward compatibility
- )
-
- self.complete_history.append(history_entry)
- await self._demo_mode_log_step(history_entry)
-
- async def _demo_mode_log(self, message: str, level: str = 'info', metadata: dict[str, Any] | None = None) -> None:
- if not (self._demo_mode_enabled and message and self.browser_session):
- return
- try:
- await self.browser_session.send_demo_mode_log(
- message=message,
- level=level,
- metadata=metadata or {},
- )
- except Exception as exc:
- logger.debug(f'[DemoMode] Failed to send log: {exc}')
-
- async def _demo_mode_log_step(self, history_entry: CodeAgentHistory) -> None:
- if not self._demo_mode_enabled:
- return
- step_number = len(self.complete_history)
- result = history_entry.result[0] if history_entry.result else None
- if not result:
- return
- level = 'error' if result.error else 'success' if result.success else 'info'
- message_parts = [f'Step {step_number}:']
- if result.error:
- message_parts.append(f'Error: {result.error}')
- if result.extracted_content:
- message_parts.append(result.extracted_content)
- elif result.success:
- message_parts.append('Marked done.')
- else:
- message_parts.append('Executed.')
- await self._demo_mode_log(
- ' '.join(message_parts).strip(),
- level,
- {'step': step_number, 'url': history_entry.state.url if history_entry.state else None},
- )
-
- def _add_sample_output_cell(self, final_result: Any | None) -> None:
- if self._sample_output_added or final_result is None:
- return
-
- sample_content: str | None = None
-
- def _extract_sample(data: Any) -> Any | None:
- if isinstance(data, list) and data:
- return data[0]
- if isinstance(data, dict) and data:
- first_key = next(iter(data))
- return {first_key: data[first_key]}
- return data if isinstance(data, (str, int, float, bool)) else None
-
- data: Any | None = None
- if isinstance(final_result, str):
- try:
- data = json.loads(final_result)
- except Exception:
- sample_content = final_result.strip()
- elif isinstance(final_result, (list, dict)):
- data = final_result
-
- if data is not None:
- sample = _extract_sample(data)
- if isinstance(sample, (dict, list)):
- try:
- sample_content = json.dumps(sample, indent=2, ensure_ascii=False)
- except Exception:
- sample_content = str(sample)
- elif sample is not None:
- sample_content = str(sample)
-
- if not sample_content:
- return
-
- sample_cell = self.session.add_cell(source='# Sample output preview')
- sample_cell.cell_type = CellType.MARKDOWN
- sample_cell.status = ExecutionStatus.SUCCESS
- sample_cell.execution_count = None
- escaped = html.escape(sample_content)
- sample_cell.output = f'{escaped} '
-
- self._sample_output_added = True
-
- def _log_agent_event(self, max_steps: int, agent_run_error: str | None = None) -> None:
- """Send the agent event for this run to telemetry."""
- from urllib.parse import urlparse
-
- token_summary = self.token_cost_service.get_usage_tokens_for_model(self.llm.model)
-
- # For CodeAgent, we don't have action history like Agent does
- # Instead we track the code execution cells
- action_history_data: list[list[dict[str, Any]] | None] = []
- for step in self.complete_history:
- # Extract code from model_output if available (type-safe access)
- if step.model_output and step.model_output.full_response:
- code = step.model_output.full_response
- # Represent each code cell as a simple action entry
- action_history_data.append([{'llm_response': code}])
- else:
- action_history_data.append(None)
-
- # Get final result from the last step or namespace (type-safe)
- final_result: Any = self.namespace.get('_task_result')
- final_result_str: str | None = final_result if isinstance(final_result, str) else None
-
- # Get URLs visited from complete_history (type-safe access)
- urls_visited: list[str] = []
- for step in self.complete_history:
- if step.state.url and step.state.url not in urls_visited:
- urls_visited.append(step.state.url)
-
- # Get errors from complete_history (type-safe access)
- errors: list[str] = []
- for step in self.complete_history:
- for result in step.result:
- if result.error:
- errors.append(result.error)
-
- # Determine success from task completion status (type-safe)
- is_done = self._is_task_done()
- task_success: Any = self.namespace.get('_task_success')
- self_reported_success: bool | None = task_success if isinstance(task_success, bool) else (False if is_done else None)
-
- self.telemetry.capture(
- AgentTelemetryEvent(
- task=self.task,
- model=self.llm.model,
- model_provider=self.llm.provider,
- max_steps=max_steps,
- max_actions_per_step=1, # CodeAgent executes one code cell per step
- use_vision=self.use_vision,
- version=self.version,
- source=self.source,
- cdp_url=urlparse(self.browser_session.cdp_url).hostname
- if self.browser_session and self.browser_session.cdp_url
- else None,
- agent_type='code', # CodeAgent identifier
- action_errors=errors,
- action_history=action_history_data,
- urls_visited=urls_visited,
- steps=len(self.complete_history),
- total_input_tokens=token_summary.prompt_tokens,
- total_output_tokens=token_summary.completion_tokens,
- prompt_cached_tokens=token_summary.prompt_cached_tokens,
- total_tokens=token_summary.total_tokens,
- total_duration_seconds=sum(step.metadata.duration_seconds for step in self.complete_history if step.metadata),
- success=self_reported_success,
- final_result_response=final_result_str,
- error_message=agent_run_error,
- )
- )
-
- def screenshot_paths(self, n_last: int | None = None) -> list[str | None]:
- """
- Get screenshot paths from complete_history for eval system.
-
- Args:
- n_last: Optional number of last screenshots to return
-
- Returns:
- List of screenshot file paths (or None for missing screenshots)
- """
- paths = [step.screenshot_path for step in self.complete_history]
-
- if n_last is not None:
- return paths[-n_last:] if len(paths) > n_last else paths
-
- return paths
-
- @property
- def message_manager(self) -> Any:
- """
- Compatibility property for eval system.
- Returns a mock object with last_input_messages attribute.
- """
-
- class MockMessageManager:
- def __init__(self, llm_messages: list[BaseMessage]) -> None:
- # Convert code-use LLM messages to format expected by eval system
- self.last_input_messages = llm_messages
-
- return MockMessageManager(self._llm_messages)
-
- @property
- def history(self) -> CodeAgentHistoryList:
- """
- Compatibility property for eval system.
- Returns a CodeAgentHistoryList object with history attribute containing complete_history.
- This is what the eval system expects when it does: agent_history = agent.history
- """
- return CodeAgentHistoryList(self.complete_history, self.usage_summary)
-
- async def close(self) -> None:
- """Close the browser session."""
- if self.browser_session:
- # Check if we should close the browser based on keep_alive setting
- if not self.browser_session.browser_profile.keep_alive:
- await self.browser_session.kill()
- else:
- logger.debug('Browser keep_alive is True, not closing browser session')
-
- async def __aenter__(self) -> 'CodeAgent':
- """Async context manager entry."""
- return self
-
- async def __aexit__(self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: Any) -> None:
- """Async context manager exit."""
- await self.close()
diff --git a/browser_use/code_use/system_prompt.md b/browser_use/code_use/system_prompt.md
deleted file mode 100644
index c1cd34f25..000000000
--- a/browser_use/code_use/system_prompt.md
+++ /dev/null
@@ -1,574 +0,0 @@
-# Coding Browser Agent - System Prompt
-
-You are created by browser-use for complex automated browser tasks.
-
-## Core Concept
-You execute Python code in a notebook like environment to control a browser and complete tasks.
-
-**Mental Model**: Write one code cell per step → Gets automatically executed → **you receive the new output + * in the next response you write the next code cell → Repeat.
-
-
----
-
-## INPUT: What You See
-
-### Browser State Format
-- **URL & DOM**: Compressed DOM tree with interactive elements marked as `[i_123]`
-- **Loading Status**: Network requests currently pending (automatically filtered for ads/tracking)
- - Shows URL, loading duration, and resource type for each pending request
-
-- **Element Markers**:
- - `[i_123]` - Interactive elements (buttons, inputs, links)
- - `|SHADOW(open/closed)|` - Shadow DOM boundaries (content auto-included)
- - `|IFRAME|` or `|FRAME|` - Iframe boundaries (content auto-included)
- - `|scroll element|` - Scrollable containers
-
-### Execution Environment
-- **Variables persist** across steps (like Jupyter) - NEVER use `global` keyword - thats not needed we do the injection for you.
-- **Multiple code blocks in ONE response are COMBINED** - earlier blocks' variables available in later blocks
-- **8 consecutive errors = auto-termination**
-
-### Multi-Block Code Support
-Non-Python blocks are saved as string variables:
-- ````js extract_products` → saved to `extract_products` variable (named blocks)
-- ````markdown result_summary` → saved to `result_summary` variable
-- ````bash bash_code` → saved to `bash_code` variable
-
-Variable name matches exactly what you write after language name!
-
-**Nested Code Blocks**: If your code contains ``` inside it (e.g., markdown with code blocks), use 4+ backticks:
-- `````markdown fix_code` with ``` inside → use 4 backticks to wrap
-- ``````python complex_code` with ```` inside → use 5+ backticks to wrap
-
----
-
-## OUTPUT: How You Respond
-
-### Response Format - Cell-by-Cell Execution
-
-**This is a Jupyter-like notebook environment**: Execute ONE code cell → See output + browser state → Execute next cell.
-
-[1 short sentence about previous step code result and new DOM]
-[1 short sentence about next step]
-
-```python
-# 1 cell of code here that will be executed
-print(results)
-```
-Stop generating and inspect the output before continuing.
-
-
-
-
-## TOOLS: Available Functions
-
-### 1. Navigation
-```python
-await navigate('https://example.com')
-await asyncio.sleep(1)
-```
-- **Auto-wait**: System automatically waits 1s if network requests are pending before showing you the state
-- Loaded fully? Check URL/DOM and **⏳ Loading** status in next browser state
-- If you see pending network requests in the state, consider waiting longer: `await asyncio.sleep(2)`
-- In your next browser state after navigation analyse the screenshot: Is data still loading? Do you expect more data? → Wait longer with.
-- All previous indices [i_index] become invalid after navigation
-
-**After navigate(), dismiss overlays**:
-```js dismiss_overlays
-(function(){
- const dismissed = [];
- ['button[id*="accept"]', '[class*="cookie"] button'].forEach(sel => {
- document.querySelectorAll(sel).forEach(btn => {
- if (btn.offsetParent !== null) {
- btn.click();
- dismissed.push('cookie');
- }
- });
- });
- document.dispatchEvent(new KeyboardEvent('keydown', {key: 'Escape', keyCode: 27}));
- return dismissed.length > 0 ? dismissed : null;
-})()
-```
-
-```python
-dismissed = await evaluate(dismiss_overlays)
-if dismissed:
- print(f"OK Dismissed: {dismissed}")
-```
-
-For web search use duckduckgo.com by default to avoid CAPTCHAS.
-If direct navigation is blocked by CAPTCHA or challenge that cannot be solved after one try, pivot to alternative methods: try alternative URLs for the same content, third-party aggregators (user intent has highest priority).
-
-### 2. Interactive Elements
-The index is the label inside your browser state [i_index] inside the element you want to interact with. Only use indices from the current state. After page changes these become invalid.
-```python
-await click(index=456) # accepts only index integer from browser state
-await input_text(index=456, text="hello", clear=True) # Clear False to append text
-await upload_file(index=789, path="/path/to/file.pdf")
-await dropdown_options(index=123)
-await select_dropdown(index=123, text="CA") # Text can be the element text or value.
-await scroll(down=True, pages=1.0, index=None) # Down=False to scroll up. Pages=10.0 to scroll 10 pages. Use Index to scroll in the container of this element.
-await send_keys(keys="Enter") # Use e.g. for Escape, Arrow keys, Page Up, Page Down, Home, End, etc.
-await switch(tab_id="a1b2") # Switch to a 4 character tab by id from the browser state.
-await close(tab_id="a1b2") # Close a tab by id from the browser state.
-await go_back() # Navigate back in the browser history.
-```
-
-Indices Work Only once. After page changes (click, navigation, DOM update), ALL indices `[i_*]` become invalid and must be re-queried.
-
-Do not do:
-```python
-link_indices = [456, 457, 458]
-for idx in link_indices:
- await click(index=idx) # FAILS - indices stale after first click
-```
-
-RIGHT - Option 1 (Extract URLs first):
-```python
-links = await evaluate('(function(){ return Array.from(document.querySelectorAll("a.product")).map(a => a.href); })()')
-for url in links:
- await navigate(url)
- # extract data
- await go_back()
-```
-
-
-### 3. get_selector_from_index(index: int) → str
-Get stable CSS selector for element with index `[i_456]`:
-
-```python
-import json
-selector = await get_selector_from_index(index=456)
-print(f"OK Selector: {selector}") # Always print for debugging!
-el_text = await evaluate(f'(function(){{ return document.querySelector({json.dumps(selector)}).textContent; }})()')
-```
-
-**When to use**:
-- Clicking same element type repeatedly (e.g., "Next" button in pagination)
-- Loops where DOM changes between iterations
-
-### 4. evaluate(js: str, variables: dict = None) → Python data
-Execute JavaScript, returns dict/list/str/number/bool/None.
-
-**ALWAYS use ```js blocks for anything beyond one-liners**:
-
-```js extract_products
-(function(){
- return Array.from(document.querySelectorAll('.product')).map(p => ({
- name: p.querySelector('.name')?.textContent,
- price: p.querySelector('.price')?.textContent
- }));
-})()
-```
-
-```python
-products = await evaluate(extract_products)
-print(f"Found {len(products)} products")
-```
-
-**Passing Python variables to JavaScript**:
-```js extract_data
-(function(params) {
- const maxItems = params.max_items || 100;
- return Array.from(document.querySelectorAll('.item'))
- .slice(0, maxItems)
- .map(item => ({name: item.textContent}));
-})
-```
-
-```python
-result = await evaluate(extract_data, variables={'max_items': 50})
-```
-
-**Key rules**:
-- Wrap in IIFE: `(function(){ ... })()`
-- For variables: use `(function(params){ ... })` without final `()`
-- NO JavaScript comments (`//` or `/* */`)
-- NO backticks (\`) inside code blocks
-- Use standard JS (NO jQuery)
-- Do optional checks - and print the results to help you debug.
-- Avoid complex queries where possible. Do all data processing in python.
-- Avoid syntax errors. For more complex data use json.dumps(data).
-
-### 5. done() - MANDATORY FINAL STEP
-Final Output with done(text:str, success:bool, files_to_display:list[str] = [])
-
-```python
-summary = "Successfully extracted 600 items on 40 pages and saved them to the results.json file."
-await done(
- text=summary,
- success=True,
- files_to_display=['results.json', 'data.csv']
-)
-```
-
-**Rules**:
-1. `done()` must be the ONLY statement in this cell/response. In the steps before you must verify the final result.
-3. For structured data/code: write to files, use `files_to_display`
-4. For short tasks (<5 lines output): print directly in `done(text=...)`, skip file creation
-5. NEVER embed JSON/code blocks in markdown templates (breaks `.format()`). Instead use json.dumps(data) or + to concatenate strings.
-6. Set `success=False` if task impossible after many many different attempts
-
-
----
-
-## HINTS: Common Patterns & Pitfalls
-
-### JavaScript Search > Scrolling
-Before scrolling 2+ times, use JS to search entire document:
-
-```js search_document
-(function(){
- const fullText = document.body.innerText;
- return {
- found: fullText.includes('Balance Sheet'),
- sampleText: fullText.substring(0, 200)
- };
-})()
-```
-
-### Verify Search Results Loaded
-After search submission, ALWAYS verify results exist:
-
-```js verify_search_results
-(function(){
- return document.querySelectorAll("[class*=\\"result\\"]").length;
-})()
-```
-
-```python
-await input_text(index=SEARCH_INPUT, text="query", clear=True)
-await send_keys(keys="Enter")
-await asyncio.sleep(1)
-
-result_count = await evaluate(verify_search_results)
-if result_count == 0:
- print("Search failed, trying alternative")
- await navigate(f"https://site.com/search?q={query.replace(' ', '+')}")
-else:
- print(f"Search returned {result_count} results")
-```
-
-### Handle Dynamic/Obfuscated Classes
-Modern sites use hashed classes (`_30jeq3`). After 2 failures, switch strategy:
-In the exploration phase you can combine multiple in parallel with error handling to find the best approach quickly..
-
-**Strategy 1**: Extract by structure/position
-```js extract_products_by_structure
-(function(){
- return Array.from(document.querySelectorAll('.product')).map(p => {
- const link = p.querySelector('a[href*="/product/"]');
- const priceContainer = p.querySelector('div:nth-child(3)');
- return {
- name: link?.textContent,
- priceText: priceContainer?.textContent
- };
- });
-})()
-```
-
-**Strategy 2**: Extract all text, parse in Python with regex
-```python
-items = await evaluate(extract_products_by_structure)
-import re
-for item in items:
- prices = re.findall(r'[$₹€][\d,]+', item['priceText'])
- item['price'] = prices[0] if prices else None
-```
-
-**Strategy 3**: Debug by printing structure
-```js print_structure
-(function(){
- const el = document.querySelector('.product');
- return {
- html: el?.outerHTML.substring(0, 500),
- classes: Array.from(el?.querySelectorAll('*') || [])
- .map(e => e.className)
- .filter(c => c.includes('price'))
- };
-})()
-```
-
-### Pagination: Try URL First
-**Priority order**:
-1. **Try URL parameters** (1 attempt): `?page=2`, `?p=2`, `?offset=20`, `/page/2/`
-2. **If URL fails, search & click the next page button**
-
-### Pre-Extraction Checklist
-First verify page is loaded and you set the filters/settings correctly:
-
-```js product_count
-(function(){
- return document.querySelectorAll(".product").length;
-})()
-```
-
-```python
-print("=== Applying filters ===")
-await select_dropdown(index=789, text="Under $100")
-await click(index=567) # Apply button
-print("OK Filters applied")
-
-filtered_count = await evaluate(product_count)
-print(f"OK Page loaded with {filtered_count} products")
-```
----
-
-## STRATEGY: Execution Flow
-
-### Phase 1: Exploration
-- Navigate to target URL
-- Dismiss overlays (cookies, modals)
-- Apply all filters/settings BEFORE extraction
-- Use JavaScript to search entire document for target content
-- Explore DOM structure with various small test extractions in parallel with error handling
-- Use try/except and null checks
-- Print sub-information to validate approach
-
-### Phase 2: Validation (Execute Cell-by-Cell!)
-- Write general extraction function
-- Test on small subset (1-5 items) with error handling
-- Verify data structure in Python
-- Check for missing/null fields
-- Print sample data
-- If extraction fails 2x, switch strategy
-
-### Phase 3: Batch Processing
-- Once strategy validated, increase batch size
-- Loop with explicit counters
-- Save incrementally to avoid data loss
-- Handle pagination (URL first, then buttons)
-- Track progress: `print(f"Page {i}: {len(items)} items. Total: {len(all_data)}")`
-- Check if it works and then increase the batch size.
-
-### Phase 4: Cleanup & Verification
-- Verify all required data collected
-- Filter duplicates
-- Missing fields / Data? -> change strategy and keep going.
-- Format/clean data in Python (NOT JavaScript)
-- Write to files (JSON/CSV)
-- Print final stats, but not all the data to avoid overwhelming the context.
-- Inspect the output and reason if this is exactly the user intent or if the user wants more.
-
-### Phase 5: Done
-- Verify task completion
-- Call `done()` with summary + `files_to_display`
-
----
-
-## EXAMPLE: Complete Flow
-
-**Task**: Extract products from paginated e-commerce site, save to JSON
-
-### Step 1: Navigate + Dismiss Overlays
-
-```js page_loaded
-(function(){
- return document.readyState === 'complete';
-})()
-```
-
-```python
-await navigate('https://example.com/products')
-await asyncio.sleep(2)
-loaded = await evaluate(page_loaded)
-if not loaded:
- print("Page not loaded, trying again")
- await asyncio.sleep(1)
-
-```
-### Receive current browser state after cell execution - analyse it.
-
-### Step 2: Dismiss Modals
-```js dismiss_overlays
-(function(){
- document.querySelectorAll('button[id*="accept"]').forEach(b => b.click());
- document.dispatchEvent(new KeyboardEvent('keydown', {key: 'Escape'}));
- return 'dismissed';
-})()
-```
-
-```python
-await evaluate(dismiss_overlays)
-```
-
-### Step 3: Apply Filters
-```python
-await select_dropdown(index=123, text="Under $50")
-await click(index=456) # Apply filters button
-```
-
-### Step 4: Explore - Test Single Element
-```js test_single_element
-(function(){
- const first = document.querySelector('.product');
- return {
- html: first?.outerHTML.substring(0, 300),
- name: first?.querySelector('.name')?.textContent,
- price: first?.querySelector('.price')?.textContent
- };
-})()
-```
-
-```js find_heading_by_text
-(function(){
- const headings = Array.from(document.querySelectorAll('h2, h3'));
- const target = headings.find(h => h.textContent.includes('Full Year 2024'));
- return target ? target.textContent : null;
-})()
-```
-
-```js find_element_by_text_content
-(function(){
- const elements = Array.from(document.querySelectorAll('dt'));
- const locationLabel = elements.find(el => el.textContent.includes('Location'));
- const nextSibling = locationLabel?.nextElementSibling;
- return nextSibling ? nextSibling.textContent : null;
-})()
-```
-
-```js get_product_urls
-(function(){
- return Array.from(document.querySelectorAll('a[href*="product"]').slice(0, 10)).map(a => a.href);
-})()
-```
-
-```python
-# load more
-scroll(down=True, pages=3.0)
-await asyncio.sleep(0.5)
-scroll(down=False, pages=2.5)
-try:
- list_of_urls = await evaluate(get_product_urls)
- print(f"found {len(list_of_urls)} product urls, sample {list_of_urls[0] if list_of_urls else 'no urls found'}")
-except Exception as e:
- # different strategies
- print("Error: No elements found")
-try:
- test = await evaluate(test_single_element)
- print(f"Sample product: {test}")
-except Exception as e:
- # different strategies
- print(f"Error: {e}")
-```
-
-### Step 5: Write General Extraction Function
-```js extract_products
-(function(){
- return Array.from(document.querySelectorAll('.product')).map(p => ({
- name: p.querySelector('.name')?.textContent?.trim(),
- price: p.querySelector('.price')?.textContent?.trim(),
- url: p.querySelector('a')?.href
- })).filter(p => p.name && p.price);
-})()
-```
-
-```python
-products_page1 = await evaluate(extract_products)
-print(f"Extracted {len(products_page1)} products from page 1: {products_page1[0] if products_page1 else 'no products found'}")
-```
-
-### Step 6: Test Pagination with URL
-```python
-await navigate('https://example.com/products?page=2')
-await asyncio.sleep(2)
-products_page2 = await evaluate(extract_products)
-if len(products_page2) > 0:
- print("OK URL pagination works!")
-```
-
-### Step 7: Loop and Collect All Pages
-```python
-all_products = []
-page_num = 1
-
-while page_num <= 50:
- url = f"https://example.com/products?page={page_num}"
- await navigate(url)
- await asyncio.sleep(3)
-
- items = await evaluate(extract_products)
- if len(items) == 0:
- print(f"Page {page_num} empty - reached end")
- break
-
- all_products.extend(items)
- print(f"Page {page_num}: {len(items)} items. Total: {len(all_products)}")
- page_num += 1
- # if you have to click in the loop use selector and not the interactive index, because they invalidate after navigation.
-```
-
-### Step 8: Clean Data & Deduplicate
-```python
-import re
-
-for product in all_products:
- price_str = product['price']
- price_clean = re.sub(r'[^0-9.]', '', price_str)
- product['price_numeric'] = float(price_clean) if price_clean else None
-
-# deduplicate
-all_products = list(set(all_products))
-# number of prices
-valid_products = [p for p in all_products if p.get('price_numeric')]
-
-print(f"OK {len(valid_products)} valid products with prices")
-print(f"OK Cleaned {len(all_products)} products")
-print(f"Sample cleaned: {json.dumps(valid_products[0], indent=2) if valid_products else 'no products found'}")
-```
-
-### Step 9: Prepare output, write File & verify result
-
-
-```markdown summary
-# Product Extraction Complete
-
-Successfully extracted 100 products from 20 pages.
-
-Full data saved to: products.json.
-
-```
-```python
-
-with open('products.json', 'w', encoding='utf-8') as f:
- json.dump(valid_products, f, indent=2, ensure_ascii=False)
-
-print(f"OK Wrote products.json ({len(valid_products)} products)")
-sample = json.dumps(valid_products[0], indent=2)
-
-# Be careful with escaping and always print before using done.
-final_summary = summary + "\nSample:\n" + sample
-print(summary)
-```
-
-### Stop and inspect the output before continuing.
-### If data is missing go back and change the strategy until all data is collected or you reach max steps.
-
-### Step 10: Done in single response (After verifying the previous output)
-
-
-```python
-await done(text=final_summary, success=True, files_to_display=['products.json'])
-```
-
----
-
-## CRITICAL RULES
-
-1. **NO `global` keyword** - Variables persist automatically
-2. **No comments** in Python or JavaScript code, write concise code.
-3. **Verify results after search** - Check result count > 0
-4. **Call done(text, success) in separate step** - After verifying results - else continue
-5. **Write structured data to files** - Never embed in markdown
-6. Do not use jQuery.
-7. Reason about the browser state and what you need to keep in mind on this page. E.g. popups, dynamic content, closed shadow DOM, iframes, scroll to load more...
-8. If selectors fail, simply try different once. Print many and then try different strategies.
----
-
-## Available Libraries
-**Pre-imported**: `json`, `asyncio`, `csv`, `re`, `datetime`, `Path`, `requests`
-
-
-## User Task
-Analyze user intent and complete the task successfully. Do not stop until completed.
-Respond in the format the user requested.
diff --git a/browser_use/code_use/utils.py b/browser_use/code_use/utils.py
deleted file mode 100644
index 8c00193fd..000000000
--- a/browser_use/code_use/utils.py
+++ /dev/null
@@ -1,150 +0,0 @@
-"""Utility functions for code-use agent."""
-
-import re
-
-
-def truncate_message_content(content: str, max_length: int = 10000) -> str:
- """Truncate message content to max_length characters for history."""
- if len(content) <= max_length:
- return content
- # Truncate and add marker
- return content[:max_length] + f'\n\n[... truncated {len(content) - max_length} characters for history]'
-
-
-def detect_token_limit_issue(
- completion: str,
- completion_tokens: int | None,
- max_tokens: int | None,
- stop_reason: str | None,
-) -> tuple[bool, str | None]:
- """
- Detect if the LLM response hit token limits or is repetitive garbage.
-
- Returns: (is_problematic, error_message)
- """
- # Check 1: Stop reason indicates max_tokens
- if stop_reason == 'max_tokens':
- return True, f'Response terminated due to max_tokens limit (stop_reason: {stop_reason})'
-
- # Check 2: Used 90%+ of max_tokens (if we have both values)
- if completion_tokens is not None and max_tokens is not None and max_tokens > 0:
- usage_ratio = completion_tokens / max_tokens
- if usage_ratio >= 0.9:
- return True, f'Response used {usage_ratio:.1%} of max_tokens ({completion_tokens}/{max_tokens})'
-
- # Check 3: Last 6 characters repeat 40+ times (repetitive garbage)
- if len(completion) >= 6:
- last_6 = completion[-6:]
- repetition_count = completion.count(last_6)
- if repetition_count >= 40:
- return True, f'Repetitive output detected: last 6 chars "{last_6}" appears {repetition_count} times'
-
- return False, None
-
-
-def extract_url_from_task(task: str) -> str | None:
- """Extract URL from task string using naive pattern matching."""
- # Remove email addresses from task before looking for URLs
- task_without_emails = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', task)
-
- # Look for common URL patterns
- patterns = [
- r'https?://[^\s<>"\']+', # Full URLs with http/https
- r'(?:www\.)?[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?', # Domain names with subdomains and optional paths
- ]
-
- found_urls = []
- for pattern in patterns:
- matches = re.finditer(pattern, task_without_emails)
- for match in matches:
- url = match.group(0)
-
- # Remove trailing punctuation that's not part of URLs
- url = re.sub(r'[.,;:!?()\[\]]+$', '', url)
- # Add https:// if missing
- if not url.startswith(('http://', 'https://')):
- url = 'https://' + url
- found_urls.append(url)
-
- unique_urls = list(set(found_urls))
- # If multiple URLs found, skip auto-navigation to avoid ambiguity
- if len(unique_urls) > 1:
- return None
-
- # If exactly one URL found, return it
- if len(unique_urls) == 1:
- return unique_urls[0]
-
- return None
-
-
-def extract_code_blocks(text: str) -> dict[str, str]:
- """Extract all code blocks from markdown response.
-
- Supports:
- - ```python, ```js, ```javascript, ```bash, ```markdown, ```md
- - Named blocks: ```js variable_name → saved as 'variable_name' in namespace
- - Nested blocks: Use 4+ backticks for outer block when inner content has 3 backticks
-
- Returns dict mapping block_name -> content
-
- Note: Python blocks are NO LONGER COMBINED. Each python block executes separately
- to allow sequential execution with JS/bash blocks in between.
- """
- # Pattern to match code blocks with language identifier and optional variable name
- # Matches: ```lang\n or ```lang varname\n or ````+lang\n (4+ backticks for nested blocks)
- # Uses non-greedy matching and backreferences to match opening/closing backticks
- pattern = r'(`{3,})(\w+)(?:\s+(\w+))?\n(.*?)\1(?:\n|$)'
- matches = re.findall(pattern, text, re.DOTALL)
-
- blocks: dict[str, str] = {}
- python_block_counter = 0
-
- for backticks, lang, var_name, content in matches:
- lang = lang.lower()
-
- # Normalize language names
- if lang in ('javascript', 'js'):
- lang_normalized = 'js'
- elif lang in ('markdown', 'md'):
- lang_normalized = 'markdown'
- elif lang in ('sh', 'shell'):
- lang_normalized = 'bash'
- elif lang == 'python':
- lang_normalized = 'python'
- else:
- # Unknown language, skip
- continue
-
- # Only process supported types
- if lang_normalized in ('python', 'js', 'bash', 'markdown'):
- content = content.rstrip() # Only strip trailing whitespace, preserve leading for indentation
- if content:
- # Determine the key to use
- if var_name:
- # Named block - use the variable name
- block_key = var_name
- blocks[block_key] = content
- elif lang_normalized == 'python':
- # Unnamed Python blocks - give each a unique key to preserve order
- block_key = f'python_{python_block_counter}'
- blocks[block_key] = content
- python_block_counter += 1
- else:
- # Other unnamed blocks (js, bash, markdown) - keep last one only
- blocks[lang_normalized] = content
-
- # If we have multiple python blocks, mark the first one as 'python' for backward compat
- if python_block_counter > 0:
- blocks['python'] = blocks['python_0']
-
- # Fallback: if no python block but there's generic ``` block, treat as python
- if python_block_counter == 0 and 'python' not in blocks:
- generic_pattern = r'```\n(.*?)```'
- generic_matches = re.findall(generic_pattern, text, re.DOTALL)
- if generic_matches:
- combined = '\n\n'.join(m.strip() for m in generic_matches if m.strip())
- if combined:
- blocks['python'] = combined
-
- return blocks
diff --git a/browser_use/code_use/views.py b/browser_use/code_use/views.py
deleted file mode 100644
index 676fa4ab2..000000000
--- a/browser_use/code_use/views.py
+++ /dev/null
@@ -1,403 +0,0 @@
-"""Data models for code-use mode."""
-
-from __future__ import annotations
-
-import json
-from enum import Enum
-from pathlib import Path
-from typing import Any
-
-from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
-from uuid_extensions import uuid7str
-
-from browser_use.tokens.views import UsageSummary
-
-
-class CellType(str, Enum):
- """Type of notebook cell."""
-
- CODE = 'code'
- MARKDOWN = 'markdown'
-
-
-class ExecutionStatus(str, Enum):
- """Execution status of a cell."""
-
- PENDING = 'pending'
- RUNNING = 'running'
- SUCCESS = 'success'
- ERROR = 'error'
-
-
-class CodeCell(BaseModel):
- """Represents a code cell in the notebook-like execution."""
-
- model_config = ConfigDict(extra='forbid')
-
- id: str = Field(default_factory=uuid7str)
- cell_type: CellType = CellType.CODE
- source: str = Field(description='The code to execute')
- output: str | None = Field(default=None, description='The output of the code execution')
- execution_count: int | None = Field(default=None, description='The execution count')
- status: ExecutionStatus = Field(default=ExecutionStatus.PENDING)
- error: str | None = Field(default=None, description='Error message if execution failed')
- browser_state: str | None = Field(default=None, description='Browser state after execution')
-
-
-class NotebookSession(BaseModel):
- """Represents a notebook-like session."""
-
- model_config = ConfigDict(extra='forbid')
-
- id: str = Field(default_factory=uuid7str)
- cells: list[CodeCell] = Field(default_factory=list)
- current_execution_count: int = Field(default=0)
- namespace: dict[str, Any] = Field(default_factory=dict, description='Current namespace state')
- _complete_history: list[CodeAgentHistory] = PrivateAttr(default_factory=list)
- _usage_summary: UsageSummary | None = PrivateAttr(default=None)
-
- def add_cell(self, source: str) -> CodeCell:
- """Add a new code cell to the session."""
- cell = CodeCell(source=source)
- self.cells.append(cell)
- return cell
-
- def get_cell(self, cell_id: str) -> CodeCell | None:
- """Get a cell by ID."""
- for cell in self.cells:
- if cell.id == cell_id:
- return cell
- return None
-
- def get_latest_cell(self) -> CodeCell | None:
- """Get the most recently added cell."""
- if self.cells:
- return self.cells[-1]
- return None
-
- def increment_execution_count(self) -> int:
- """Increment and return the execution count."""
- self.current_execution_count += 1
- return self.current_execution_count
-
- @property
- def history(self) -> CodeAgentHistoryList:
- """Get the history as an AgentHistoryList-compatible object."""
- return CodeAgentHistoryList(self._complete_history, self._usage_summary)
-
-
-class NotebookExport(BaseModel):
- """Export format for Jupyter notebook."""
-
- model_config = ConfigDict(extra='forbid')
-
- nbformat: int = Field(default=4)
- nbformat_minor: int = Field(default=5)
- metadata: dict[str, Any] = Field(default_factory=dict)
- cells: list[dict[str, Any]] = Field(default_factory=list)
-
-
-class CodeAgentModelOutput(BaseModel):
- """Model output for CodeAgent - contains the code and full LLM response."""
-
- model_config = ConfigDict(extra='forbid')
-
- model_output: str = Field(description='The extracted code from the LLM response')
- full_response: str = Field(description='The complete LLM response including any text/reasoning')
-
-
-class CodeAgentResult(BaseModel):
- """Result of executing a code cell in CodeAgent."""
-
- model_config = ConfigDict(extra='forbid')
-
- extracted_content: str | None = Field(default=None, description='Output from code execution')
- error: str | None = Field(default=None, description='Error message if execution failed')
- is_done: bool = Field(default=False, description='Whether task is marked as done')
- success: bool | None = Field(default=None, description='Self-reported success from done() call')
-
-
-class CodeAgentState(BaseModel):
- """State information for a CodeAgent step."""
-
- model_config = ConfigDict(extra='forbid', arbitrary_types_allowed=True)
-
- url: str | None = Field(default=None, description='Current page URL')
- title: str | None = Field(default=None, description='Current page title')
- screenshot_path: str | None = Field(default=None, description='Path to screenshot file')
-
- def get_screenshot(self) -> str | None:
- """Load screenshot from disk and return as base64 string."""
- if not self.screenshot_path:
- return None
-
- import base64
- from pathlib import Path
-
- path_obj = Path(self.screenshot_path)
- if not path_obj.exists():
- return None
-
- try:
- with open(path_obj, 'rb') as f:
- screenshot_data = f.read()
- return base64.b64encode(screenshot_data).decode('utf-8')
- except Exception:
- return None
-
-
-class CodeAgentStepMetadata(BaseModel):
- """Metadata for a single CodeAgent step including timing and token information."""
-
- model_config = ConfigDict(extra='forbid')
-
- input_tokens: int | None = Field(default=None, description='Number of input tokens used')
- output_tokens: int | None = Field(default=None, description='Number of output tokens used')
- step_start_time: float = Field(description='Step start timestamp (Unix time)')
- step_end_time: float = Field(description='Step end timestamp (Unix time)')
-
- @property
- def duration_seconds(self) -> float:
- """Calculate step duration in seconds."""
- return self.step_end_time - self.step_start_time
-
-
-class CodeAgentHistory(BaseModel):
- """History item for CodeAgent actions."""
-
- model_config = ConfigDict(extra='forbid', arbitrary_types_allowed=True)
-
- model_output: CodeAgentModelOutput | None = Field(default=None, description='LLM output for this step')
- result: list[CodeAgentResult] = Field(default_factory=list, description='Results from code execution')
- state: CodeAgentState = Field(description='Browser state at this step')
- metadata: CodeAgentStepMetadata | None = Field(default=None, description='Step timing and token metadata')
- screenshot_path: str | None = Field(default=None, description='Legacy field for screenshot path')
-
- def model_dump(self, **kwargs) -> dict[str, Any]:
- """Custom serialization for CodeAgentHistory."""
- return {
- 'model_output': self.model_output.model_dump() if self.model_output else None,
- 'result': [r.model_dump() for r in self.result],
- 'state': self.state.model_dump(),
- 'metadata': self.metadata.model_dump() if self.metadata else None,
- 'screenshot_path': self.screenshot_path,
- }
-
-
-class CodeAgentHistoryList:
- """Compatibility wrapper for CodeAgentHistory that provides AgentHistoryList-like API."""
-
- def __init__(self, complete_history: list[CodeAgentHistory], usage_summary: UsageSummary | None) -> None:
- """Initialize with CodeAgent history data."""
- self._complete_history = complete_history
- self._usage_summary = usage_summary
-
- @property
- def history(self) -> list[CodeAgentHistory]:
- """Get the raw history list."""
- return self._complete_history
-
- @property
- def usage(self) -> UsageSummary | None:
- """Get the usage summary."""
- return self._usage_summary
-
- def __len__(self) -> int:
- """Return the number of history items."""
- return len(self._complete_history)
-
- def __str__(self) -> str:
- """Representation of the CodeAgentHistoryList object."""
- return f'CodeAgentHistoryList(steps={len(self._complete_history)}, action_results={len(self.action_results())})'
-
- def __repr__(self) -> str:
- """Representation of the CodeAgentHistoryList object."""
- return self.__str__()
-
- def final_result(self) -> None | str:
- """Final result from history."""
- if self._complete_history and self._complete_history[-1].result:
- return self._complete_history[-1].result[-1].extracted_content
- return None
-
- def is_done(self) -> bool:
- """Check if the agent is done."""
- if self._complete_history and len(self._complete_history[-1].result) > 0:
- last_result = self._complete_history[-1].result[-1]
- return last_result.is_done is True
- return False
-
- def is_successful(self) -> bool | None:
- """Check if the agent completed successfully."""
- if self._complete_history and len(self._complete_history[-1].result) > 0:
- last_result = self._complete_history[-1].result[-1]
- if last_result.is_done is True:
- return last_result.success
- return None
-
- def errors(self) -> list[str | None]:
- """Get all errors from history, with None for steps without errors."""
- errors = []
- for h in self._complete_history:
- step_errors = [r.error for r in h.result if r.error]
- # each step can have only one error
- errors.append(step_errors[0] if step_errors else None)
- return errors
-
- def has_errors(self) -> bool:
- """Check if the agent has any non-None errors."""
- return any(error is not None for error in self.errors())
-
- def urls(self) -> list[str | None]:
- """Get all URLs from history."""
- return [h.state.url if h.state.url is not None else None for h in self._complete_history]
-
- def screenshot_paths(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
- """Get all screenshot paths from history."""
- if n_last == 0:
- return []
- if n_last is None:
- if return_none_if_not_screenshot:
- return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self._complete_history]
- else:
- return [h.state.screenshot_path for h in self._complete_history if h.state.screenshot_path is not None]
- else:
- if return_none_if_not_screenshot:
- return [
- h.state.screenshot_path if h.state.screenshot_path is not None else None
- for h in self._complete_history[-n_last:]
- ]
- else:
- return [h.state.screenshot_path for h in self._complete_history[-n_last:] if h.state.screenshot_path is not None]
-
- def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
- """Get all screenshots from history as base64 strings."""
- if n_last == 0:
- return []
- history_items = self._complete_history if n_last is None else self._complete_history[-n_last:]
- screenshots = []
- for item in history_items:
- screenshot_b64 = item.state.get_screenshot()
- if screenshot_b64:
- screenshots.append(screenshot_b64)
- else:
- if return_none_if_not_screenshot:
- screenshots.append(None)
- return screenshots
-
- def action_results(self) -> list[CodeAgentResult]:
- """Get all results from history."""
- results = []
- for h in self._complete_history:
- results.extend([r for r in h.result if r])
- return results
-
- def extracted_content(self) -> list[str]:
- """Get all extracted content from history."""
- content = []
- for h in self._complete_history:
- content.extend([r.extracted_content for r in h.result if r.extracted_content])
- return content
-
- def number_of_steps(self) -> int:
- """Get the number of steps in the history."""
- return len(self._complete_history)
-
- def total_duration_seconds(self) -> float:
- """Get total duration of all steps in seconds."""
- total = 0.0
- for h in self._complete_history:
- if h.metadata:
- total += h.metadata.duration_seconds
- return total
-
- def last_action(self) -> None | dict:
- """Last action in history - returns the last code execution."""
- if self._complete_history and self._complete_history[-1].model_output:
- return {
- 'execute_code': {
- 'code': self._complete_history[-1].model_output.model_output,
- 'full_response': self._complete_history[-1].model_output.full_response,
- }
- }
- return None
-
- def action_names(self) -> list[str]:
- """Get all action names from history - returns 'execute_code' for each code execution."""
- action_names = []
- for action in self.model_actions():
- actions = list(action.keys())
- if actions:
- action_names.append(actions[0])
- return action_names
-
- def model_thoughts(self) -> list[Any]:
- """Get all thoughts from history - returns model_output for CodeAgent."""
- return [h.model_output for h in self._complete_history if h.model_output]
-
- def model_outputs(self) -> list[CodeAgentModelOutput]:
- """Get all model outputs from history."""
- return [h.model_output for h in self._complete_history if h.model_output]
-
- def model_actions(self) -> list[dict]:
- """Get all actions from history - returns code execution actions with their code."""
- actions = []
- for h in self._complete_history:
- if h.model_output:
- # Create one action dict per result (code execution)
- for _ in h.result:
- action_dict = {
- 'execute_code': {
- 'code': h.model_output.model_output,
- 'full_response': h.model_output.full_response,
- }
- }
- actions.append(action_dict)
- return actions
-
- def action_history(self) -> list[list[dict]]:
- """Get truncated action history grouped by step."""
- step_outputs = []
- for h in self._complete_history:
- step_actions = []
- if h.model_output:
- for result in h.result:
- action_dict = {
- 'execute_code': {
- 'code': h.model_output.model_output,
- },
- 'result': {
- 'extracted_content': result.extracted_content,
- 'is_done': result.is_done,
- 'success': result.success,
- 'error': result.error,
- },
- }
- step_actions.append(action_dict)
- step_outputs.append(step_actions)
- return step_outputs
-
- def model_actions_filtered(self, include: list[str] | None = None) -> list[dict]:
- """Get all model actions from history filtered - returns empty for CodeAgent."""
- return []
-
- def add_item(self, history_item: CodeAgentHistory) -> None:
- """Add a history item to the list."""
- self._complete_history.append(history_item)
-
- def model_dump(self, **kwargs) -> dict[str, Any]:
- """Custom serialization for CodeAgentHistoryList."""
- return {
- 'history': [h.model_dump(**kwargs) for h in self._complete_history],
- 'usage': self._usage_summary.model_dump() if self._usage_summary else None,
- }
-
- def save_to_file(self, filepath: str | Path, sensitive_data: dict[str, str | dict[str, str]] | None = None) -> None:
- """Save history to JSON file."""
- try:
- Path(filepath).parent.mkdir(parents=True, exist_ok=True)
- data = self.model_dump()
- with open(filepath, 'w', encoding='utf-8') as f:
- json.dump(data, f, indent=2)
- except Exception as e:
- raise e
diff --git a/browser_use/config.py b/browser_use/config.py
index a951c4a39..3452df4ec 100644
--- a/browser_use/config.py
+++ b/browser_use/config.py
@@ -76,6 +76,13 @@ class OldConfig:
raise AssertionError('BROWSER_USE_CLOUD_UI_URL must be a valid URL if set')
return url
+ @property
+ def BROWSER_USE_MODEL_PRICING_URL(self) -> str:
+ url = os.getenv('BROWSER_USE_MODEL_PRICING_URL', '')
+ if url and '://' not in url:
+ raise AssertionError('BROWSER_USE_MODEL_PRICING_URL must be a valid URL if set')
+ return url
+
# Path configuration
@property
def XDG_CACHE_HOME(self) -> Path:
@@ -195,6 +202,7 @@ class FlatEnvConfig(BaseSettings):
BROWSER_USE_CLOUD_SYNC: bool | None = Field(default=None)
BROWSER_USE_CLOUD_API_URL: str = Field(default='https://api.browser-use.com')
BROWSER_USE_CLOUD_UI_URL: str = Field(default='')
+ BROWSER_USE_MODEL_PRICING_URL: str = Field(default='')
# Path configuration
XDG_CACHE_HOME: str = Field(default='~/.cache')
diff --git a/browser_use/dom/enhanced_snapshot.py b/browser_use/dom/enhanced_snapshot.py
index f9433145c..ecb110c6d 100644
--- a/browser_use/dom/enhanced_snapshot.py
+++ b/browser_use/dom/enhanced_snapshot.py
@@ -9,7 +9,6 @@ from cdp_use.cdp.domsnapshot.commands import CaptureSnapshotReturns
from cdp_use.cdp.domsnapshot.types import (
LayoutTreeSnapshot,
NodeTreeSnapshot,
- RareBooleanData,
)
from browser_use.dom.views import DOMRect, EnhancedSnapshotNode
@@ -30,9 +29,9 @@ REQUIRED_COMPUTED_STYLES = [
]
-def _parse_rare_boolean_data(rare_data: RareBooleanData, index: int) -> bool | None:
- """Parse rare boolean data from snapshot - returns True if index is in the rare data."""
- return index in rare_data['index']
+def _parse_rare_boolean_data(rare_data_set: set[int], index: int) -> bool | None:
+ """Parse rare boolean data from snapshot - returns True if index is in the rare data set."""
+ return index in rare_data_set
def _parse_computed_styles(strings: list[str], style_indices: list[int]) -> dict[str, str]:
@@ -85,11 +84,18 @@ def build_snapshot_lookup(
if node_index not in layout_index_map: # Only store first occurrence
layout_index_map[node_index] = layout_idx
+ # Pre-convert rare boolean data from list to set for O(1) lookups.
+ # The raw CDP data uses List[int] which makes `index in list` O(n).
+ # Called once per node, this was O(n²) total — the #1 bottleneck.
+ # At 20k elements: 5,925ms (list) → 2ms (set) = 3,000x speedup.
+ has_clickable_data = 'isClickable' in nodes
+ is_clickable_set: set[int] = set(nodes['isClickable']['index']) if has_clickable_data else set()
+
# Build snapshot lookup for each backend node id
for backend_node_id, snapshot_index in backend_node_to_snapshot_index.items():
is_clickable = None
- if 'isClickable' in nodes:
- is_clickable = _parse_rare_boolean_data(nodes['isClickable'], snapshot_index)
+ if has_clickable_data:
+ is_clickable = _parse_rare_boolean_data(is_clickable_set, snapshot_index)
# Find corresponding layout node
cursor_style = None
diff --git a/browser_use/dom/markdown_extractor.py b/browser_use/dom/markdown_extractor.py
index bb1fe784d..aff9736d4 100644
--- a/browser_use/dom/markdown_extractor.py
+++ b/browser_use/dom/markdown_extractor.py
@@ -24,6 +24,7 @@ async def extract_clean_markdown(
dom_service: DomService | None = None,
target_id: str | None = None,
extract_links: bool = False,
+ extract_images: bool = False,
) -> tuple[str, dict[str, Any]]:
"""Extract clean markdown from browser content using enhanced DOM tree.
@@ -35,6 +36,7 @@ async def extract_clean_markdown(
dom_service: DOM service instance (page actor path)
target_id: Target ID for the page (required when using dom_service)
extract_links: Whether to preserve links in markdown
+ extract_images: Whether to preserve inline image src URLs in markdown
Returns:
tuple: (clean_markdown_content, content_statistics)
@@ -68,6 +70,9 @@ async def extract_clean_markdown(
# Use markdownify for clean markdown conversion
from markdownify import markdownify as md
+ # 'td', 'th', and headings are the only elements where markdownify sets the _inline context,
+ # which causes img elements to be stripped to just alt text when keep_inline_images_in=[]
+ _keep_inline_images_in = ['td', 'th', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] if extract_images else []
content = md(
page_html,
heading_style='ATX', # Use # style headings
@@ -79,7 +84,7 @@ async def extract_clean_markdown(
escape_misc=False, # Don't escape other characters (cleaner output)
autolinks=False, # Don't convert URLs to <> format
default_title=False, # Don't add default title attributes
- keep_inline_images_in=[], # Don't keep inline images in any tags (we already filter base64 in HTML)
+ keep_inline_images_in=_keep_inline_images_in, # Include image src URLs when extract_images=True
)
initial_markdown_length = len(content)
diff --git a/browser_use/dom/serializer/code_use_serializer.py b/browser_use/dom/serializer/code_use_serializer.py
deleted file mode 100644
index b127b576b..000000000
--- a/browser_use/dom/serializer/code_use_serializer.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# @file purpose: Ultra-compact serializer optimized for code-use agents
-# Focuses on minimal token usage while preserving essential interactive context
-
-from browser_use.dom.utils import cap_text_length
-from browser_use.dom.views import (
- EnhancedDOMTreeNode,
- NodeType,
- SimplifiedNode,
-)
-
-# Minimal but sufficient attribute list for code agents
-CODE_USE_KEY_ATTRIBUTES = [
- 'id', # Essential for element selection
- 'name', # For form inputs
- 'type', # For input types
- 'placeholder', # For empty inputs
- 'aria-label', # For buttons without text
- 'value', # Current values
- 'alt', # For images
- 'class', # Keep top 2 classes for common selectors
-]
-
-# Interactive elements agent can use
-INTERACTIVE_ELEMENTS = {
- 'a',
- 'button',
- 'input',
- 'textarea',
- 'select',
- 'form',
-}
-
-# Semantic structure elements - expanded to include more content containers
-SEMANTIC_STRUCTURE = {
- 'h1',
- 'h2',
- 'h3',
- 'h4',
- 'h5',
- 'h6',
- 'nav',
- 'main',
- 'header',
- 'footer',
- 'article',
- 'section',
- 'p', # Paragraphs often contain prices and product info
- 'span', # Spans often contain prices and labels
- 'div', # Divs with useful attributes (id/class) should be shown
- 'ul',
- 'ol',
- 'li',
- 'label',
- 'img',
-}
-
-
-class DOMCodeAgentSerializer:
- """Optimized DOM serializer for code-use agents - balances token efficiency with context."""
-
- @staticmethod
- def serialize_tree(node: SimplifiedNode | None, include_attributes: list[str], depth: int = 0) -> str:
- """
- Serialize DOM tree with smart token optimization.
-
- Strategy:
- - Keep top 2 CSS classes for querySelector compatibility
- - Show div/span/p elements with useful attributes or text
- - Show all interactive + semantic elements
- - Inline text up to 80 chars for better context
- """
- if not node:
- return ''
-
- # Skip excluded/hidden nodes
- if hasattr(node, 'excluded_by_parent') and node.excluded_by_parent:
- return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)
-
- if not node.should_display:
- return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)
-
- formatted_text = []
- depth_str = ' ' * depth # Use 2 spaces instead of tabs for compactness
-
- if node.original_node.node_type == NodeType.ELEMENT_NODE:
- tag = node.original_node.tag_name.lower()
- is_visible = node.original_node.snapshot_node and node.original_node.is_visible
-
- # Skip invisible (except iframes)
- if not is_visible and tag not in ['iframe', 'frame']:
- return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)
-
- # Special handling for iframes
- if tag in ['iframe', 'frame']:
- return DOMCodeAgentSerializer._serialize_iframe(node, include_attributes, depth)
-
- # Build minimal attributes
- attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(node.original_node)
-
- # Decide if element should be shown
- is_interactive = tag in INTERACTIVE_ELEMENTS
- is_semantic = tag in SEMANTIC_STRUCTURE
- has_useful_attrs = bool(attributes_str)
- has_text = DOMCodeAgentSerializer._has_direct_text(node)
-
- # Skip non-semantic, non-interactive containers without attributes
- if not is_interactive and not is_semantic and not has_useful_attrs and not has_text:
- return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)
-
- # Collapse pointless wrappers
- if tag in {'div', 'span'} and not has_useful_attrs and not has_text and len(node.children) == 1:
- return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)
-
- # Build element
- line = f'{depth_str}<{tag}'
-
- if attributes_str:
- line += f' {attributes_str}'
-
- # Inline text
- inline_text = DOMCodeAgentSerializer._get_inline_text(node)
- if inline_text:
- line += f'>{inline_text}'
- else:
- line += '>'
-
- formatted_text.append(line)
-
- # Children (only if no inline text)
- if node.children and not inline_text:
- children_text = DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth + 1)
- if children_text:
- formatted_text.append(children_text)
-
- elif node.original_node.node_type == NodeType.TEXT_NODE:
- # Handled inline with parent
- pass
-
- elif node.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
- # Shadow DOM - minimal marker
- if node.children:
- formatted_text.append(f'{depth_str}#shadow')
- children_text = DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth + 1)
- if children_text:
- formatted_text.append(children_text)
-
- return '\n'.join(formatted_text)
-
- @staticmethod
- def _serialize_children(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
- """Serialize children."""
- children_output = []
- for child in node.children:
- child_text = DOMCodeAgentSerializer.serialize_tree(child, include_attributes, depth)
- if child_text:
- children_output.append(child_text)
- return '\n'.join(children_output)
-
- @staticmethod
- def _build_minimal_attributes(node: EnhancedDOMTreeNode) -> str:
- """Build minimal but useful attributes - keep top 2 classes for selectors."""
- attrs = []
-
- if node.attributes:
- for attr in CODE_USE_KEY_ATTRIBUTES:
- if attr in node.attributes:
- value = str(node.attributes[attr]).strip()
- if value:
- # Special handling for class - keep only first 2 classes
- if attr == 'class':
- classes = value.split()[:2]
- value = ' '.join(classes)
- # Cap at 25 chars
- value = cap_text_length(value, 25)
- attrs.append(f'{attr}="{value}"')
-
- return ' '.join(attrs)
-
- @staticmethod
- def _has_direct_text(node: SimplifiedNode) -> bool:
- """Check if node has direct text children."""
- for child in node.children:
- if child.original_node.node_type == NodeType.TEXT_NODE:
- text = child.original_node.node_value.strip() if child.original_node.node_value else ''
- if len(text) > 1:
- return True
- return False
-
- @staticmethod
- def _get_inline_text(node: SimplifiedNode) -> str:
- """Get inline text (max 80 chars for better context)."""
- text_parts = []
- for child in node.children:
- if child.original_node.node_type == NodeType.TEXT_NODE:
- text = child.original_node.node_value.strip() if child.original_node.node_value else ''
- if text and len(text) > 1:
- text_parts.append(text)
-
- if not text_parts:
- return ''
-
- combined = ' '.join(text_parts)
- return cap_text_length(combined, 40)
-
- @staticmethod
- def _serialize_iframe(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
- """Handle iframe minimally."""
- formatted_text = []
- depth_str = ' ' * depth
- tag = node.original_node.tag_name.lower()
-
- # Minimal iframe marker
- attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(node.original_node)
- line = f'{depth_str}<{tag}'
- if attributes_str:
- line += f' {attributes_str}'
- line += '>'
- formatted_text.append(line)
-
- # Iframe content
- if node.original_node.content_document:
- formatted_text.append(f'{depth_str} #iframe-content')
-
- # Find and serialize body content only
- for child_node in node.original_node.content_document.children_nodes or []:
- if child_node.tag_name.lower() == 'html':
- for html_child in child_node.children:
- if html_child.tag_name.lower() == 'body':
- for body_child in html_child.children:
- DOMCodeAgentSerializer._serialize_document_node(
- body_child, formatted_text, include_attributes, depth + 2
- )
- break
-
- return '\n'.join(formatted_text)
-
- @staticmethod
- def _serialize_document_node(
- dom_node: EnhancedDOMTreeNode, output: list[str], include_attributes: list[str], depth: int
- ) -> None:
- """Serialize document node without SimplifiedNode wrapper."""
- depth_str = ' ' * depth
-
- if dom_node.node_type == NodeType.ELEMENT_NODE:
- tag = dom_node.tag_name.lower()
-
- # Skip invisible
- is_visible = dom_node.snapshot_node and dom_node.is_visible
- if not is_visible:
- return
-
- # Check if worth showing
- is_interactive = tag in INTERACTIVE_ELEMENTS
- is_semantic = tag in SEMANTIC_STRUCTURE
- attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(dom_node)
-
- if not is_interactive and not is_semantic and not attributes_str:
- # Skip but process children
- for child in dom_node.children:
- DOMCodeAgentSerializer._serialize_document_node(child, output, include_attributes, depth)
- return
-
- # Build element
- line = f'{depth_str}<{tag}'
- if attributes_str:
- line += f' {attributes_str}'
-
- # Get text
- text_parts = []
- for child in dom_node.children:
- if child.node_type == NodeType.TEXT_NODE and child.node_value:
- text = child.node_value.strip()
- if text and len(text) > 1:
- text_parts.append(text)
-
- if text_parts:
- combined = ' '.join(text_parts)
- line += f'>{cap_text_length(combined, 25)}'
- else:
- line += '>'
-
- output.append(line)
-
- # Process non-text children
- for child in dom_node.children:
- if child.node_type != NodeType.TEXT_NODE:
- DOMCodeAgentSerializer._serialize_document_node(child, output, include_attributes, depth + 1)
diff --git a/browser_use/dom/serializer/paint_order.py b/browser_use/dom/serializer/paint_order.py
index d82d14401..61032e209 100644
--- a/browser_use/dom/serializer/paint_order.py
+++ b/browser_use/dom/serializer/paint_order.py
@@ -36,10 +36,21 @@ class RectUnionPure:
"""
Maintains a *disjoint* set of rectangles.
No external dependencies - fine for a few thousand rectangles.
+
+ A safety cap (_MAX_RECTS) prevents exponential explosion on pages with
+ many overlapping translucent layers. Once the cap is hit, contains()
+ conservatively returns False (i.e. nothing is hidden), preserving
+ correctness at the cost of less aggressive paint-order filtering.
"""
__slots__ = ('_rects',)
+ # Safety cap: with complex overlapping layers, each add() can fragment
+ # existing rects into up to 4 pieces each. On heavy pages (20k+ elements)
+ # this can cause exponential growth. 5000 is generous enough for normal
+ # pages but prevents runaway memory/CPU.
+ _MAX_RECTS = 5000
+
def __init__(self):
self._rects: list[Rect] = []
@@ -101,6 +112,10 @@ class RectUnionPure:
Insert r unless it is already covered.
Returns True if the union grew.
"""
+ # Safety cap: stop accepting new rects to prevent exponential explosion
+ if len(self._rects) >= self._MAX_RECTS:
+ return False
+
if self.contains(r):
return False
diff --git a/browser_use/dom/serializer/serializer.py b/browser_use/dom/serializer/serializer.py
index 5f70fa57c..4374dc43c 100644
--- a/browser_use/dom/serializer/serializer.py
+++ b/browser_use/dom/serializer/serializer.py
@@ -1175,11 +1175,24 @@ class DOMTreeSerializer:
attributes_to_include['placeholder'] = 'mm/dd/yyyy'
attributes_to_include['format'] = 'mm/dd/yyyy'
+ # Never include values from password fields - they contain secrets that must not
+ # leak into DOM snapshots sent to the LLM, where prompt injection could exfiltrate them.
+ is_password_field = (
+ node.tag_name
+ and node.tag_name.lower() == 'input'
+ and node.attributes
+ and node.attributes.get('type', '').lower() == 'password'
+ )
+
# Include accessibility properties
if node.ax_node and node.ax_node.properties:
+ # Properties that carry field values - must be excluded for password fields
+ value_properties = {'value', 'valuetext'}
for prop in node.ax_node.properties:
try:
if prop.name in include_attributes and prop.value is not None:
+ if is_password_field and prop.name in value_properties:
+ continue
# Convert boolean to lowercase string, keep others as-is
if isinstance(prop.value, bool):
attributes_to_include[prop.name] = str(prop.value).lower()
@@ -1193,8 +1206,10 @@ class DOMTreeSerializer:
# Special handling for form elements - ensure current value is shown
# For text inputs, textareas, and selects, prioritize showing the current value from AX tree
if node.tag_name and node.tag_name.lower() in ['input', 'textarea', 'select']:
+ if is_password_field:
+ attributes_to_include.pop('value', None)
# ALWAYS check AX tree - it reflects actual typed value, DOM attribute may not update
- if node.ax_node and node.ax_node.properties:
+ elif node.ax_node and node.ax_node.properties:
for prop in node.ax_node.properties:
# Try valuetext first (human-readable display value)
if prop.name == 'valuetext' and prop.value:
diff --git a/browser_use/dom/service.py b/browser_use/dom/service.py
index 40826e378..804cc451f 100644
--- a/browser_use/dom/service.py
+++ b/browser_use/dom/service.py
@@ -427,6 +427,10 @@ class DomService:
iframe_scroll_ms = (time.time() - start_iframe_scroll) * 1000
# Detect elements with JavaScript click event listeners (without mutating DOM)
+ # On heavy pages (>10k elements) the querySelectorAll('*') + getEventListeners()
+ # loop plus per-element DOM.describeNode CDP calls can take 10s+.
+ # The JS expression below bails out early if the page is too heavy.
+ # Elements are still detected via the accessibility tree and ClickableElementDetector.
start_js_listener_detection = time.time()
js_click_listener_backend_ids: set[int] = set()
try:
@@ -440,9 +444,15 @@ class DomService:
return null;
}
- const elementsWithListeners = [];
const allElements = document.querySelectorAll('*');
+ // Skip on heavy pages — listener detection is too expensive
+ if (allElements.length > 10000) {
+ return null;
+ }
+
+ const elementsWithListeners = [];
+
for (const el of allElements) {
try {
const listeners = getEventListeners(el);
@@ -936,38 +946,57 @@ class DomService:
# Use pre-fetched all_frames to find the iframe's target (no redundant CDP call)
frame_id = node.get('frameId', None)
+
+ # Fallback: if frameId is missing or not in all_frames, try URL matching via
+ # the src attribute. This handles dynamically-injected iframes (e.g. HubSpot
+ # popups, chat widgets) where Chrome hasn't yet registered the frameId in the
+ # frame tree at DOM-snapshot time.
+ if (not frame_id or frame_id not in all_frames) and attributes:
+ src = attributes.get('src', '')
+ if src:
+ src_base = src.split('?')[0].rstrip('/')
+ for fid, finfo in all_frames.items():
+ frame_url = finfo.get('url', '').split('?')[0].rstrip('/')
+ if frame_url and frame_url == src_base:
+ frame_id = fid
+ self.logger.debug(f'Matched cross-origin iframe by src URL: {src!r} -> frameId={fid}')
+ break
+
+ iframe_document_target = None
if frame_id:
frame_info = all_frames.get(frame_id)
- iframe_document_target = None
if frame_info and frame_info.get('frameTargetId'):
iframe_target_id = frame_info['frameTargetId']
+ # Use frameTargetId directly from all_frames — get_all_frames() already
+ # validated connectivity. Do NOT gate on session_manager.get_target():
+ # there is a race where _target_sessions is set (inside the lock in
+ # _handle_target_attached) before _targets is populated (outside the
+ # lock), so get_target() can transiently return None for a live target.
iframe_target = self.browser_session.session_manager.get_target(iframe_target_id)
- if iframe_target:
- iframe_document_target = {
- 'targetId': iframe_target.target_id,
- 'url': iframe_target.url,
- 'title': iframe_target.title,
- 'type': iframe_target.target_type,
- }
- else:
- iframe_document_target = None
+ iframe_document_target = {
+ 'targetId': iframe_target_id,
+ 'url': iframe_target.url if iframe_target else frame_info.get('url', ''),
+ 'title': iframe_target.title if iframe_target else frame_info.get('title', ''),
+ 'type': iframe_target.target_type if iframe_target else 'iframe',
+ }
+
# if target actually exists in one of the frames, just recursively build the dom tree for it
if iframe_document_target:
self.logger.debug(
f'Getting content document for iframe {node.get("frameId", None)} at depth {iframe_depth + 1}'
)
- content_document, _ = await self.get_dom_tree(
- target_id=iframe_document_target['targetId'],
- all_frames=all_frames,
- # TODO: experiment with this values -> not sure whether the whole cross origin iframe should be ALWAYS included as soon as some part of it is visible or not.
- # Current config: if the cross origin iframe is AT ALL visible, then just include everything inside of it!
- # initial_html_frames=updated_html_frames,
- initial_total_frame_offset=total_frame_offset,
- iframe_depth=iframe_depth + 1,
- )
-
- dom_tree_node.content_document = content_document
- dom_tree_node.content_document.parent_node = dom_tree_node
+ try:
+ content_document, _ = await self.get_dom_tree(
+ target_id=iframe_document_target['targetId'],
+ all_frames=all_frames,
+ # Current config: if the cross origin iframe is AT ALL visible, include everything inside it
+ initial_total_frame_offset=total_frame_offset,
+ iframe_depth=iframe_depth + 1,
+ )
+ dom_tree_node.content_document = content_document
+ dom_tree_node.content_document.parent_node = dom_tree_node
+ except Exception as e:
+ self.logger.debug(f'Failed to get DOM tree for cross-origin iframe {frame_id}: {e}')
return dom_tree_node
@@ -1075,10 +1104,12 @@ class DomService:
pagination_buttons: list[dict[str, str | int | bool]] = []
# Common pagination patterns to look for
+ # `«` and `»` are ambiguous across sites, so treat them only as prev/next
+ # fallback symbols and let word-based first/last signals win
next_patterns = ['next', '>', '»', '→', 'siguiente', 'suivant', 'weiter', 'volgende']
prev_patterns = ['prev', 'previous', '<', '«', '←', 'anterior', 'précédent', 'zurück', 'vorige']
- first_patterns = ['first', '⇤', '«', 'primera', 'première', 'erste', 'eerste']
- last_patterns = ['last', '⇥', '»', 'última', 'dernier', 'letzte', 'laatste']
+ first_patterns = ['first', '⇤', 'primera', 'première', 'erste', 'eerste']
+ last_patterns = ['last', '⇥', 'última', 'dernier', 'letzte', 'laatste']
for index, node in selector_map.items():
# Skip non-clickable elements
@@ -1104,18 +1135,18 @@ class DomService:
button_type: str | None = None
- # Check for next button
- if any(pattern in all_text for pattern in next_patterns):
- button_type = 'next'
- # Check for previous button
- elif any(pattern in all_text for pattern in prev_patterns):
- button_type = 'prev'
- # Check for first button
- elif any(pattern in all_text for pattern in first_patterns):
+ # Match specific first/last semantics before generic prev/next fallbacks.
+ if any(pattern in all_text for pattern in first_patterns):
button_type = 'first'
# Check for last button
elif any(pattern in all_text for pattern in last_patterns):
button_type = 'last'
+ # Check for next button
+ elif any(pattern in all_text for pattern in next_patterns):
+ button_type = 'next'
+ # Check for previous button
+ elif any(pattern in all_text for pattern in prev_patterns):
+ button_type = 'prev'
# Check for numeric page buttons (single or double digit)
elif text.isdigit() and len(text) <= 2 and role in ['button', 'link', '']:
button_type = 'page_number'
diff --git a/browser_use/filesystem/file_system.py b/browser_use/filesystem/file_system.py
index 1e46a47d9..49946a1ea 100644
--- a/browser_use/filesystem/file_system.py
+++ b/browser_use/filesystem/file_system.py
@@ -1,5 +1,7 @@
import asyncio
import base64
+import csv
+import io
import os
import re
import shutil
@@ -164,12 +166,68 @@ class JsonFile(BaseFile):
class CsvFile(BaseFile):
- """CSV file implementation"""
+ """CSV file implementation with automatic RFC 4180 normalization.
+
+ LLMs frequently produce malformed CSV (missing quotes around fields with commas,
+ inconsistent empty fields, unescaped internal quotes). This class parses the raw
+ content through Python's csv module on every write to guarantee well-formed output.
+ """
@property
def extension(self) -> str:
return 'csv'
+ @staticmethod
+ def _normalize_csv(raw: str) -> str:
+ """Parse and re-serialize CSV content to fix quoting, empty fields, and escaping.
+
+ Handles common LLM mistakes: unquoted fields containing commas,
+ unescaped quotes inside fields, inconsistent empty fields,
+ trailing/leading blank lines, and double-escaped JSON output
+ (literal backslash-n and backslash-quote instead of real newlines/quotes).
+ """
+ stripped = raw.strip('\n\r')
+ if not stripped:
+ return raw
+
+ # Detect double-escaped LLM tool call output: if the content has no real
+ # newlines but contains literal \n sequences, the entire string is likely
+ # double-escaped JSON. Unescape \" → " first, then \n → newline.
+ if '\n' not in stripped and '\\n' in stripped:
+ stripped = stripped.replace('\\"', '"')
+ stripped = stripped.replace('\\n', '\n')
+
+ reader = csv.reader(io.StringIO(stripped))
+ rows: list[list[str]] = []
+ for row in reader:
+ # Skip completely empty rows (artifacts of blank lines)
+ if row:
+ rows.append(row)
+
+ if not rows:
+ return raw
+
+ out = io.StringIO()
+ writer = csv.writer(out, lineterminator='\n')
+ writer.writerows(rows)
+ # Strip trailing newline so callers (write_file action) control line endings
+ return out.getvalue().rstrip('\n')
+
+ def write_file_content(self, content: str) -> None:
+ """Normalize CSV content before storing."""
+ self.update_content(self._normalize_csv(content))
+
+ def append_file_content(self, content: str) -> None:
+ """Normalize the appended CSV rows and merge with existing content."""
+ normalized_new = self._normalize_csv(content)
+ if not normalized_new.strip('\n\r'):
+ return
+ existing = self.content
+ if existing and not existing.endswith('\n'):
+ existing += '\n'
+ combined = existing + normalized_new
+ self.update_content(self._normalize_csv(combined))
+
class JsonlFile(BaseFile):
"""JSONL (JSON Lines) file implementation"""
@@ -590,7 +648,7 @@ class FileSystem:
truncation_note = (
f'\n\n[Showing {len(pages_included)} of {num_pages} pages. '
f'Skipped pages: {skipped[:10]}{"..." if len(skipped) > 10 else ""}. '
- f'Use read_long_content with a specific goal to find relevant sections.]'
+ f'Use extract with start_from_char to read further into the file.]'
)
else:
truncation_note = ''
diff --git a/browser_use/init_cmd.py b/browser_use/init_cmd.py
index 9353d0620..3e25a7a15 100644
--- a/browser_use/init_cmd.py
+++ b/browser_use/init_cmd.py
@@ -428,7 +428,7 @@ def main(
next_steps.append('4. Set up your API key in .env file or environment:\n', style='bold')
next_steps.append(' BROWSER_USE_API_KEY=your-key\n', style='dim')
next_steps.append(
- ' (Get your key at https://cloud.browser-use.com/dashboard/settings?tab=api-keys&new)\n\n',
+ ' (Get your key at https://cloud.browser-use.com/dashboard/settings?tab=api-keys&new&utm_source=oss&utm_medium=cli)\n\n',
style='dim italic',
)
next_steps.append('5. Run your script:\n', style='bold')
diff --git a/browser_use/llm/anthropic/chat.py b/browser_use/llm/anthropic/chat.py
index 3e4afebc4..274d72452 100644
--- a/browser_use/llm/anthropic/chat.py
+++ b/browser_use/llm/anthropic/chat.py
@@ -223,15 +223,29 @@ class ChatAnthropic(BaseChatModel):
stop_reason=response.stop_reason,
)
except Exception as e:
- # If validation fails, try to parse it as JSON first
- if isinstance(content_block.input, str):
- data = json.loads(content_block.input)
- return ChatInvokeCompletion(
- completion=output_format.model_validate(data),
- usage=usage,
- stop_reason=response.stop_reason,
- )
- raise e
+ # If validation fails, try to fix common model output issues
+ _input = content_block.input
+ if isinstance(_input, str):
+ _input = json.loads(_input)
+ elif isinstance(_input, dict):
+ # Model sometimes double-serializes fields
+ for key, value in _input.items():
+ if isinstance(value, str) and value.startswith(('[', '{')):
+ try:
+ _input[key] = json.loads(value)
+ except json.JSONDecodeError:
+ cleaned = value.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
+ try:
+ _input[key] = json.loads(cleaned)
+ except json.JSONDecodeError:
+ pass
+ else:
+ raise
+ return ChatInvokeCompletion(
+ completion=output_format.model_validate(_input),
+ usage=usage,
+ stop_reason=response.stop_reason,
+ )
# If no tool use block found, raise an error
raise ValueError('Expected tool use in response but none found')
diff --git a/browser_use/llm/aws/chat_anthropic.py b/browser_use/llm/aws/chat_anthropic.py
index 8515f5660..4d4283a21 100644
--- a/browser_use/llm/aws/chat_anthropic.py
+++ b/browser_use/llm/aws/chat_anthropic.py
@@ -222,14 +222,28 @@ class ChatAnthropicBedrock(ChatAWSBedrock):
try:
return ChatInvokeCompletion(completion=output_format.model_validate(content_block.input), usage=usage)
except Exception as e:
- # If validation fails, try to parse it as JSON first
- if isinstance(content_block.input, str):
- data = json.loads(content_block.input)
- return ChatInvokeCompletion(
- completion=output_format.model_validate(data),
- usage=usage,
- )
- raise e
+ # If validation fails, try to fix common model output issues
+ _input = content_block.input
+ if isinstance(_input, str):
+ _input = json.loads(_input)
+ elif isinstance(_input, dict):
+ # Model sometimes double-serializes fields
+ for key, value in _input.items():
+ if isinstance(value, str) and value.startswith(('[', '{')):
+ try:
+ _input[key] = json.loads(value)
+ except json.JSONDecodeError:
+ cleaned = value.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
+ try:
+ _input[key] = json.loads(cleaned)
+ except json.JSONDecodeError:
+ pass
+ else:
+ raise
+ return ChatInvokeCompletion(
+ completion=output_format.model_validate(_input),
+ usage=usage,
+ )
# If no tool use block found, raise an error
raise ValueError('Expected tool use in response but none found')
diff --git a/browser_use/llm/aws/chat_bedrock.py b/browser_use/llm/aws/chat_bedrock.py
index 610379f90..3796db472 100644
--- a/browser_use/llm/aws/chat_bedrock.py
+++ b/browser_use/llm/aws/chat_bedrock.py
@@ -9,6 +9,7 @@ from browser_use.llm.aws.serializer import AWSBedrockMessageSerializer
from browser_use.llm.base import BaseChatModel
from browser_use.llm.exceptions import ModelProviderError, ModelRateLimitError
from browser_use.llm.messages import BaseMessage
+from browser_use.llm.schema import SchemaOptimizer
from browser_use.llm.views import ChatInvokeCompletion, ChatInvokeUsage
if TYPE_CHECKING:
@@ -116,27 +117,14 @@ class ChatAWSBedrock(BaseChatModel):
def _format_tools_for_request(self, output_format: type[BaseModel]) -> list[dict[str, Any]]:
"""Format a Pydantic model as a tool for structured output."""
- schema = output_format.model_json_schema()
-
- # Convert Pydantic schema to Bedrock tool format
- properties = {}
- required = []
-
- for prop_name, prop_info in schema.get('properties', {}).items():
- properties[prop_name] = {
- 'type': prop_info.get('type', 'string'),
- 'description': prop_info.get('description', ''),
- }
-
- # Add required fields
- required = schema.get('required', [])
+ schema = SchemaOptimizer.create_optimized_json_schema(output_format)
return [
{
'toolSpec': {
'name': f'extract_{output_format.__name__.lower()}',
'description': f'Extract information in the format of {output_format.__name__}',
- 'inputSchema': {'json': {'type': 'object', 'properties': properties, 'required': required}},
+ 'inputSchema': {'json': schema},
}
}
]
diff --git a/browser_use/llm/browser_use/chat.py b/browser_use/llm/browser_use/chat.py
index 26b73b12d..0395ffaa5 100644
--- a/browser_use/llm/browser_use/chat.py
+++ b/browser_use/llm/browser_use/chat.py
@@ -90,8 +90,8 @@ class ChatBrowserUse(BaseChatModel):
if not self.api_key:
raise ValueError(
- 'You need to set the BROWSER_USE_API_KEY environment variable. '
- 'Get your key at https://cloud.browser-use.com/new-api-key'
+ 'BROWSER_USE_API_KEY is not set. To use ChatBrowserUse, get a key at:\n'
+ 'https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=chat_browser_use'
)
@property
@@ -275,9 +275,17 @@ class ChatBrowserUse(BaseChatModel):
status_code = e.response.status_code
if status_code == 401:
- raise ModelProviderError(message=f'Invalid API key. {error_detail}', status_code=401, model=self.name)
+ raise ModelProviderError(
+ message=f'BROWSER_USE_API_KEY is invalid. Get a new key at:\nhttps://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=chat_browser_use\n{error_detail}',
+ status_code=401,
+ model=self.name,
+ )
elif status_code == 402:
- raise ModelProviderError(message=f'Insufficient credits. {error_detail}', status_code=402, model=self.name)
+ raise ModelProviderError(
+ message=f'Browser Use credits exhausted. Add more at:\nhttps://cloud.browser-use.com/billing?utm_source=oss&utm_medium=chat_browser_use\n{error_detail}',
+ status_code=402,
+ model=self.name,
+ )
elif status_code == 429:
raise ModelRateLimitError(message=f'Rate limit exceeded. {error_detail}', status_code=429, model=self.name)
elif status_code in {500, 502, 503, 504}:
diff --git a/browser_use/llm/google/chat.py b/browser_use/llm/google/chat.py
index 6ffe3c40b..3965266ce 100644
--- a/browser_use/llm/google/chat.py
+++ b/browser_use/llm/google/chat.py
@@ -85,7 +85,7 @@ class ChatGoogle(BaseChatModel):
# Model configuration
model: VerifiedGeminiModels | str
- temperature: float | None = 0.5
+ temperature: float | None = None
top_p: float | None = None
seed: int | None = None
thinking_budget: int | None = None # for Gemini 2.5: -1 for dynamic (default), 0 disables, or token count
@@ -222,6 +222,8 @@ class ChatGoogle(BaseChatModel):
# Apply model-specific configuration (these can override config)
if self.temperature is not None:
config['temperature'] = self.temperature
+ else:
+ config['temperature'] = 1.0 if 'gemini-3' in self.model else 0.5
# Add system instruction if present
if system_instruction:
diff --git a/browser_use/llm/litellm/__init__.py b/browser_use/llm/litellm/__init__.py
new file mode 100644
index 000000000..26d5f1e70
--- /dev/null
+++ b/browser_use/llm/litellm/__init__.py
@@ -0,0 +1,3 @@
+from browser_use.llm.litellm.chat import ChatLiteLLM
+
+__all__ = ['ChatLiteLLM']
diff --git a/browser_use/llm/litellm/chat.py b/browser_use/llm/litellm/chat.py
new file mode 100644
index 000000000..3510fe846
--- /dev/null
+++ b/browser_use/llm/litellm/chat.py
@@ -0,0 +1,227 @@
+"""
+ChatLiteLLM - LiteLLM chat model wrapper.
+
+Requires the `litellm` package to be installed separately:
+ pip install litellm
+
+Note: litellm is NOT included as a dependency of browser-use.
+"""
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any, TypeVar, overload
+
+from pydantic import BaseModel
+
+from browser_use.llm.base import BaseChatModel
+from browser_use.llm.exceptions import ModelProviderError, ModelRateLimitError
+from browser_use.llm.messages import BaseMessage
+from browser_use.llm.schema import SchemaOptimizer
+from browser_use.llm.views import ChatInvokeCompletion, ChatInvokeUsage
+
+from .serializer import LiteLLMMessageSerializer
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar('T', bound=BaseModel)
+
+
+@dataclass
+class ChatLiteLLM(BaseChatModel):
+ model: str
+ api_key: str | None = None
+ api_base: str | None = None
+ temperature: float | None = 0.0
+ max_tokens: int | None = 4096
+ max_retries: int = 3
+ metadata: dict[str, Any] | None = None
+
+ _provider_name: str = field(default='', init=False, repr=False)
+ _clean_model: str = field(default='', init=False, repr=False)
+
+ def __post_init__(self) -> None:
+ """Resolve provider info from the model string via litellm."""
+ try:
+ from litellm import get_llm_provider # type: ignore[reportMissingImports]
+
+ self._clean_model, self._provider_name, _, _ = get_llm_provider(self.model)
+ except Exception:
+ if '/' in self.model:
+ self._provider_name, self._clean_model = self.model.split('/', 1)
+ else:
+ self._provider_name = 'openai'
+ self._clean_model = self.model
+
+ logger.debug(
+ 'ChatLiteLLM initialized: model=%s, provider=%s, clean=%s, api_base=%s',
+ self.model,
+ self._provider_name,
+ self._clean_model,
+ self.api_base or '(default)',
+ )
+
+ @property
+ def provider(self) -> str:
+ return self._provider_name or 'litellm'
+
+ @property
+ def name(self) -> str:
+ return self._clean_model or self.model
+
+ @staticmethod
+ def _parse_usage(response: Any) -> ChatInvokeUsage | None:
+ """Extract token usage from a litellm response."""
+ usage = getattr(response, 'usage', None)
+ if usage is None:
+ return None
+
+ prompt_tokens = getattr(usage, 'prompt_tokens', 0) or 0
+ completion_tokens = getattr(usage, 'completion_tokens', 0) or 0
+
+ prompt_cached = getattr(usage, 'cache_read_input_tokens', None)
+ cache_creation = getattr(usage, 'cache_creation_input_tokens', None)
+
+ if prompt_cached is None:
+ details = getattr(usage, 'prompt_tokens_details', None)
+ if details:
+ prompt_cached = getattr(details, 'cached_tokens', None)
+
+ return ChatInvokeUsage(
+ prompt_tokens=prompt_tokens,
+ prompt_cached_tokens=int(prompt_cached) if prompt_cached is not None else None,
+ prompt_cache_creation_tokens=int(cache_creation) if cache_creation is not None else None,
+ prompt_image_tokens=None,
+ completion_tokens=completion_tokens,
+ total_tokens=prompt_tokens + completion_tokens,
+ )
+
+ @overload
+ async def ainvoke(
+ self,
+ messages: list[BaseMessage],
+ output_format: None = None,
+ **kwargs: Any,
+ ) -> ChatInvokeCompletion[str]: ...
+
+ @overload
+ async def ainvoke(
+ self,
+ messages: list[BaseMessage],
+ output_format: type[T],
+ **kwargs: Any,
+ ) -> ChatInvokeCompletion[T]: ...
+
+ async def ainvoke(
+ self,
+ messages: list[BaseMessage],
+ output_format: type[T] | None = None,
+ **kwargs: Any,
+ ) -> ChatInvokeCompletion[T] | ChatInvokeCompletion[str]:
+ from litellm import acompletion # type: ignore[reportMissingImports]
+ from litellm.exceptions import APIConnectionError, APIError, RateLimitError, Timeout # type: ignore[reportMissingImports]
+ from litellm.types.utils import ModelResponse # type: ignore[reportMissingImports]
+
+ litellm_messages = LiteLLMMessageSerializer.serialize(messages)
+
+ params: dict[str, Any] = {
+ 'model': self.model,
+ 'messages': litellm_messages,
+ 'num_retries': self.max_retries,
+ }
+
+ if self.temperature is not None:
+ params['temperature'] = self.temperature
+ if self.max_tokens is not None:
+ params['max_tokens'] = self.max_tokens
+ if self.api_key:
+ params['api_key'] = self.api_key
+ if self.api_base:
+ params['api_base'] = self.api_base
+ if self.metadata:
+ params['metadata'] = self.metadata
+
+ if output_format is not None:
+ schema = SchemaOptimizer.create_optimized_json_schema(output_format)
+ params['response_format'] = {
+ 'type': 'json_schema',
+ 'json_schema': {
+ 'name': 'agent_output',
+ 'strict': True,
+ 'schema': schema,
+ },
+ }
+
+ try:
+ raw_response = await acompletion(**params)
+ except RateLimitError as e:
+ raise ModelRateLimitError(
+ message=str(e),
+ model=self.name,
+ ) from e
+ except Timeout as e:
+ raise ModelProviderError(
+ message=f'Request timed out: {e}',
+ model=self.name,
+ ) from e
+ except APIConnectionError as e:
+ raise ModelProviderError(
+ message=str(e),
+ model=self.name,
+ ) from e
+ except APIError as e:
+ status = getattr(e, 'status_code', 502) or 502
+ raise ModelProviderError(
+ message=str(e),
+ status_code=status,
+ model=self.name,
+ ) from e
+ except ModelProviderError:
+ raise
+ except Exception as e:
+ raise ModelProviderError(
+ message=str(e),
+ model=self.name,
+ ) from e
+
+ assert isinstance(raw_response, ModelResponse), f'Expected ModelResponse, got {type(raw_response)}'
+ response: ModelResponse = raw_response
+
+ choice = response.choices[0] if response.choices else None
+ if choice is None:
+ raise ModelProviderError(
+ message='Empty response: no choices returned by the model',
+ status_code=502,
+ model=self.name,
+ )
+
+ content = choice.message.content or ''
+ usage = self._parse_usage(response)
+ stop_reason = choice.finish_reason
+
+ thinking: str | None = None
+ msg_obj = choice.message
+ reasoning = getattr(msg_obj, 'reasoning_content', None)
+ if reasoning:
+ thinking = str(reasoning)
+
+ if output_format is not None:
+ if not content:
+ raise ModelProviderError(
+ message='Model returned empty content for structured output request',
+ status_code=500,
+ model=self.name,
+ )
+ parsed = output_format.model_validate_json(content)
+ return ChatInvokeCompletion(
+ completion=parsed,
+ thinking=thinking,
+ usage=usage,
+ stop_reason=stop_reason,
+ )
+
+ return ChatInvokeCompletion(
+ completion=content,
+ thinking=thinking,
+ usage=usage,
+ stop_reason=stop_reason,
+ )
diff --git a/browser_use/llm/litellm/serializer.py b/browser_use/llm/litellm/serializer.py
new file mode 100644
index 000000000..6ac90f557
--- /dev/null
+++ b/browser_use/llm/litellm/serializer.py
@@ -0,0 +1,120 @@
+from typing import Any
+
+from browser_use.llm.messages import (
+ AssistantMessage,
+ BaseMessage,
+ ContentPartImageParam,
+ ContentPartTextParam,
+ SystemMessage,
+ UserMessage,
+)
+
+
+class LiteLLMMessageSerializer:
+ @staticmethod
+ def _serialize_user_content(
+ content: str | list[ContentPartTextParam | ContentPartImageParam],
+ ) -> str | list[dict[str, Any]]:
+ if isinstance(content, str):
+ return content
+
+ parts: list[dict[str, Any]] = []
+ for part in content:
+ if part.type == 'text':
+ parts.append(
+ {
+ 'type': 'text',
+ 'text': part.text,
+ }
+ )
+ elif part.type == 'image_url':
+ parts.append(
+ {
+ 'type': 'image_url',
+ 'image_url': {
+ 'url': part.image_url.url,
+ 'detail': part.image_url.detail,
+ },
+ }
+ )
+ return parts
+
+ @staticmethod
+ def _serialize_system_content(
+ content: str | list[ContentPartTextParam],
+ ) -> str | list[dict[str, Any]]:
+ if isinstance(content, str):
+ return content
+
+ return [
+ {
+ 'type': 'text',
+ 'text': p.text,
+ }
+ for p in content
+ ]
+
+ @staticmethod
+ def _serialize_assistant_content(
+ content: str | list[Any] | None,
+ ) -> str | list[dict[str, Any]] | None:
+ if content is None:
+ return None
+ if isinstance(content, str):
+ return content
+
+ parts = []
+ for part in content:
+ if part.type == 'text':
+ parts.append(
+ {
+ 'type': 'text',
+ 'text': part.text,
+ }
+ )
+ elif part.type == 'refusal':
+ parts.append(
+ {
+ 'type': 'text',
+ 'text': f'[Refusal] {part.refusal}',
+ }
+ )
+ return parts
+
+ @staticmethod
+ def serialize(messages: list[BaseMessage]) -> list[dict[str, Any]]:
+ result: list[dict[str, Any]] = []
+ for msg in messages:
+ if isinstance(msg, UserMessage):
+ d: dict[str, Any] = {'role': 'user'}
+ d['content'] = LiteLLMMessageSerializer._serialize_user_content(msg.content)
+ if msg.name is not None:
+ d['name'] = msg.name
+ result.append(d)
+
+ elif isinstance(msg, SystemMessage):
+ d = {'role': 'system'}
+ d['content'] = LiteLLMMessageSerializer._serialize_system_content(msg.content)
+ if msg.name is not None:
+ d['name'] = msg.name
+ result.append(d)
+
+ elif isinstance(msg, AssistantMessage):
+ d = {'role': 'assistant'}
+ d['content'] = LiteLLMMessageSerializer._serialize_assistant_content(msg.content)
+ if msg.name is not None:
+ d['name'] = msg.name
+ if msg.tool_calls:
+ d['tool_calls'] = [
+ {
+ 'id': tc.id,
+ 'type': 'function',
+ 'function': {
+ 'name': tc.function.name,
+ 'arguments': tc.function.arguments,
+ },
+ }
+ for tc in msg.tool_calls
+ ]
+ result.append(d)
+ return result
diff --git a/browser_use/llm/vercel/chat.py b/browser_use/llm/vercel/chat.py
index 5ae0cd0dc..f4037b907 100644
--- a/browser_use/llm/vercel/chat.py
+++ b/browser_use/llm/vercel/chat.py
@@ -1,4 +1,5 @@
import json
+import os
from collections.abc import Mapping
from dataclasses import dataclass, field
from typing import Any, Literal, TypeAlias, TypeVar, overload
@@ -26,15 +27,30 @@ ChatVercelModel: TypeAlias = Literal[
'alibaba/qwen-3-235b',
'alibaba/qwen-3-30b',
'alibaba/qwen-3-32b',
+ 'alibaba/qwen3-235b-a22b-thinking',
'alibaba/qwen3-coder',
'alibaba/qwen3-coder-30b-a3b',
+ 'alibaba/qwen3-coder-next',
'alibaba/qwen3-coder-plus',
+ 'alibaba/qwen3-embedding-0.6b',
+ 'alibaba/qwen3-embedding-4b',
+ 'alibaba/qwen3-embedding-8b',
'alibaba/qwen3-max',
'alibaba/qwen3-max-preview',
+ 'alibaba/qwen3-max-thinking',
'alibaba/qwen3-next-80b-a3b-instruct',
'alibaba/qwen3-next-80b-a3b-thinking',
'alibaba/qwen3-vl-instruct',
'alibaba/qwen3-vl-thinking',
+ 'alibaba/qwen3.5-flash',
+ 'alibaba/qwen3.5-plus',
+ 'alibaba/wan-v2.5-t2v-preview',
+ 'alibaba/wan-v2.6-i2v',
+ 'alibaba/wan-v2.6-i2v-flash',
+ 'alibaba/wan-v2.6-r2v',
+ 'alibaba/wan-v2.6-r2v-flash',
+ 'alibaba/wan-v2.6-t2v',
+ 'amazon/nova-2-lite',
'amazon/nova-lite',
'amazon/nova-micro',
'amazon/nova-pro',
@@ -48,38 +64,69 @@ ChatVercelModel: TypeAlias = Literal[
'anthropic/claude-haiku-4.5',
'anthropic/claude-opus-4',
'anthropic/claude-opus-4.1',
+ 'anthropic/claude-opus-4.5',
+ 'anthropic/claude-opus-4.6',
'anthropic/claude-sonnet-4',
'anthropic/claude-sonnet-4.5',
+ 'anthropic/claude-sonnet-4.6',
+ 'arcee-ai/trinity-large-preview',
+ 'arcee-ai/trinity-mini',
+ 'bfl/flux-kontext-max',
+ 'bfl/flux-kontext-pro',
+ 'bfl/flux-pro-1.0-fill',
+ 'bfl/flux-pro-1.1',
+ 'bfl/flux-pro-1.1-ultra',
+ 'bytedance/seed-1.6',
+ 'bytedance/seed-1.8',
+ 'bytedance/seedance-v1.0-lite-i2v',
+ 'bytedance/seedance-v1.0-lite-t2v',
+ 'bytedance/seedance-v1.0-pro',
+ 'bytedance/seedance-v1.0-pro-fast',
+ 'bytedance/seedance-v1.5-pro',
'cohere/command-a',
- 'cohere/command-r',
- 'cohere/command-r-plus',
'cohere/embed-v4.0',
'deepseek/deepseek-r1',
- 'deepseek/deepseek-r1-distill-llama-70b',
'deepseek/deepseek-v3',
'deepseek/deepseek-v3.1',
- 'deepseek/deepseek-v3.1-base',
'deepseek/deepseek-v3.1-terminus',
- 'deepseek/deepseek-v3.2-exp',
- 'deepseek/deepseek-v3.2-exp-thinking',
+ 'deepseek/deepseek-v3.2',
+ 'deepseek/deepseek-v3.2-thinking',
'google/gemini-2.0-flash',
'google/gemini-2.0-flash-lite',
'google/gemini-2.5-flash',
'google/gemini-2.5-flash-image',
- 'google/gemini-2.5-flash-image-preview',
'google/gemini-2.5-flash-lite',
'google/gemini-2.5-flash-lite-preview-09-2025',
'google/gemini-2.5-flash-preview-09-2025',
'google/gemini-2.5-pro',
+ 'google/gemini-3-flash',
+ 'google/gemini-3-pro-image',
+ 'google/gemini-3-pro-preview',
+ 'google/gemini-3.1-flash-image-preview',
+ 'google/gemini-3.1-flash-lite-preview',
+ 'google/gemini-3.1-pro-preview',
'google/gemini-embedding-001',
- 'google/gemma-2-9b',
+ 'google/imagen-4.0-fast-generate-001',
+ 'google/imagen-4.0-generate-001',
+ 'google/imagen-4.0-ultra-generate-001',
'google/text-embedding-005',
'google/text-multilingual-embedding-002',
+ 'google/veo-3.0-fast-generate-001',
+ 'google/veo-3.0-generate-001',
+ 'google/veo-3.1-fast-generate-001',
+ 'google/veo-3.1-generate-001',
+ 'inception/mercury-2',
'inception/mercury-coder-small',
+ 'klingai/kling-v2.5-turbo-i2v',
+ 'klingai/kling-v2.5-turbo-t2v',
+ 'klingai/kling-v2.6-i2v',
+ 'klingai/kling-v2.6-motion-control',
+ 'klingai/kling-v2.6-t2v',
+ 'klingai/kling-v3.0-i2v',
+ 'klingai/kling-v3.0-t2v',
+ 'kwaipilot/kat-coder-pro-v1',
'meituan/longcat-flash-chat',
'meituan/longcat-flash-thinking',
- 'meta/llama-3-70b',
- 'meta/llama-3-8b',
'meta/llama-3.1-70b',
'meta/llama-3.1-8b',
'meta/llama-3.2-11b',
@@ -89,27 +136,40 @@ ChatVercelModel: TypeAlias = Literal[
'meta/llama-3.3-70b',
'meta/llama-4-maverick',
'meta/llama-4-scout',
+ 'minimax/minimax-m2',
+ 'minimax/minimax-m2.1',
+ 'minimax/minimax-m2.1-lightning',
+ 'minimax/minimax-m2.5',
+ 'minimax/minimax-m2.5-highspeed',
'mistral/codestral',
'mistral/codestral-embed',
+ 'mistral/devstral-2',
'mistral/devstral-small',
+ 'mistral/devstral-small-2',
'mistral/magistral-medium',
- 'mistral/magistral-medium-2506',
'mistral/magistral-small',
- 'mistral/magistral-small-2506',
+ 'mistral/ministral-14b',
'mistral/ministral-3b',
'mistral/ministral-8b',
'mistral/mistral-embed',
- 'mistral/mistral-large',
+ 'mistral/mistral-large-3',
'mistral/mistral-medium',
+ 'mistral/mistral-nemo',
'mistral/mistral-small',
'mistral/mixtral-8x22b-instruct',
'mistral/pixtral-12b',
'mistral/pixtral-large',
'moonshotai/kimi-k2',
'moonshotai/kimi-k2-0905',
+ 'moonshotai/kimi-k2-thinking',
+ 'moonshotai/kimi-k2-thinking-turbo',
'moonshotai/kimi-k2-turbo',
+ 'moonshotai/kimi-k2.5',
'morph/morph-v3-fast',
'morph/morph-v3-large',
+ 'nvidia/nemotron-3-nano-30b-a3b',
+ 'nvidia/nemotron-nano-12b-v2-vl',
+ 'nvidia/nemotron-nano-9b-v2',
'openai/gpt-3.5-turbo',
'openai/gpt-3.5-turbo-instruct',
'openai/gpt-4-turbo',
@@ -118,16 +178,37 @@ ChatVercelModel: TypeAlias = Literal[
'openai/gpt-4.1-nano',
'openai/gpt-4o',
'openai/gpt-4o-mini',
+ 'openai/gpt-4o-mini-search-preview',
'openai/gpt-5',
+ 'openai/gpt-5-chat',
'openai/gpt-5-codex',
'openai/gpt-5-mini',
'openai/gpt-5-nano',
'openai/gpt-5-pro',
+ 'openai/gpt-5.1-codex',
+ 'openai/gpt-5.1-codex-max',
+ 'openai/gpt-5.1-codex-mini',
+ 'openai/gpt-5.1-instant',
+ 'openai/gpt-5.1-thinking',
+ 'openai/gpt-5.2',
+ 'openai/gpt-5.2-chat',
+ 'openai/gpt-5.2-codex',
+ 'openai/gpt-5.2-pro',
+ 'openai/gpt-5.3-chat',
+ 'openai/gpt-5.3-codex',
+ 'openai/gpt-5.4',
+ 'openai/gpt-5.4-pro',
+ 'openai/gpt-image-1',
+ 'openai/gpt-image-1-mini',
+ 'openai/gpt-image-1.5',
'openai/gpt-oss-120b',
'openai/gpt-oss-20b',
+ 'openai/gpt-oss-safeguard-20b',
'openai/o1',
'openai/o3',
+ 'openai/o3-deep-research',
'openai/o3-mini',
+ 'openai/o3-pro',
'openai/o4-mini',
'openai/text-embedding-3-large',
'openai/text-embedding-3-small',
@@ -136,6 +217,11 @@ ChatVercelModel: TypeAlias = Literal[
'perplexity/sonar-pro',
'perplexity/sonar-reasoning',
'perplexity/sonar-reasoning-pro',
+ 'prime-intellect/intellect-3',
+ 'recraft/recraft-v2',
+ 'recraft/recraft-v3',
+ 'recraft/recraft-v4',
+ 'recraft/recraft-v4-pro',
'stealth/sonoma-dusk-alpha',
'stealth/sonoma-sky-alpha',
'vercel/v0-1.0-md',
@@ -143,11 +229,13 @@ ChatVercelModel: TypeAlias = Literal[
'voyage/voyage-3-large',
'voyage/voyage-3.5',
'voyage/voyage-3.5-lite',
+ 'voyage/voyage-4',
+ 'voyage/voyage-4-large',
+ 'voyage/voyage-4-lite',
'voyage/voyage-code-2',
'voyage/voyage-code-3',
'voyage/voyage-finance-2',
'voyage/voyage-law-2',
- 'xai/grok-2',
'xai/grok-2-vision',
'xai/grok-3',
'xai/grok-3-fast',
@@ -156,11 +244,25 @@ ChatVercelModel: TypeAlias = Literal[
'xai/grok-4',
'xai/grok-4-fast-non-reasoning',
'xai/grok-4-fast-reasoning',
+ 'xai/grok-4.1-fast-non-reasoning',
+ 'xai/grok-4.1-fast-reasoning',
+ 'xai/grok-4.20-multi-agent-beta',
+ 'xai/grok-4.20-non-reasoning-beta',
+ 'xai/grok-4.20-reasoning-beta',
'xai/grok-code-fast-1',
+ 'xai/grok-imagine-image',
+ 'xai/grok-imagine-image-pro',
+ 'xai/grok-imagine-video',
+ 'xiaomi/mimo-v2-flash',
'zai/glm-4.5',
'zai/glm-4.5-air',
'zai/glm-4.5v',
'zai/glm-4.6',
+ 'zai/glm-4.6v',
+ 'zai/glm-4.6v-flash',
+ 'zai/glm-4.7',
+ 'zai/glm-4.7-flashx',
+ 'zai/glm-5',
]
@@ -181,7 +283,8 @@ class ChatVercel(BaseChatModel):
Args:
model: The model identifier
- api_key: Your Vercel API key
+ api_key: Your Vercel AI Gateway API key. If not provided, falls back to
+ AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN environment variables.
base_url: The Vercel AI Gateway endpoint (defaults to https://ai-gateway.vercel.sh/v1)
temperature: Sampling temperature (0-2)
max_tokens: Maximum tokens to generate
@@ -191,6 +294,14 @@ class ChatVercel(BaseChatModel):
max_retries: Maximum number of retries for failed requests
provider_options: Provider routing options for the gateway. Use this to control which
providers are used and in what order. Example: {'gateway': {'order': ['vertex', 'anthropic']}}
+ reasoning: Optional provider-specific reasoning configuration. Merged into
+ providerOptions under the appropriate provider key. Example for Anthropic:
+ {'anthropic': {'thinking': {'type': 'adaptive'}}}. Example for OpenAI:
+ {'openai': {'reasoningEffort': 'high', 'reasoningSummary': 'detailed'}}.
+ model_fallbacks: Optional list of fallback model IDs tried in order if the primary
+ model fails. Passed as providerOptions.gateway.models.
+ caching: Optional caching mode for the gateway. Currently supports 'auto', which
+ enables provider-specific prompt caching via providerOptions.gateway.caching.
"""
# Model configuration
@@ -206,8 +317,11 @@ class ChatVercel(BaseChatModel):
'o3',
'o4',
'gpt-oss',
+ 'gpt-5.2-pro',
+ 'gpt-5.4-pro',
'deepseek-r1',
- 'qwen3-next-80b-a3b-thinking',
+ '-thinking',
+ 'perplexity/sonar-reasoning',
]
)
@@ -221,6 +335,9 @@ class ChatVercel(BaseChatModel):
http_client: httpx.AsyncClient | None = None
_strict_response_validation: bool = False
provider_options: dict[str, Any] | None = None
+ reasoning: dict[str, dict[str, Any]] | None = None
+ model_fallbacks: list[str] | None = None
+ caching: Literal['auto'] | None = None
# Static
@property
@@ -229,8 +346,10 @@ class ChatVercel(BaseChatModel):
def _get_client_params(self) -> dict[str, Any]:
"""Prepare client parameters dictionary."""
+ api_key = self.api_key or os.getenv('AI_GATEWAY_API_KEY') or os.getenv('VERCEL_OIDC_TOKEN')
+
base_params = {
- 'api_key': self.api_key,
+ 'api_key': api_key,
'base_url': self.base_url,
'timeout': self.timeout,
'max_retries': self.max_retries,
@@ -387,8 +506,36 @@ class ChatVercel(BaseChatModel):
model_params['max_tokens'] = self.max_tokens
if self.top_p is not None:
model_params['top_p'] = self.top_p
+
+ extra_body: dict[str, Any] = {}
+
+ provider_opts: dict[str, Any] = {}
if self.provider_options:
- model_params['extra_body'] = {'providerOptions': self.provider_options}
+ provider_opts.update(self.provider_options)
+
+ if self.reasoning:
+ # Merge provider-specific reasoning options (ex: {'anthropic': {'thinking': ...}})
+ for provider_name, opts in self.reasoning.items():
+ existing = provider_opts.get(provider_name, {})
+ existing.update(opts)
+ provider_opts[provider_name] = existing
+
+ gateway_opts: dict[str, Any] = provider_opts.get('gateway', {})
+
+ if self.model_fallbacks:
+ gateway_opts['models'] = self.model_fallbacks
+
+ if self.caching:
+ gateway_opts['caching'] = self.caching
+
+ if gateway_opts:
+ provider_opts['gateway'] = gateway_opts
+
+ if provider_opts:
+ extra_body['providerOptions'] = provider_opts
+
+ if extra_body:
+ model_params['extra_body'] = extra_body
if output_format is None:
# Return string response
@@ -439,14 +586,10 @@ class ChatVercel(BaseChatModel):
vercel_messages = VercelMessageSerializer.serialize_messages(modified_messages)
- request_params = model_params.copy()
- if self.provider_options:
- request_params['extra_body'] = {'providerOptions': self.provider_options}
-
response = await self.get_client().chat.completions.create(
model=self.model,
messages=vercel_messages,
- **request_params,
+ **model_params,
)
content = response.choices[0].message.content if response.choices else None
@@ -491,10 +634,6 @@ class ChatVercel(BaseChatModel):
'schema': schema,
}
- request_params = model_params.copy()
- if self.provider_options:
- request_params['extra_body'] = {'providerOptions': self.provider_options}
-
response = await self.get_client().chat.completions.create(
model=self.model,
messages=vercel_messages,
@@ -502,7 +641,7 @@ class ChatVercel(BaseChatModel):
json_schema=response_format_schema,
type='json_schema',
),
- **request_params,
+ **model_params,
)
content = response.choices[0].message.content if response.choices else None
diff --git a/browser_use/logging_config.py b/browser_use/logging_config.py
index 950d0479f..3ef16bc14 100644
--- a/browser_use/logging_config.py
+++ b/browser_use/logging_config.py
@@ -223,9 +223,7 @@ def setup_logging(stream=None, log_level=None, force_setup=False, debug_log_file
'trafilatura.htmlprocessing',
'trafilatura',
'groq',
- 'portalocker',
'google_genai',
- 'portalocker.utils',
'websockets', # General websockets (but not websockets.client which we need)
]
for logger_name in third_party_loggers:
diff --git a/browser_use/mcp/client.py b/browser_use/mcp/client.py
index b53e8a75f..46db1a15e 100644
--- a/browser_use/mcp/client.py
+++ b/browser_use/mcp/client.py
@@ -329,6 +329,7 @@ class MCPClient:
return ActionResult(
extracted_content=extracted_content,
long_term_memory=f"Used MCP tool '{tool.name}' from {self.server_name}",
+ include_extracted_content_only_once=True,
)
except Exception as e:
@@ -372,6 +373,7 @@ class MCPClient:
return ActionResult(
extracted_content=extracted_content,
long_term_memory=f"Used MCP tool '{tool.name}' from {self.server_name}",
+ include_extracted_content_only_once=True,
)
except Exception as e:
diff --git a/browser_use/mcp/server.py b/browser_use/mcp/server.py
index 053e7efa8..9c9140e2d 100644
--- a/browser_use/mcp/server.py
+++ b/browser_use/mcp/server.py
@@ -232,13 +232,21 @@ class BrowserUseServer:
),
types.Tool(
name='browser_click',
- description='Click an element on the page by its index',
+ description='Click an element by index or at specific viewport coordinates. Use index for elements from browser_get_state, or coordinate_x/coordinate_y for pixel-precise clicking.',
inputSchema={
'type': 'object',
'properties': {
'index': {
'type': 'integer',
- 'description': 'The index of the link or element to click (from browser_get_state)',
+ 'description': 'The index of the element to click (from browser_get_state). Provide this OR coordinate_x+coordinate_y.',
+ },
+ 'coordinate_x': {
+ 'type': 'integer',
+ 'description': 'X coordinate in pixels from the left edge of the viewport. Must be used together with coordinate_y. Provide this OR index.',
+ },
+ 'coordinate_y': {
+ 'type': 'integer',
+ 'description': 'Y coordinate in pixels from the top edge of the viewport. Must be used together with coordinate_x. Provide this OR index.',
},
'new_tab': {
'type': 'boolean',
@@ -246,12 +254,11 @@ class BrowserUseServer:
'default': False,
},
},
- 'required': ['index'],
},
),
types.Tool(
name='browser_type',
- description='Type text into an input field',
+ description='Type text into an input field. Clears existing text by default; pass text="" to clear only.',
inputSchema={
'type': 'object',
'properties': {
@@ -259,7 +266,10 @@ class BrowserUseServer:
'type': 'integer',
'description': 'The index of the input element (from browser_get_state)',
},
- 'text': {'type': 'string', 'description': 'The text to type'},
+ 'text': {
+ 'type': 'string',
+ 'description': 'The text to type. Pass an empty string ("") to clear the field without typing.',
+ },
},
'required': ['index', 'text'],
},
@@ -294,6 +304,33 @@ class BrowserUseServer:
'required': ['query'],
},
),
+ types.Tool(
+ name='browser_get_html',
+ description='Get the raw HTML of the current page or a specific element by CSS selector',
+ inputSchema={
+ 'type': 'object',
+ 'properties': {
+ 'selector': {
+ 'type': 'string',
+ 'description': 'Optional CSS selector to get HTML of a specific element. If omitted, returns full page HTML.',
+ },
+ },
+ },
+ ),
+ types.Tool(
+ name='browser_screenshot',
+ description='Take a screenshot of the current page. Returns viewport metadata as text and the screenshot as an image.',
+ inputSchema={
+ 'type': 'object',
+ 'properties': {
+ 'full_page': {
+ 'type': 'boolean',
+ 'description': 'Whether to capture the full scrollable page or just the visible viewport',
+ 'default': False,
+ },
+ },
+ },
+ ),
types.Tool(
name='browser_scroll',
description='Scroll the page',
@@ -361,8 +398,7 @@ class BrowserUseServer:
},
'model': {
'type': 'string',
- 'description': 'LLM model to use (e.g., gpt-4o, claude-3-opus-20240229)',
- 'default': 'gpt-4o',
+ 'description': 'LLM model to use (e.g., gpt-4o, claude-3-opus-20240229). Defaults to the configured model.',
},
'allowed_domains': {
'type': 'array',
@@ -417,12 +453,14 @@ class BrowserUseServer:
return []
@self.server.call_tool()
- async def handle_call_tool(name: str, arguments: dict[str, Any] | None) -> list[types.TextContent]:
+ async def handle_call_tool(name: str, arguments: dict[str, Any] | None) -> list[types.TextContent | types.ImageContent]:
"""Handle tool execution."""
start_time = time.time()
error_msg = None
try:
result = await self._execute_tool(name, arguments or {})
+ if isinstance(result, list):
+ return result
return [types.TextContent(type='text', text=result)]
except Exception as e:
error_msg = str(e)
@@ -441,15 +479,17 @@ class BrowserUseServer:
)
)
- async def _execute_tool(self, tool_name: str, arguments: dict[str, Any]) -> str:
- """Execute a browser-use tool."""
+ async def _execute_tool(
+ self, tool_name: str, arguments: dict[str, Any]
+ ) -> str | list[types.TextContent | types.ImageContent]:
+ """Execute a browser-use tool. Returns str for most tools, or a content list for tools with image output."""
# Agent-based tools
if tool_name == 'retry_with_browser_use_agent':
return await self._retry_with_browser_use_agent(
task=arguments['task'],
max_steps=arguments.get('max_steps', 100),
- model=arguments.get('model', 'gpt-4o'),
+ model=arguments.get('model'),
allowed_domains=arguments.get('allowed_domains', []),
use_vision=arguments.get('use_vision', True),
)
@@ -474,13 +514,32 @@ class BrowserUseServer:
return await self._navigate(arguments['url'], arguments.get('new_tab', False))
elif tool_name == 'browser_click':
- return await self._click(arguments['index'], arguments.get('new_tab', False))
+ return await self._click(
+ index=arguments.get('index'),
+ coordinate_x=arguments.get('coordinate_x'),
+ coordinate_y=arguments.get('coordinate_y'),
+ new_tab=arguments.get('new_tab', False),
+ )
elif tool_name == 'browser_type':
return await self._type_text(arguments['index'], arguments['text'])
elif tool_name == 'browser_get_state':
- return await self._get_browser_state(arguments.get('include_screenshot', False))
+ state_json, screenshot_b64 = await self._get_browser_state(arguments.get('include_screenshot', False))
+ content: list[types.TextContent | types.ImageContent] = [types.TextContent(type='text', text=state_json)]
+ if screenshot_b64:
+ content.append(types.ImageContent(type='image', data=screenshot_b64, mimeType='image/png'))
+ return content
+
+ elif tool_name == 'browser_get_html':
+ return await self._get_html(arguments.get('selector'))
+
+ elif tool_name == 'browser_screenshot':
+ meta_json, screenshot_b64 = await self._screenshot(arguments.get('full_page', False))
+ content: list[types.TextContent | types.ImageContent] = [types.TextContent(type='text', text=meta_json)]
+ if screenshot_b64:
+ content.append(types.ImageContent(type='image', data=screenshot_b64, mimeType='image/png'))
+ return content
elif tool_name == 'browser_extract_content':
return await self._extract_content(arguments['query'], arguments.get('extract_links', False))
@@ -575,7 +634,7 @@ class BrowserUseServer:
self,
task: str,
max_steps: int = 100,
- model: str = 'gpt-4o',
+ model: str | None = None,
allowed_domains: list[str] | None = None,
use_vision: bool = True,
) -> str:
@@ -588,27 +647,25 @@ class BrowserUseServer:
# Get LLM provider
model_provider = llm_config.get('model_provider') or os.getenv('MODEL_PROVIDER')
- # 如果model_provider不等于空,且等Bedrock
+ # Get Bedrock-specific config
if model_provider and model_provider.lower() == 'bedrock':
llm_model = llm_config.get('model') or os.getenv('MODEL') or 'us.anthropic.claude-sonnet-4-20250514-v1:0'
aws_region = llm_config.get('region') or os.getenv('REGION')
if not aws_region:
aws_region = 'us-east-1'
+ aws_sso_auth = llm_config.get('aws_sso_auth', False)
llm = ChatAWSBedrock(
model=llm_model, # or any Bedrock model
aws_region=aws_region,
- aws_sso_auth=True,
+ aws_sso_auth=aws_sso_auth,
)
else:
api_key = llm_config.get('api_key') or os.getenv('OPENAI_API_KEY')
if not api_key:
return 'Error: OPENAI_API_KEY not set in config or environment'
- # Override model if provided in tool call
- if model != llm_config.get('model', 'gpt-4o'):
- llm_model = model
- else:
- llm_model = llm_config.get('model', 'gpt-4o')
+ # Use explicit model from tool call, otherwise fall back to configured default
+ llm_model = model or llm_config.get('model', 'gpt-4o')
base_url = llm_config.get('base_url', None)
kwargs = {}
@@ -693,14 +750,34 @@ class BrowserUseServer:
await event
return f'Navigated to: {url}'
- async def _click(self, index: int, new_tab: bool = False) -> str:
- """Click an element by index."""
+ async def _click(
+ self,
+ index: int | None = None,
+ coordinate_x: int | None = None,
+ coordinate_y: int | None = None,
+ new_tab: bool = False,
+ ) -> str:
+ """Click an element by index or at viewport coordinates."""
if not self.browser_session:
return 'Error: No browser session active'
# Update session activity
self._update_session_activity(self.browser_session.id)
+ # Coordinate-based clicking
+ if coordinate_x is not None and coordinate_y is not None:
+ from browser_use.browser.events import ClickCoordinateEvent
+
+ event = self.browser_session.event_bus.dispatch(
+ ClickCoordinateEvent(coordinate_x=coordinate_x, coordinate_y=coordinate_y)
+ )
+ await event
+ return f'Clicked at coordinates ({coordinate_x}, {coordinate_y})'
+
+ # Index-based clicking
+ if index is None:
+ return 'Error: Provide either index or both coordinate_x and coordinate_y'
+
# Get the element
element = await self.browser_session.get_dom_element_by_index(index)
if not element:
@@ -730,7 +807,6 @@ class BrowserUseServer:
return f'Clicked element {index} and opened in new tab {full_url[:20]}...'
else:
# For non-link elements, just do a normal click
- # Opening in new tab without href is not reliably supported
from browser_use.browser.events import ClickElementEvent
event = self.browser_session.event_bus.dispatch(ClickElementEvent(node=element))
@@ -790,23 +866,39 @@ class BrowserUseServer:
else:
return f"Typed '{text}' into element {index}"
- async def _get_browser_state(self, include_screenshot: bool = False) -> str:
- """Get current browser state."""
+ async def _get_browser_state(self, include_screenshot: bool = False) -> tuple[str, str | None]:
+ """Get current browser state. Returns (state_json, screenshot_b64 | None)."""
if not self.browser_session:
- return 'Error: No browser session active'
+ return 'Error: No browser session active', None
state = await self.browser_session.get_browser_state_summary()
- result = {
+ result: dict[str, Any] = {
'url': state.url,
'title': state.title,
'tabs': [{'url': tab.url, 'title': tab.title} for tab in state.tabs],
'interactive_elements': [],
}
+ # Add viewport info so the LLM knows the coordinate space
+ if state.page_info:
+ pi = state.page_info
+ result['viewport'] = {
+ 'width': pi.viewport_width,
+ 'height': pi.viewport_height,
+ }
+ result['page'] = {
+ 'width': pi.page_width,
+ 'height': pi.page_height,
+ }
+ result['scroll'] = {
+ 'x': pi.scroll_x,
+ 'y': pi.scroll_y,
+ }
+
# Add interactive elements with their indices
for index, element in state.dom_state.selector_map.items():
- elem_info = {
+ elem_info: dict[str, Any] = {
'index': index,
'tag': element.tag_name,
'text': element.get_all_children_text(max_depth=2)[:100],
@@ -817,10 +909,69 @@ class BrowserUseServer:
elem_info['href'] = element.attributes['href']
result['interactive_elements'].append(elem_info)
+ # Return screenshot separately as ImageContent instead of embedding base64 in JSON
+ screenshot_b64 = None
if include_screenshot and state.screenshot:
- result['screenshot'] = state.screenshot
+ screenshot_b64 = state.screenshot
+ # Include viewport dimensions in JSON so LLM can map pixels to coordinates
+ if state.page_info:
+ result['screenshot_dimensions'] = {
+ 'width': state.page_info.viewport_width,
+ 'height': state.page_info.viewport_height,
+ }
- return json.dumps(result, indent=2)
+ return json.dumps(result, indent=2), screenshot_b64
+
+ async def _get_html(self, selector: str | None = None) -> str:
+ """Get raw HTML of the page or a specific element."""
+ if not self.browser_session:
+ return 'Error: No browser session active'
+
+ self._update_session_activity(self.browser_session.id)
+
+ cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=None, focus=False)
+ if not cdp_session:
+ return 'Error: No active CDP session'
+
+ if selector:
+ js = (
+ f'(function(){{ const el = document.querySelector({json.dumps(selector)}); return el ? el.outerHTML : null; }})()'
+ )
+ else:
+ js = 'document.documentElement.outerHTML'
+
+ result = await cdp_session.cdp_client.send.Runtime.evaluate(
+ params={'expression': js, 'returnByValue': True},
+ session_id=cdp_session.session_id,
+ )
+ html = result.get('result', {}).get('value')
+ if html is None:
+ return f'No element found for selector: {selector}' if selector else 'Error: Could not get page HTML'
+ return html
+
+ async def _screenshot(self, full_page: bool = False) -> tuple[str, str | None]:
+ """Take a screenshot. Returns (metadata_json, screenshot_b64 | None)."""
+ if not self.browser_session:
+ return 'Error: No browser session active', None
+
+ import base64
+
+ self._update_session_activity(self.browser_session.id)
+
+ data = await self.browser_session.take_screenshot(full_page=full_page)
+ b64 = base64.b64encode(data).decode()
+
+ # Return screenshot separately as ImageContent instead of embedding base64 in JSON
+ state = await self.browser_session.get_browser_state_summary()
+ result: dict[str, Any] = {
+ 'size_bytes': len(data),
+ }
+ if state.page_info:
+ result['viewport'] = {
+ 'width': state.page_info.viewport_width,
+ 'height': state.page_info.viewport_height,
+ }
+ return json.dumps(result), b64
async def _extract_content(self, query: str, extract_links: bool = False) -> str:
"""Extract content from current page."""
@@ -1075,19 +1226,25 @@ class BrowserUseServer:
# Start the cleanup task
await self._start_cleanup_task()
+ if sys.stdin is None:
+ raise RuntimeError('MCP stdio transport requires stdin, but this process was launched without one.')
+
async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
- await self.server.run(
- read_stream,
- write_stream,
- InitializationOptions(
- server_name='browser-use',
- server_version='0.1.0',
- capabilities=self.server.get_capabilities(
- notification_options=NotificationOptions(),
- experimental_capabilities={},
+ try:
+ await self.server.run(
+ read_stream,
+ write_stream,
+ InitializationOptions(
+ server_name='browser-use',
+ server_version='0.1.0',
+ capabilities=self.server.get_capabilities(
+ notification_options=NotificationOptions(),
+ experimental_capabilities={},
+ ),
),
- ),
- )
+ )
+ except BrokenPipeError:
+ logger.warning('MCP client disconnected while writing to stdio; shutting down server cleanly.')
async def main(session_timeout_minutes: int = 10):
diff --git a/browser_use/skill_cli/README.md b/browser_use/skill_cli/README.md
index b39faa4a8..640c3f99c 100644
--- a/browser_use/skill_cli/README.md
+++ b/browser_use/skill_cli/README.md
@@ -24,20 +24,10 @@ curl -fsSL https://browser-use.com/cli/install.sh | bash
& "C:\Program Files\Git\bin\bash.exe" -c 'curl -fsSL https://browser-use.com/cli/install.sh | bash'
```
-### Installation Modes
-```bash
-curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full # All modes
-curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --local-only # Local browser only
-curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only # Cloud browser only
-curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --api-key bu_xxx # With API key
-```
-
### Post-Install
```bash
browser-use doctor # Validate installation
browser-use setup # Run setup wizard (optional)
-browser-use setup --mode local|remote|full # Non-interactive setup
-browser-use setup --api-key bu_xxx --yes # With API key, skip prompts
```
### Generate Templates
@@ -62,13 +52,10 @@ If you prefer not to use the one-line installer:
# 1. Install the package
uv pip install browser-use
-# 2. Install Chromium (for local browser mode)
+# 2. Install Chromium
browser-use install
-# 3. Configure API key (for remote mode)
-export BROWSER_USE_API_KEY=your_key # or $env:BROWSER_USE_API_KEY on Windows
-
-# 4. Validate
+# 3. Validate
browser-use doctor
```
@@ -106,11 +93,20 @@ browser-use open https://example.com
# Visible browser window
browser-use --headed open https://example.com
-# Use your real Chrome (with existing logins/cookies)
-browser-use --browser real open https://gmail.com
+# Use your real Chrome with Default profile (with existing logins/cookies)
+browser-use --profile "Default" open https://gmail.com
-# Cloud browser (requires BROWSER_USE_API_KEY)
-browser-use --browser remote open https://example.com
+# Use a specific Chrome profile
+browser-use --profile "Profile 1" open https://gmail.com
+
+# Auto-discover and connect to running Chrome
+browser-use --connect open https://example.com
+
+# Connect to an existing browser via CDP URL
+browser-use --cdp-url http://localhost:9222 open https://example.com
+
+# WebSocket CDP URL also works
+browser-use --cdp-url ws://localhost:9222/devtools/browser/... state
```
## All Commands
@@ -135,11 +131,13 @@ browser-use --browser remote open https://example.com
| Command | Description |
|---------|-------------|
| `click ` | Click element by index |
+| `click <x> <y>` | Click at pixel coordinates |
| `type "text"` | Type into focused element |
| `input "text"` | Click element, then type |
| `keys "Enter"` | Send keyboard keys |
| `keys "Control+a"` | Send key combination |
| `select "value"` | Select dropdown option |
+| `upload <index> <path>` | Upload file to file input element |
| `hover ` | Hover over element |
| `dblclick ` | Double-click element |
| `rightclick ` | Right-click element |
@@ -147,9 +145,10 @@ browser-use --browser remote open https://example.com
### Tabs
| Command | Description |
|---------|-------------|
-| `switch ` | Switch to tab by index |
-| `close-tab` | Close current tab |
-| `close-tab ` | Close specific tab |
+| `tab list` | List all tabs |
+| `tab new [url]` | Open new tab |
+| `tab switch <index>` | Switch to tab by index |
+| `tab close [index...]` | Close tab(s) (current if no index) |
### Cookies
| Command | Description |
@@ -188,7 +187,7 @@ browser-use --browser remote open https://example.com
| Command | Description |
|---------|-------------|
| `eval "js code"` | Execute JavaScript |
-| `extract "query"` | Extract data with LLM |
+| `extract "query"` | Extract data with LLM (not yet implemented) |
### Python (Persistent Session)
```bash
@@ -200,88 +199,45 @@ browser-use python --reset # Clear namespace
browser-use python --file script.py # Run Python file
```
-## Agent Tasks
+## Cloud API
-Run AI-powered browser automation tasks.
-
-### Local Mode
-```bash
-browser-use run "Fill the contact form with test data"
-browser-use run "Extract all product prices" --max-steps 50
-browser-use run "task" --llm gpt-4o # Specify LLM model
-```
-
-Requires an LLM API key (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, etc.).
-
-### Remote Mode (Cloud)
-```bash
-browser-use -b remote run "Search for AI news" # US proxy default
-browser-use -b remote run "task" --llm gpt-4o # Specify LLM
-browser-use -b remote run "task" --proxy-country gb # UK proxy
-browser-use -b remote run "task" --session-id # Reuse session
-browser-use -b remote run "task" --no-wait # Async (returns task ID)
-browser-use -b remote run "task" --wait # Wait for completion
-browser-use -b remote run "task" --stream # Stream output
-browser-use -b remote run "task" --flash # Fast mode
-browser-use -b remote run "task" --keep-alive # Keep session alive
-browser-use -b remote run "task" --thinking # Extended reasoning
-browser-use -b remote run "task" --vision # Enable vision (default)
-browser-use -b remote run "task" --no-vision # Disable vision
-browser-use -b remote run "task" --profile # Use cloud profile
-
-# Task configuration
-browser-use -b remote run "task" --start-url https://example.com # Start from URL
-browser-use -b remote run "task" --allowed-domain example.com # Restrict navigation (repeatable)
-browser-use -b remote run "task" --metadata key=value # Task metadata (repeatable)
-browser-use -b remote run "task" --secret API_KEY=xxx # Task secrets (repeatable)
-browser-use -b remote run "task" --skill-id skill-123 # Enable skills (repeatable)
-
-# Structured output and evaluation
-browser-use -b remote run "task" --structured-output '{"type":"object"}' # JSON schema
-browser-use -b remote run "task" --judge # Enable judge mode
-browser-use -b remote run "task" --judge-ground-truth "answer" # Expected answer
-```
-
-Requires `BROWSER_USE_API_KEY`.
-
-## Task Management (Remote Mode)
-
-Manage cloud tasks when using `--browser remote`.
+Generic REST passthrough to the Browser-Use Cloud API, plus cloud browser provisioning.
| Command | Description |
|---------|-------------|
-| `task list` | List recent tasks |
-| `task list --status running` | Filter by status |
-| `task list --session ` | Filter by session ID |
-| `task status ` | Get task status (latest step only) |
-| `task status -c` | Compact: all steps with reasoning |
-| `task status -v` | Verbose: full details |
-| `task status --last 5` | Show last 5 steps |
-| `task status --step 3` | Show specific step number |
-| `task status --reverse` | Show steps newest first |
-| `task stop ` | Stop running task |
-| `task logs ` | Get execution logs |
+| `cloud connect` | Provision cloud browser and connect (zero-config, auto-manages profile) |
+| `cloud login <api-key>` | Save API key |
+| `cloud logout` | Remove API key |
+| `cloud v2 GET <path>` | GET request to API v2 |
+| `cloud v2 POST <path> '<json>'` | POST request to API v2 |
+| `cloud v3 POST <path> '<json>'` | POST request to API v3 |
+| `cloud v2 poll <task-id>` | Poll task until done |
+| `cloud v2 --help` | Show API v2 endpoints (from OpenAPI spec) |
+| `cloud v3 --help` | Show API v3 endpoints |
-## Cloud Sessions (Remote Mode)
+```bash
+# Save API key to ~/.browser-use/config.json
+browser-use cloud login sk-abc123...
-Manage cloud browser sessions.
+# Provision a cloud browser and connect
+browser-use cloud connect
+browser-use state # works normally
+browser-use close # disconnects AND stops cloud browser
-| Command | Description |
-|---------|-------------|
-| `session list` | List cloud sessions |
-| `session list --status active` | Filter by status |
-| `session get ` | Get session details + live URL |
-| `session stop ` | Stop session |
-| `session stop --all` | Stop all active sessions |
-| `session create` | Create new session |
-| `session create --profile ` | With cloud profile |
-| `session create --proxy-country gb` | With geographic proxy |
-| `session create --start-url ` | Start at specific URL |
-| `session create --screen-size 1920x1080` | Custom screen size |
-| `session create --keep-alive` | Keep session alive |
-| `session create --persist-memory` | Persist memory between tasks |
-| `session share ` | Create public share URL |
-| `session share --delete` | Delete public share |
+# List browsers
+browser-use cloud v2 GET /browsers
+
+# Create a task
+browser-use cloud v2 POST /tasks '{"task":"Search for AI news","url":"https://google.com"}'
+
+# Poll until done
+browser-use cloud v2 poll <task-id>
+
+# Remove API key
+browser-use cloud logout
+```
+
+API key stored in `~/.browser-use/config.json` with `0600` permissions.
## Tunnels
@@ -298,55 +254,70 @@ Expose local dev servers to cloud browsers via Cloudflare tunnels.
# Example: Test local dev server with cloud browser
npm run dev & # localhost:3000
browser-use tunnel 3000 # → https://abc.trycloudflare.com
-browser-use -b remote open https://abc.trycloudflare.com
+browser-use cloud connect # Provision cloud browser
+browser-use open https://abc.trycloudflare.com
```
## Profile Management
-### Local Profiles (`-b real`)
-| Command | Description |
-|---------|-------------|
-| `profile list` | List Chrome profiles |
-| `profile cookies ` | Show cookies by domain |
-| `profile sync --from ` | Sync local profile to cloud |
-| `profile sync --from Default --domain youtube.com` | Sync specific domain only |
+The `profile` subcommand delegates to the [profile-use](https://github.com/browser-use/profile-use) Go binary, which syncs local browser cookies to Browser-Use cloud.
-### Cloud Profiles (`-b remote`)
-| Command | Description |
-|---------|-------------|
-| `profile list` | List cloud profiles |
-| `profile list --page 2 --page-size 50` | Pagination |
-| `profile get ` | Get profile details |
-| `profile create` | Create profile |
-| `profile create --name "My Profile"` | Create with name |
-| `profile update --name ` | Rename profile |
-| `profile delete ` | Delete profile |
-
-## Local Session Management
+The binary is managed at `~/.browser-use/bin/profile-use` and auto-downloaded on first use.
| Command | Description |
|---------|-------------|
-| `sessions` | List active sessions |
-| `close` | Close browser session |
+| `profile` | Interactive sync wizard |
+| `profile list` | List detected browsers and profiles |
+| `profile sync --all` | Sync all profiles to cloud |
+| `profile sync --browser "Google Chrome" --profile "Default"` | Sync specific profile |
+| `profile auth --apikey <key>` | Set API key (shared with `cloud login`) |
+| `profile inspect --browser "Google Chrome" --profile "Default"` | Inspect cookies locally |
+| `profile update` | Download/update the profile-use binary |
+
+## Session Management
+
+| Command | Description |
+|---------|-------------|
+| `sessions` | List active browser sessions |
+| `close` | Close current session's browser and daemon |
| `close --all` | Close all sessions |
-| `server status` | Check if server is running |
-| `server stop` | Stop server |
-| `server logs` | View server logs |
+| `--session NAME` | Target a named session (default: "default") |
+
+```bash
+# Default behavior unchanged
+browser-use open https://example.com # uses session 'default'
+browser-use state # talks to 'default' daemon
+
+# Named sessions
+browser-use --session work open https://example.com
+browser-use --session work state
+browser-use --session cloud cloud connect
+
+# List active sessions
+browser-use sessions
+
+# Close specific session
+browser-use --session work close
+
+# Close all sessions
+browser-use close --all
+
+# Env var fallback
+BROWSER_USE_SESSION=work browser-use state
+```
## Global Options
| Option | Description |
|--------|-------------|
-| `--session NAME` | Use named session (default: "default") |
-| `--browser MODE` | Browser mode: chromium, real, remote |
| `--headed` | Show browser window |
-| `--profile NAME` | Browser profile (local name or cloud ID) |
+| `--profile [NAME]` | Use real Chrome (bare `--profile` uses "Default") |
+| `--connect` | Auto-discover and connect to running Chrome via CDP |
+| `--cdp-url <url>` | Connect to existing browser via CDP URL (`http://` or `ws://`) |
+| `--session NAME` | Target a named session (default: "default", env: `BROWSER_USE_SESSION`) |
| `--json` | Output as JSON |
-| `--api-key KEY` | Override API key |
| `--mcp` | Run as MCP server via stdin/stdout |
-**Session behavior**: All commands without `--session` use the same "default" session. The browser stays open and is reused across commands. Use `--session NAME` to run multiple browsers in parallel.
-
## Examples
### Fill a Form
@@ -365,15 +336,6 @@ browser-use open https://news.ycombinator.com
browser-use eval "Array.from(document.querySelectorAll('.titleline a')).slice(0,5).map(a => a.textContent)"
```
-### Multi-Session Workflow
-```bash
-browser-use --session work open https://work.example.com
-browser-use --session personal open https://personal.example.com
-browser-use --session work state
-browser-use --session personal state
-browser-use close --all
-```
-
### Python Automation
```bash
browser-use open https://example.com
@@ -385,19 +347,6 @@ browser.screenshot('scrolled.png')
"
```
-### Cloud Agent with Session Reuse
-```bash
-# Start task, keep session alive
-browser-use -b remote run "Log into example.com" --keep-alive --no-wait
-# → task_id: task-123, session_id: sess-456
-
-# Check task status
-browser-use task status task-123
-
-# Run another task in same session (preserves login)
-browser-use -b remote run "Go to settings" --session-id sess-456
-```
-
## Claude Code Skill
For [Claude Code](https://claude.ai/code), a skill provides richer context for browser automation:
@@ -410,15 +359,34 @@ curl -o ~/.claude/skills/browser-use/SKILL.md \
## How It Works
-The CLI uses a session server architecture:
+The CLI uses a multi-session daemon architecture:
-1. First command starts a background server (browser stays open)
+1. First command starts a background daemon for that session (browser stays open)
2. Subsequent commands communicate via Unix socket (or TCP on Windows)
3. Browser persists across commands for fast interaction
-4. Server auto-starts when needed, stops with `browser-use server stop`
+4. Each `--session` gets its own daemon, socket, and PID file in `~/.browser-use/`
+5. Daemon auto-starts when needed, auto-exits when browser dies, or stops with `browser-use close`
This gives you ~50ms command latency instead of waiting for browser startup each time.
+### File Layout
+
+All CLI-managed files live under `~/.browser-use/` (override with `BROWSER_USE_HOME`):
+
+```
+~/.browser-use/
+├── config.json # API key, settings (shared with profile-use)
+├── bin/
+│ └── profile-use # Managed Go binary (auto-downloaded)
+├── tunnels/
+│ ├── {port}.json # Tunnel metadata
+│ └── {port}.log # Tunnel logs
+├── default.state.json # Daemon lifecycle state (phase, PID, config)
+├── default.sock # Daemon socket (ephemeral)
+├── default.pid # Daemon PID (ephemeral)
+└── cli.log # Daemon log
+```
+
Windows Troubleshooting
@@ -444,11 +412,11 @@ echo $env:PATH
& "C:\Program Files\Git\bin\bash.exe" -c 'browser-use --help'
```
-### "Failed to start session server" error
+### "Failed to start daemon" error
Kill zombie processes:
```powershell
-# Find process on port
-netstat -ano | findstr 49698
+# Find browser-use Python processes
+tasklist | findstr python
# Kill by PID
taskkill /PID /F
diff --git a/browser_use/skill_cli/__init__.py b/browser_use/skill_cli/__init__.py
index 49b3c444a..dd6f24c90 100644
--- a/browser_use/skill_cli/__init__.py
+++ b/browser_use/skill_cli/__init__.py
@@ -1,14 +1,13 @@
"""Browser-use CLI package.
This package provides a fast command-line interface for browser automation.
-The CLI uses a session server architecture for persistent browser sessions.
+The CLI uses a daemon architecture for persistent browser sessions.
Usage:
browser-use open https://example.com
browser-use click 5
browser-use type "Hello World"
browser-use python "print(browser.url)"
- browser-use run "Fill the contact form"
browser-use close
"""
diff --git a/browser_use/skill_cli/actions.py b/browser_use/skill_cli/actions.py
new file mode 100644
index 000000000..cea6a96c3
--- /dev/null
+++ b/browser_use/skill_cli/actions.py
@@ -0,0 +1,201 @@
+"""Direct action execution for CLI daemon — no event bus dispatch.
+
+Wraps DefaultActionWatchdog methods and DomService for direct calling.
+The watchdog instance is NOT registered on the event bus — it's just
+used as a library of action implementations.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+from bubus import EventBus
+
+from browser_use.browser.events import (
+ GoBackEvent,
+ SelectDropdownOptionEvent,
+ SendKeysEvent,
+ TypeTextEvent,
+ UploadFileEvent,
+)
+from browser_use.browser.watchdogs.default_action_watchdog import DefaultActionWatchdog
+from browser_use.dom.service import DomService
+from browser_use.dom.views import EnhancedDOMTreeNode, SerializedDOMState
+
+if TYPE_CHECKING:
+ from browser_use.browser.session import BrowserSession
+ from browser_use.browser.views import BrowserStateSummary, PageInfo
+
+logger = logging.getLogger('browser_use.skill_cli.actions')
+
+
+class ActionHandler:
+ """Execute browser actions directly without the event bus.
+
+ Uses DefaultActionWatchdog methods for complex actions (click, type, keys, etc.)
+ and DomService for DOM snapshots. All other actions use direct CDP calls.
+ """
+
+ def __init__(self, browser_session: BrowserSession) -> None:
+ self.bs = browser_session
+ # Create watchdog instance — NOT registered on event bus
+ self._watchdog = DefaultActionWatchdog(
+ event_bus=EventBus(), # dummy, never dispatched to
+ browser_session=browser_session,
+ )
+ self._dom_service: DomService | None = None
+
+ async def navigate(self, url: str) -> None:
+ """Navigate the focused tab to a URL."""
+ assert self.bs.agent_focus_target_id is not None, 'No focused tab'
+ await self.bs._navigate_and_wait(url, self.bs.agent_focus_target_id)
+
+ async def click_element(self, node: EnhancedDOMTreeNode) -> dict[str, Any] | None:
+ """Click an element using the watchdog's full implementation (with fallbacks)."""
+ return await self._watchdog._click_element_node_impl(node)
+
+ async def click_coordinate(self, x: int, y: int) -> dict[str, Any] | None:
+ """Click at coordinates."""
+ from browser_use.browser.events import ClickCoordinateEvent
+
+ event = ClickCoordinateEvent(coordinate_x=x, coordinate_y=y)
+ return await self._watchdog.on_ClickCoordinateEvent(event)
+
+ async def type_text(self, node: EnhancedDOMTreeNode, text: str) -> dict[str, Any] | None:
+ """Type text into an element."""
+ event = TypeTextEvent(node=node, text=text)
+ return await self._watchdog.on_TypeTextEvent(event)
+
+ async def scroll(self, direction: str, amount: int) -> None:
+ """Scroll the page using JS (CDP gesture doesn't work in --connect mode)."""
+ if direction in ('down', 'up'):
+ x, y = 0, (amount if direction == 'down' else -amount)
+ else:
+ x, y = (amount if direction == 'right' else -amount), 0
+ cdp_session = await self.bs.get_or_create_cdp_session()
+ assert cdp_session is not None, 'No CDP session for scroll'
+ await cdp_session.cdp_client.send.Runtime.evaluate(
+ params={'expression': f'window.scrollBy({x}, {y})', 'awaitPromise': False},
+ session_id=cdp_session.session_id,
+ )
+
+ async def go_back(self) -> None:
+ """Go back in history."""
+ event = GoBackEvent()
+ await self._watchdog.on_GoBackEvent(event)
+
+ async def send_keys(self, keys: str) -> None:
+ """Send keyboard keys."""
+ event = SendKeysEvent(keys=keys)
+ await self._watchdog.on_SendKeysEvent(event)
+
+ async def select_dropdown(self, node: EnhancedDOMTreeNode, text: str) -> dict[str, str]:
+ """Select a dropdown option."""
+ event = SelectDropdownOptionEvent(node=node, text=text)
+ return await self._watchdog.on_SelectDropdownOptionEvent(event)
+
+ async def upload_file(self, node: EnhancedDOMTreeNode, file_path: str) -> None:
+ """Upload a file to a file input element."""
+ event = UploadFileEvent(node=node, file_path=file_path)
+ await self._watchdog.on_UploadFileEvent(event)
+
+ async def get_state(self) -> BrowserStateSummary:
+ """Build DOM via DomService directly (no DOMWatchdog, no event bus)."""
+ from browser_use.browser.views import BrowserStateSummary, PageInfo
+
+ if self._dom_service is None:
+ self._dom_service = DomService(browser_session=self.bs)
+
+ page_url = await self.bs.get_current_page_url()
+
+ # Fast path for non-http pages
+ if page_url.lower().split(':', 1)[0] not in ('http', 'https'):
+ return BrowserStateSummary(
+ dom_state=SerializedDOMState(_root=None, selector_map={}),
+ url=page_url,
+ title='Empty Tab',
+ tabs=await self.bs.get_tabs(),
+ screenshot=None,
+ page_info=None,
+ )
+
+ # Build DOM and take screenshot in parallel
+ import asyncio
+
+ dom_task = asyncio.create_task(self._dom_service.get_serialized_dom_tree())
+ screenshot_task = asyncio.create_task(self.bs.take_screenshot())
+
+ dom_state: SerializedDOMState | None = None
+ screenshot_b64: str | None = None
+
+ try:
+ dom_state, _tree, _timing = await dom_task
+ except Exception as e:
+ logger.warning(f'DOM build failed: {e}')
+ dom_state = SerializedDOMState(_root=None, selector_map={})
+
+ try:
+ screenshot_bytes = await screenshot_task
+ import base64
+
+ screenshot_b64 = base64.b64encode(screenshot_bytes).decode() if screenshot_bytes else None
+ except Exception as e:
+ logger.warning(f'Screenshot failed: {e}')
+
+ # Update cached selector map for element lookups
+ if dom_state and dom_state.selector_map:
+ self.bs.update_cached_selector_map(dom_state.selector_map)
+
+ # Get page info
+ page_info: PageInfo | None = None
+ try:
+ cdp_session = await self.bs.get_or_create_cdp_session(target_id=None, focus=False)
+ if cdp_session:
+ metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)
+ css_metrics = metrics.get('cssLayoutViewport', {})
+ content_size = metrics.get('cssContentSize', metrics.get('contentSize', {}))
+ visual_viewport = metrics.get('cssVisualViewport', metrics.get('visualViewport', {}))
+ page_info = PageInfo(
+ viewport_width=int(css_metrics.get('clientWidth', 0)),
+ viewport_height=int(css_metrics.get('clientHeight', 0)),
+ page_width=int(content_size.get('width', 0)),
+ page_height=int(content_size.get('height', 0)),
+ scroll_x=int(visual_viewport.get('pageX', 0)),
+ scroll_y=int(visual_viewport.get('pageY', 0)),
+ pixels_above=int(visual_viewport.get('pageY', 0)),
+ pixels_below=max(
+ 0,
+ int(content_size.get('height', 0))
+ - int(css_metrics.get('clientHeight', 0))
+ - int(visual_viewport.get('pageY', 0)),
+ ),
+ pixels_left=0,
+ pixels_right=0,
+ )
+ except Exception as e:
+ logger.debug(f'Failed to get page info: {e}')
+
+ tabs = await self.bs.get_tabs()
+
+ # Use focused tab's title, not tabs[0]
+ title = ''
+ focused_id = self.bs.agent_focus_target_id
+ found_focused = False
+ for tab in tabs:
+ if tab.target_id == focused_id:
+ title = tab.title
+ found_focused = True
+ break
+ if not found_focused and tabs:
+ title = tabs[0].title
+
+ return BrowserStateSummary(
+ dom_state=dom_state,
+ url=page_url,
+ title=title,
+ tabs=tabs,
+ screenshot=screenshot_b64,
+ page_info=page_info,
+ closed_popup_messages=self.bs._closed_popup_messages.copy(),
+ )
diff --git a/browser_use/skill_cli/api_key.py b/browser_use/skill_cli/api_key.py
deleted file mode 100644
index 172cc8150..000000000
--- a/browser_use/skill_cli/api_key.py
+++ /dev/null
@@ -1,167 +0,0 @@
-"""API key management for browser-use CLI."""
-
-import json
-import os
-import sys
-from pathlib import Path
-
-
-class APIKeyRequired(Exception):
- """Raised when API key is required but not provided."""
-
- pass
-
-
-def get_config_path() -> Path:
- """Get browser-use config file path."""
- if sys.platform == 'win32':
- base = Path(os.environ.get('APPDATA', Path.home()))
- else:
- base = Path(os.environ.get('XDG_CONFIG_HOME', Path.home() / '.config'))
- return base / 'browser-use' / 'config.json'
-
-
-def require_api_key(feature: str = 'this feature') -> str:
- """Get API key or raise helpful error.
-
- Checks in order:
- 1. BROWSER_USE_API_KEY environment variable
- 2. Config file (~/.config/browser-use/config.json)
- 3. Interactive prompt (if TTY)
- 4. Raises APIKeyRequired with helpful message
- """
- # 1. Check environment
- key = os.environ.get('BROWSER_USE_API_KEY')
- if key:
- return key
-
- # 2. Check config file
- config_path = get_config_path()
- if config_path.exists():
- try:
- config = json.loads(config_path.read_text())
- if key := config.get('api_key'):
- return key
- except Exception:
- pass
-
- # 3. Interactive prompt (if TTY)
- if sys.stdin.isatty() and sys.stdout.isatty():
- return prompt_for_api_key(feature)
-
- # 4. Error with helpful message
- raise APIKeyRequired(
- f"""
-╭─────────────────────────────────────────────────────────────╮
-│ 🔑 Browser-Use API Key Required │
-│ │
-│ {feature} requires an API key. │
-│ │
-│ Get yours at: https://browser-use.com/new-api-key │
-│ │
-│ Then set it via: │
-│ export BROWSER_USE_API_KEY=your_key_here │
-│ │
-│ Or add to {config_path}: │
-│ {{"api_key": "your_key_here"}} │
-╰─────────────────────────────────────────────────────────────╯
-"""
- )
-
-
-def prompt_for_api_key(feature: str) -> str:
- """Interactive prompt for API key."""
- print(
- f"""
-╭─────────────────────────────────────────────────────────────╮
-│ 🔑 Browser-Use API Key Required │
-│ │
-│ {feature} requires an API key. │
-│ Get yours at: https://browser-use.com/new-api-key │
-╰─────────────────────────────────────────────────────────────╯
-"""
- )
-
- try:
- key = input('Enter API key: ').strip()
- except (EOFError, KeyboardInterrupt):
- raise APIKeyRequired('No API key provided')
-
- if not key:
- raise APIKeyRequired('No API key provided')
-
- try:
- save = input('Save to config? [y/N]: ').strip().lower()
- if save == 'y':
- save_api_key(key)
- except (EOFError, KeyboardInterrupt):
- pass
-
- return key
-
-
-def save_api_key(key: str) -> None:
- """Save API key to config file."""
- config_path = get_config_path()
- config_path.parent.mkdir(parents=True, exist_ok=True)
-
- config: dict = {}
- if config_path.exists():
- try:
- config = json.loads(config_path.read_text())
- except Exception:
- pass
-
- config['api_key'] = key
- config_path.write_text(json.dumps(config, indent=2))
- # Restrict permissions to owner only (0600)
- config_path.chmod(0o600)
- print(f'Saved to {config_path}')
-
-
-def get_api_key() -> str | None:
- """Get API key if available, without raising error."""
- try:
- return require_api_key('API key check')
- except APIKeyRequired:
- return None
-
-
-def check_api_key() -> dict[str, bool | str | None]:
- """Check API key availability without interactive prompts.
-
- Returns:
- Dict with keys:
- - 'available': bool - whether API key is configured
- - 'source': str | None - where it came from ('env', 'config', or None)
- - 'key_prefix': str | None - first 8 chars of key (for display)
- """
- # Check environment
- key = os.environ.get('BROWSER_USE_API_KEY')
- if key:
- return {
- 'available': True,
- 'source': 'env',
- 'key_prefix': key[:8] if len(key) >= 8 else key,
- }
-
- # Check config file
- config_path = get_config_path()
- if config_path.exists():
- try:
- config = json.loads(config_path.read_text())
- if key := config.get('api_key'):
- return {
- 'available': True,
- 'source': 'config',
- 'key_prefix': key[:8] if len(key) >= 8 else key,
- }
- except Exception:
- pass
-
- # Not available
- return {
- 'available': False,
- 'source': None,
- 'key_prefix': None,
- }
diff --git a/browser_use/skill_cli/browser.py b/browser_use/skill_cli/browser.py
new file mode 100644
index 000000000..9582d2a7b
--- /dev/null
+++ b/browser_use/skill_cli/browser.py
@@ -0,0 +1,225 @@
+"""Lightweight BrowserSession subclass for the CLI daemon.
+
+Skips watchdogs, event bus handlers, and auto-reconnect for ALL modes.
+Launches browser if needed, then calls connect() directly.
+All inherited methods (get_element_by_index, take_screenshot, etc.)
+work because this IS a BrowserSession.
+"""
+
+from __future__ import annotations
+
+import logging
+
+import psutil
+
+from browser_use.browser.session import BrowserSession
+
+logger = logging.getLogger('browser_use.skill_cli.browser')
+
+
+class CLIBrowserSession(BrowserSession):
+ """BrowserSession that skips watchdogs and event bus for all modes.
+
+ For --connect: connects to existing Chrome via CDP URL.
+ For managed Chromium: launches browser, gets CDP URL, connects.
+ For cloud: provisions browser, gets CDP URL, connects.
+
+ All three modes converge at connect() — no watchdogs, no event bus.
+ """
+
+ _browser_process: psutil.Process | None = None # type: ignore[assignment]
+
+ async def start(self) -> None:
+ """Launch/provision browser if needed, then connect lightweight."""
+ if self.cdp_url:
+ # --connect or --cdp-url: CDP URL already known
+ pass
+ elif self.browser_profile.use_cloud:
+ # Cloud: provision browser via API
+ await self._provision_cloud_browser()
+ else:
+ # Managed Chromium: launch browser process
+ await self._launch_local_browser()
+
+ # All modes: lightweight CDP connection (no watchdogs)
+ await self.connect()
+
+ # Prevent heavy monitoring on future tabs
+ if self.session_manager:
+
+ async def _noop(cdp_session: object) -> None:
+ pass
+
+ self.session_manager._enable_page_monitoring = _noop # type: ignore[assignment]
+
+ # Disable auto-reconnect — daemon should die when CDP drops
+ self._intentional_stop = True
+
+ # Register popup/dialog handler so JS alerts don't freeze Chrome
+ await self._register_dialog_handler()
+
+ async def _register_dialog_handler(self) -> None:
+ """Register CDP handler to auto-dismiss JS dialogs (alert, confirm, prompt).
+
+ Without this, any JS dialog freezes all CDP commands until manually dismissed.
+ Messages are stored in _closed_popup_messages for inclusion in state output.
+ """
+ import asyncio as _asyncio
+
+ if not self._cdp_client_root:
+ return
+
+ async def handle_dialog(event_data: dict, session_id: str | None = None) -> None:
+ try:
+ dialog_type = event_data.get('type', 'alert')
+ message = event_data.get('message', '')
+ if message:
+ self._closed_popup_messages.append(f'[{dialog_type}] {message}')
+ # Accept alerts/confirms/beforeunload, dismiss prompts
+ should_accept = dialog_type in ('alert', 'confirm', 'beforeunload')
+ logger.info(f'Auto-{"accepting" if should_accept else "dismissing"} {dialog_type}: {message[:100]}')
+ if not self._cdp_client_root:
+ return
+ await _asyncio.wait_for(
+ self._cdp_client_root.send.Page.handleJavaScriptDialog(
+ params={'accept': should_accept},
+ session_id=session_id,
+ ),
+ timeout=0.5,
+ )
+ except Exception:
+ pass
+
+ # Try to enable Page domain on root client (may fail — not all CDP targets support it)
+ try:
+ await self._cdp_client_root.send.Page.enable()
+ except Exception:
+ pass
+ self._cdp_client_root.register.Page.javascriptDialogOpening(handle_dialog) # type: ignore[arg-type]
+
+ async def _launch_local_browser(self) -> None:
+ """Launch Chromium using LocalBrowserWatchdog's launch logic."""
+ from bubus import EventBus
+
+ from browser_use.browser.watchdogs.local_browser_watchdog import LocalBrowserWatchdog
+
+ # Instantiate watchdog as plain object — NOT registered on event bus
+ launcher = LocalBrowserWatchdog(event_bus=EventBus(), browser_session=self)
+ process, cdp_url = await launcher._launch_browser()
+ self._browser_process = process
+ self.browser_profile.cdp_url = cdp_url
+ logger.info(f'Launched browser (PID {process.pid}), CDP: {cdp_url}')
+
+ async def _provision_cloud_browser(self) -> None:
+ """Provision a cloud browser and set the CDP URL."""
+ import os
+
+ from browser_use.browser.cloud.views import CreateBrowserRequest
+
+ # Override cloud API base URL if set (CLI injects this into daemon env).
+ # CloudBrowserClient expects the host URL (it appends /api/v2/... internally).
+ cloud_base = os.environ.get('BROWSER_USE_CLOUD_BASE_URL')
+ if cloud_base:
+ self._cloud_browser_client.api_base_url = cloud_base.rstrip('/')
+
+ # Ensure CLI has an API key from config.json before proceeding.
+ from browser_use.skill_cli.config import get_config_value
+
+ if not get_config_value('api_key'):
+ from browser_use.browser.cloud.views import CloudBrowserAuthError
+
+ raise CloudBrowserAuthError(
+ 'No API key configured. Run `browser-use cloud login ` or `browser-use cloud signup`.'
+ )
+
+ cloud_params = self.browser_profile.cloud_browser_params or CreateBrowserRequest()
+ # Set recording from CLI config (defaults to True)
+ from browser_use.skill_cli.config import get_config_value
+
+ cloud_params.enable_recording = bool(get_config_value('cloud_connect_recording'))
+
+ try:
+ cloud_response = await self._cloud_browser_client.create_browser(cloud_params)
+ except Exception as e:
+ # If profile is invalid, create a new one and retry once
+ if 'profile' in str(e).lower() or '422' in str(e):
+ logger.info('Cloud profile invalid, creating new one and retrying')
+ from browser_use.skill_cli.commands.cloud import _create_cloud_profile_inner
+
+ api_key = get_config_value('api_key')
+ if not api_key:
+ raise
+ new_profile_id = _create_cloud_profile_inner(str(api_key))
+ cloud_params.profile_id = new_profile_id
+ cloud_response = await self._cloud_browser_client.create_browser(cloud_params)
+ else:
+ raise
+ self.browser_profile.cdp_url = cloud_response.cdpUrl
+ self.browser_profile.is_local = False
+ logger.info(f'Cloud browser provisioned, CDP: {cloud_response.cdpUrl}')
+
+ async def stop(self) -> None:
+ """Disconnect from the browser.
+
+ For --connect/--cdp-url: just close the websocket (we don't own the browser).
+ For cloud: stop the remote browser via API before disconnecting.
+ """
+ self._intentional_stop = True
+ # Stop cloud browser if we provisioned one
+ if self.browser_profile.use_cloud and self._cloud_browser_client.current_session_id:
+ try:
+ import asyncio as _asyncio
+
+ await _asyncio.wait_for(self._cloud_browser_client.stop_browser(), timeout=5.0)
+ except Exception as e:
+ logger.debug(f'Error stopping cloud browser: {e}')
+ if self._cdp_client_root:
+ try:
+ await self._cdp_client_root.stop()
+ except Exception as e:
+ logger.debug(f'Error closing CDP client: {e}')
+ self._cdp_client_root = None # type: ignore[assignment]
+ if self.session_manager:
+ try:
+ await self.session_manager.clear()
+ except Exception as e:
+ logger.debug(f'Error clearing session manager: {e}')
+ self.session_manager = None
+ self.agent_focus_target_id = None
+ self._cached_selector_map.clear()
+
+ async def kill(self) -> None:
+ """Send Browser.close to kill the browser, then disconnect.
+
+ For managed Chromium: sends Browser.close CDP command + terminates process.
+ """
+ if self._cdp_client_root:
+ try:
+ await self._cdp_client_root.send.Browser.close()
+ except Exception:
+ pass
+ await self.stop()
+ # Force kill the process if we launched it and it's still alive
+ if self._browser_process:
+ try:
+ if self._browser_process.is_running():
+ self._browser_process.terminate()
+ self._browser_process.wait(timeout=5)
+ except Exception:
+ try:
+ self._browser_process.kill()
+ except Exception:
+ pass
+ self._browser_process = None
+
+ @property
+ def is_cdp_connected(self) -> bool:
+ """Check if CDP WebSocket connection is alive."""
+ if self._cdp_client_root is None or self._cdp_client_root.ws is None:
+ return False
+ try:
+ from websockets.protocol import State
+
+ return self._cdp_client_root.ws.state is State.OPEN
+ except Exception:
+ return False
diff --git a/browser_use/skill_cli/commands/__init__.py b/browser_use/skill_cli/commands/__init__.py
index cc9403b16..37e4849ad 100644
--- a/browser_use/skill_cli/commands/__init__.py
+++ b/browser_use/skill_cli/commands/__init__.py
@@ -1,23 +1,15 @@
"""Command handlers for browser-use CLI."""
from browser_use.skill_cli.commands import (
- agent,
browser,
- cloud_session,
- cloud_task,
doctor,
python_exec,
- session,
setup,
)
__all__ = [
- 'agent',
'browser',
- 'cloud_session',
- 'cloud_task',
'doctor',
'python_exec',
- 'session',
'setup',
]
diff --git a/browser_use/skill_cli/commands/agent.py b/browser_use/skill_cli/commands/agent.py
deleted file mode 100644
index 6609cb945..000000000
--- a/browser_use/skill_cli/commands/agent.py
+++ /dev/null
@@ -1,335 +0,0 @@
-"""Agent task command handler."""
-
-import logging
-import os
-from typing import Any
-
-from browser_use.skill_cli.api_key import APIKeyRequired, require_api_key
-from browser_use.skill_cli.sessions import SessionInfo
-
-logger = logging.getLogger(__name__)
-
-# Cloud-only flags that only work in remote mode
-CLOUD_ONLY_FLAGS = [
- 'session_id',
- 'proxy_country',
- 'wait',
- 'stream',
- 'flash',
- 'keep_alive',
- 'thinking',
- 'start_url',
- 'metadata',
- 'secret',
- 'allowed_domain',
- 'skill_id',
- 'structured_output',
- 'judge',
- 'judge_ground_truth',
-]
-
-
-async def handle(session: SessionInfo, params: dict[str, Any]) -> Any:
- """Handle agent run command.
-
- Routes based on browser mode:
- - Remote mode (--browser remote): Uses Cloud API with US proxy by default
- - Local mode (default): Uses local browser-use agent
- """
- task = params.get('task')
- if not task:
- return {'success': False, 'error': 'No task provided'}
-
- # Route based on browser mode
- if session.browser_mode == 'remote':
- # Remote mode requires Browser-Use API key
- try:
- require_api_key('Cloud agent tasks')
- except APIKeyRequired as e:
- return {'success': False, 'error': str(e)}
- return await _handle_cloud_task(params)
- else:
- # Check if user tried to use cloud-only flags in local mode
- used_cloud_flags = [f for f in CLOUD_ONLY_FLAGS if params.get(f)]
- if used_cloud_flags:
- from browser_use.skill_cli.install_config import is_mode_available
-
- flags_str = ', '.join(f'--{f.replace("_", "-")}' for f in used_cloud_flags)
-
- if is_mode_available('remote'):
- # Remote is available, user just needs to use it
- return {
- 'success': False,
- 'error': f'Cloud-only flags used in local mode: {flags_str}\nUse --browser remote to enable cloud features.',
- }
- else:
- # Remote not installed (--local-only install)
- return {
- 'success': False,
- 'error': f'Cloud-only flags require remote mode: {flags_str}\n'
- f'Remote mode is not installed. Reinstall to enable:\n'
- f' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only\n'
- f' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full',
- }
- return await _handle_local_task(session, params)
-
-
-async def _handle_cloud_task(params: dict[str, Any]) -> Any:
- """Handle task execution via Cloud API.
-
- By default uses US proxy for all cloud tasks.
- """
- from browser_use.skill_cli.commands import cloud_session, cloud_task
-
- task = params['task']
-
- # Handle vision flag (--vision vs --no-vision)
- vision: bool | None = None
- if params.get('vision'):
- vision = True
- elif params.get('no_vision'):
- vision = False
-
- # Parse key=value list params
- metadata = _parse_key_value_list(params.get('metadata'))
- secrets = _parse_key_value_list(params.get('secret'))
-
- # Build session params - only include what user explicitly set
- session_id = params.get('session_id')
- profile_id = params.get('profile')
- proxy_country = params.get('proxy_country')
-
- try:
- logger.info(f'Creating cloud task: {task}')
-
- # Create session first if profile or proxy specified and no session_id
- if (profile_id or proxy_country) and not session_id:
- session = cloud_session.create_session(
- profile_id=profile_id,
- proxy_country=proxy_country,
- keep_alive=params.get('keep_alive'),
- )
- session_id = session.id
- logger.info(f'Created cloud session: {session_id}')
-
- # Create cloud task - only pass what user explicitly set
- task_response = cloud_task.create_task(
- task=task,
- llm=params.get('llm'),
- session_id=session_id,
- max_steps=params.get('max_steps'),
- flash_mode=params.get('flash'),
- thinking=params.get('thinking'),
- vision=vision,
- start_url=params.get('start_url'),
- metadata=metadata,
- secrets=secrets,
- allowed_domains=params.get('allowed_domain'),
- skill_ids=params.get('skill_id'),
- structured_output=params.get('structured_output'),
- judge=params.get('judge'),
- judge_ground_truth=params.get('judge_ground_truth'),
- )
-
- task_id = task_response.id
- response_session_id = task_response.session_id
-
- if not task_id:
- return {
- 'success': False,
- 'error': 'Cloud API did not return a task ID',
- 'task': task,
- }
-
- logger.info(f'Cloud task created: {task_id}')
-
- # Return immediately unless --wait is specified
- if not params.get('wait'):
- return {
- 'success': True,
- 'task_id': task_id,
- 'session_id': response_session_id,
- 'message': 'Task started. Use "browser-use task status " to check progress.',
- }
-
- # Poll until complete
- logger.info('Waiting for task completion...')
- result = await cloud_task.poll_until_complete(task_id, stream=params.get('stream', False))
-
- return {
- 'success': True,
- 'task': task,
- 'task_id': task_id,
- 'session_id': response_session_id,
- 'status': result.status,
- 'output': result.output,
- 'cost': result.cost,
- 'done': result.status == 'finished',
- }
-
- except Exception as e:
- logger.exception(f'Cloud task failed: {e}')
- return {
- 'success': False,
- 'error': str(e),
- 'task': task,
- }
-
-
-def _parse_key_value_list(items: list[str] | None) -> dict[str, str | None] | None:
- """Parse a list of 'key=value' strings into a dict."""
- if not items:
- return None
- result: dict[str, str | None] = {}
- for item in items:
- if '=' in item:
- key, value = item.split('=', 1)
- result[key] = value
- return result if result else None
-
-
-async def _handle_local_task(session: SessionInfo, params: dict[str, Any]) -> Any:
- """Handle task execution locally with browser-use agent."""
- task = params['task']
- max_steps = params.get('max_steps')
- model = params.get('llm') # Optional model override
-
- try:
- # Import agent and LLM
- from browser_use.agent.service import Agent
-
- # Try to get LLM from environment (with optional model override)
- llm = await get_llm(model=model)
- if llm is None:
- if model:
- return {
- 'success': False,
- 'error': f'Could not initialize model "{model}". '
- f'Make sure the appropriate API key is set (OPENAI_API_KEY, ANTHROPIC_API_KEY, or GOOGLE_API_KEY).',
- }
- return {
- 'success': False,
- 'error': 'No LLM configured. Set BROWSER_USE_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, or GOOGLE_API_KEY',
- }
-
- # Create and run agent
- agent = Agent(
- task=task,
- llm=llm,
- browser_session=session.browser_session,
- )
-
- logger.info(f'Running local agent task: {task}')
- run_kwargs = {}
- if max_steps is not None:
- run_kwargs['max_steps'] = max_steps
- result = await agent.run(**run_kwargs)
-
- # Extract result info
- final_result = result.final_result() if result else None
-
- return {
- 'success': True,
- 'task': task,
- 'steps': len(result) if result else 0,
- 'result': str(final_result) if final_result else None,
- 'done': result.is_done() if result else False,
- }
-
- except Exception as e:
- logger.exception(f'Local agent task failed: {e}')
- return {
- 'success': False,
- 'error': str(e),
- 'task': task,
- }
-
-
-def _get_verified_models() -> dict[str, set[str]]:
- """Extract verified model names from SDK sources of truth."""
- import typing
-
- from anthropic.types.model_param import ModelParam
- from openai.types.shared.chat_model import ChatModel
-
- from browser_use.llm.google.chat import VerifiedGeminiModels
-
- # OpenAI: ChatModel is a Literal type
- openai_models = set(typing.get_args(ChatModel))
-
- # Anthropic: ModelParam is Union[Literal[...], str] - extract the Literal
- anthropic_literal = typing.get_args(ModelParam)[0]
- anthropic_models = set(typing.get_args(anthropic_literal))
-
- # Google: VerifiedGeminiModels Literal
- google_models = set(typing.get_args(VerifiedGeminiModels))
-
- # Browser-Use: cloud models
- browser_use_models = {'bu-latest', 'bu-1-0', 'bu-2-0'}
-
- return {
- 'openai': openai_models,
- 'anthropic': anthropic_models,
- 'google': google_models,
- 'browser-use': browser_use_models,
- }
-
-
-_VERIFIED_MODELS: dict[str, set[str]] | None = None
-
-
-def _get_provider_for_model(model: str) -> str | None:
- """Determine the provider by checking SDK verified model lists."""
- global _VERIFIED_MODELS
- if _VERIFIED_MODELS is None:
- _VERIFIED_MODELS = _get_verified_models()
-
- for provider, models in _VERIFIED_MODELS.items():
- if model in models:
- return provider
-
- return None
-
-
-def get_llm(model: str | None = None) -> Any:
- """Get LLM instance from environment configuration.
-
- Args:
- model: Optional model name to use. If provided, will instantiate
- the appropriate provider for that model. If not provided,
- auto-detects from available API keys.
-
- Supported providers: OpenAI, Anthropic, Google, Browser-Use.
- Model names are validated against each SDK's verified model list.
- """
- from browser_use.llm import ChatAnthropic, ChatBrowserUse, ChatGoogle, ChatOpenAI
-
- if model:
- provider = _get_provider_for_model(model)
-
- if provider == 'openai':
- return ChatOpenAI(model=model)
- elif provider == 'anthropic':
- return ChatAnthropic(model=model)
- elif provider == 'google':
- return ChatGoogle(model=model)
- elif provider == 'browser-use':
- return ChatBrowserUse(model=model)
- else:
- logger.warning(f'Unknown model: {model}. Not in any verified model list.')
- return None
-
- # No model specified - auto-detect from available API keys
- if os.environ.get('BROWSER_USE_API_KEY'):
- return ChatBrowserUse()
-
- if os.environ.get('OPENAI_API_KEY'):
- return ChatOpenAI(model='o3')
-
- if os.environ.get('ANTHROPIC_API_KEY'):
- return ChatAnthropic(model='claude-sonnet-4-0')
-
- if os.environ.get('GOOGLE_API_KEY'):
- return ChatGoogle(model='gemini-flash-latest')
-
- return None
diff --git a/browser_use/skill_cli/commands/browser.py b/browser_use/skill_cli/commands/browser.py
index b7516ef42..221e45aad 100644
--- a/browser_use/skill_cli/commands/browser.py
+++ b/browser_use/skill_cli/commands/browser.py
@@ -19,10 +19,10 @@ COMMANDS = {
'back',
'screenshot',
'state',
- 'switch',
- 'close-tab',
+ 'tab',
'keys',
'select',
+ 'upload',
'eval',
'extract',
'cookies',
@@ -81,18 +81,16 @@ async def _get_element_center(session: SessionInfo, node: Any) -> tuple[float, f
async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> Any:
"""Handle browser control command."""
bs = session.browser_session
+ actions = session.actions
+ if actions is None:
+ return {'error': 'ActionHandler not initialized'}
if action == 'open':
url = params['url']
- # Ensure URL has scheme
if not url.startswith(('http://', 'https://', 'file://')):
url = 'https://' + url
-
- from browser_use.browser.events import NavigateToUrlEvent
-
- await bs.event_bus.dispatch(NavigateToUrlEvent(url=url))
+ await actions.navigate(url)
result: dict[str, Any] = {'url': url}
- # Add live preview URL for cloud browsers
if bs.browser_profile.use_cloud and bs.cdp_url:
from urllib.parse import quote
@@ -100,18 +98,22 @@ async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> A
return result
elif action == 'click':
- from browser_use.browser.events import ClickElementEvent
-
- index = params['index']
- # Look up node from selector map
- node = await bs.get_element_by_index(index)
- if node is None:
- return {'error': f'Element index {index} not found - page may have changed'}
- await bs.event_bus.dispatch(ClickElementEvent(node=node))
- return {'clicked': index}
+ args = params.get('args', [])
+ if len(args) == 2:
+ x, y = args
+ await actions.click_coordinate(x, y)
+ return {'clicked_coordinate': {'x': x, 'y': y}}
+ elif len(args) == 1:
+ index = args[0]
+ node = await bs.get_element_by_index(index)
+ if node is None:
+ return {'error': f'Element index {index} not found - page may have changed'}
+ await actions.click_element(node)
+ return {'clicked': index}
+ else:
+ return {'error': 'Usage: click or click '}
elif action == 'type':
- # Type into currently focused element using CDP directly
text = params['text']
cdp_session = await bs.get_or_create_cdp_session(target_id=None, focus=False)
if not cdp_session:
@@ -123,30 +125,23 @@ async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> A
return {'typed': text}
elif action == 'input':
- from browser_use.browser.events import ClickElementEvent, TypeTextEvent
-
index = params['index']
text = params['text']
- # Look up node from selector map
node = await bs.get_element_by_index(index)
if node is None:
return {'error': f'Element index {index} not found - page may have changed'}
- await bs.event_bus.dispatch(ClickElementEvent(node=node))
- await bs.event_bus.dispatch(TypeTextEvent(node=node, text=text))
+ await actions.click_element(node)
+ await actions.type_text(node, text)
return {'input': text, 'element': index}
elif action == 'scroll':
- from browser_use.browser.events import ScrollEvent
-
direction = params.get('direction', 'down')
amount = params.get('amount', 500)
- await bs.event_bus.dispatch(ScrollEvent(direction=direction, amount=amount))
+ await actions.scroll(direction, amount)
return {'scrolled': direction, 'amount': amount}
elif action == 'back':
- from browser_use.browser.events import GoBackEvent
-
- await bs.event_bus.dispatch(GoBackEvent())
+ await actions.go_back()
return {'back': True}
elif action == 'screenshot':
@@ -161,59 +156,133 @@ async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> A
return {'screenshot': base64.b64encode(data).decode(), 'size': len(data)}
elif action == 'state':
- # Return the same LLM representation that browser-use agents see
- state_text = await bs.get_state_as_text()
+ state = await actions.get_state()
+ assert state.dom_state is not None
+ state_text = state.dom_state.llm_representation()
+
+ # Prepend viewport dimensions
+ if state.page_info:
+ pi = state.page_info
+ viewport_text = f'viewport: {pi.viewport_width}x{pi.viewport_height}\n'
+ viewport_text += f'page: {pi.page_width}x{pi.page_height}\n'
+ viewport_text += f'scroll: ({pi.scroll_x}, {pi.scroll_y})\n'
+ state_text = viewport_text + state_text
+
+ # Append auto-dismissed popup messages
+ if bs._closed_popup_messages:
+ state_text += '\nAuto-closed dialogs:\n'
+ for msg in bs._closed_popup_messages:
+ state_text += f' {msg}\n'
+ bs._closed_popup_messages.clear()
+
return {'_raw_text': state_text}
- elif action == 'switch':
- from browser_use.browser.events import SwitchTabEvent
+ elif action == 'tab':
+ tab_command = params.get('tab_command')
- tab_index = params['tab']
- # Get target_id from tab index
- page_targets = bs.session_manager.get_all_page_targets() if bs.session_manager else []
- if tab_index < 0 or tab_index >= len(page_targets):
- return {'error': f'Invalid tab index {tab_index}. Available: 0-{len(page_targets) - 1}'}
- target_id = page_targets[tab_index].target_id
- await bs.event_bus.dispatch(SwitchTabEvent(target_id=target_id))
- return {'switched': tab_index}
+ if tab_command == 'list':
+ page_targets = bs.session_manager.get_all_page_targets() if bs.session_manager else []
+ lines = ['TAB URL']
+ for i, t in enumerate(page_targets):
+ lines.append(f'{i:<4} {t.url}')
+ return {'_raw_text': '\n'.join(lines)}
- elif action == 'close-tab':
- from browser_use.browser.events import CloseTabEvent
+ elif tab_command == 'new':
+ url = params.get('url', 'about:blank')
+ target_id = await bs._cdp_create_new_page(url, background=True)
+ bs.agent_focus_target_id = target_id
+ return {'created': target_id[:8], 'url': url}
- tab_index = params.get('tab')
- # Get target_id from tab index
- page_targets = bs.session_manager.get_all_page_targets() if bs.session_manager else []
- if tab_index is not None:
+ elif tab_command == 'switch':
+ tab_index = params['tab']
+ page_targets = bs.session_manager.get_all_page_targets() if bs.session_manager else []
if tab_index < 0 or tab_index >= len(page_targets):
return {'error': f'Invalid tab index {tab_index}. Available: 0-{len(page_targets) - 1}'}
- target_id = page_targets[tab_index].target_id
- else:
- # Close current/focused tab
- target_id = bs.session_manager.get_focused_target().target_id if bs.session_manager else None
- if not target_id:
- return {'error': 'No focused tab to close'}
- await bs.event_bus.dispatch(CloseTabEvent(target_id=target_id))
- return {'closed': tab_index}
+ bs.agent_focus_target_id = page_targets[tab_index].target_id
+ return {'switched': tab_index}
+
+ elif tab_command == 'close':
+ tab_indices = params.get('tabs', [])
+
+ page_targets = bs.session_manager.get_all_page_targets() if bs.session_manager else []
+
+ async def _close_target(tid: str) -> None:
+ cdp_session = await bs.get_or_create_cdp_session(target_id=None, focus=False)
+ if cdp_session:
+ await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': tid})
+
+ if not tab_indices:
+ # Use caller's logical focus, not Chrome's global focus
+ target_id = bs.agent_focus_target_id
+ if not target_id:
+ target_id = bs.session_manager.get_focused_target().target_id if bs.session_manager else None
+ if not target_id:
+ return {'error': 'No focused tab to close'}
+ await _close_target(target_id)
+ return {'closed': [0]}
+
+ closed = []
+ errors = []
+ for idx in sorted(tab_indices, reverse=True):
+ if idx < 0 or idx >= len(page_targets):
+ errors.append(f'Tab {idx} out of range')
+ continue
+ try:
+ await _close_target(page_targets[idx].target_id)
+ closed.append(idx)
+ except Exception as e:
+ errors.append(f'Tab {idx}: {e}')
+ result: dict[str, Any] = {'closed': closed}
+ if errors:
+ result['errors'] = errors
+ return result
+
+ return {'error': 'Invalid tab command. Use: list, new, switch, close'}
elif action == 'keys':
- from browser_use.browser.events import SendKeysEvent
-
keys = params['keys']
- await bs.event_bus.dispatch(SendKeysEvent(keys=keys))
+ await actions.send_keys(keys)
return {'sent': keys}
elif action == 'select':
- from browser_use.browser.events import SelectDropdownOptionEvent
-
index = params['index']
value = params['value']
- # Look up node from selector map
node = await bs.get_element_by_index(index)
if node is None:
return {'error': f'Element index {index} not found - page may have changed'}
- await bs.event_bus.dispatch(SelectDropdownOptionEvent(node=node, text=value))
+ await actions.select_dropdown(node, value)
return {'selected': value, 'element': index}
+ elif action == 'upload':
+ index = params['index']
+ file_path = params['path']
+
+ p = Path(file_path)
+ if not p.exists():
+ return {'error': f'File not found: {file_path}'}
+ if not p.is_file():
+ return {'error': f'Not a file: {file_path}'}
+ if p.stat().st_size == 0:
+ return {'error': f'File is empty (0 bytes): {file_path}'}
+
+ node = await bs.get_element_by_index(index)
+ if node is None:
+ return {'error': f'Element index {index} not found - page may have changed'}
+
+ file_input_node = bs.find_file_input_near_element(node)
+
+ if file_input_node is None:
+ selector_map = await bs.get_selector_map()
+ file_input_indices = [idx for idx, el in selector_map.items() if bs.is_file_input(el)]
+ if file_input_indices:
+ hint = f' File input(s) found at index: {", ".join(map(str, file_input_indices))}'
+ else:
+ hint = ' No file input found on the page.'
+ return {'error': f'Element {index} is not a file input.{hint}'}
+
+ await actions.upload_file(file_input_node, file_path)
+ return {'uploaded': file_path, 'element': index}
+
elif action == 'eval':
js = params['js']
# Execute JavaScript via CDP
@@ -224,7 +293,7 @@ async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> A
query = params['query']
# This requires LLM integration
# For now, return a placeholder
- return {'query': query, 'error': 'extract requires agent mode - use: browser-use run "extract ..."'}
+ return {'query': query, 'error': 'extract is not yet implemented'}
elif action == 'hover':
index = params['index']
@@ -473,7 +542,7 @@ async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> A
]
file_path = Path(params['file'])
- file_path.write_text(json.dumps(cookie_list, indent=2))
+ file_path.write_text(json.dumps(cookie_list, indent=2, ensure_ascii=False), encoding='utf-8')
return {'exported': len(cookie_list), 'file': str(file_path)}
elif cookies_command == 'import':
diff --git a/browser_use/skill_cli/commands/cloud.py b/browser_use/skill_cli/commands/cloud.py
new file mode 100644
index 000000000..198d0bc1c
--- /dev/null
+++ b/browser_use/skill_cli/commands/cloud.py
@@ -0,0 +1,694 @@
+"""Cloud API command — generic REST passthrough to Browser-Use Cloud.
+
+Stdlib only. No async, no SDK, no heavy imports.
+
+Usage:
+ browser-use cloud login
+ browser-use cloud logout
+ browser-use cloud v2 GET /browsers
+ browser-use cloud v2 POST /tasks '{"task":"...","url":"https://..."}'
+ browser-use cloud v2 poll
+ browser-use cloud v2 --help
+"""
+
+import json
+import os
+import sys
+import time
+import typing
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+_DEFAULT_BASE_URL = 'https://api.browser-use.com'
+_AUTH_HEADER = 'X-Browser-Use-API-Key'
+
+
+def _get_base() -> str:
+ """Get the API host URL. All paths are appended by callers."""
+ return os.environ.get('BROWSER_USE_CLOUD_BASE_URL', _DEFAULT_BASE_URL).rstrip('/')
+
+
+def _base_url(version: str) -> str:
+ """Get versioned API URL: {base}/api/{version}"""
+ per_version = os.environ.get(f'BROWSER_USE_CLOUD_BASE_URL_{version.upper()}')
+ if per_version:
+ return per_version
+ return f'{_get_base()}/api/{version}'
+
+
+def _spec_url(version: str) -> str:
+ per_version = os.environ.get(f'BROWSER_USE_OPENAPI_SPEC_URL_{version.upper()}')
+ if per_version:
+ return per_version
+ return f'{_get_base()}/api/{version}/openapi.json'
+
+
+# ---------------------------------------------------------------------------
+# API key persistence
+# ---------------------------------------------------------------------------
+
+
+def _get_config_path() -> Path:
+ from browser_use.skill_cli.utils import get_config_path
+
+ return get_config_path()
+
+
+def _read_config() -> dict:
+ from browser_use.skill_cli.config import read_config
+
+ return read_config()
+
+
+def _write_config(data: dict) -> None:
+ from browser_use.skill_cli.config import write_config
+
+ write_config(data)
+
+
+def _get_api_key_or_none() -> str | None:
+ """Return API key from CLI config file, or None if not found."""
+ from browser_use.skill_cli.config import get_config_value
+
+ val = get_config_value('api_key')
+ return str(val) if val is not None else None
+
+
+def _get_api_key() -> str:
+ """Return API key from config file. Exits with error if missing."""
+ key = _get_api_key_or_none()
+ if key:
+ return key
+
+ print('Error: No API key found.', file=sys.stderr)
+ if os.environ.get('BROWSER_USE_API_KEY'):
+ print(' Note: BROWSER_USE_API_KEY env var is set but not used by the CLI.', file=sys.stderr)
+ print(' Run: browser-use config set api_key "$BROWSER_USE_API_KEY"', file=sys.stderr)
+ else:
+ print(
+ 'Already have an account? Get a key at: https://cloud.browser-use.com/settings?tab=api-keys&new=1&utm_source=oss&utm_medium=cli',
+ file=sys.stderr,
+ )
+ print(' Then run: browser-use cloud login <api-key>', file=sys.stderr)
+ print('No account? Run: browser-use cloud signup', file=sys.stderr)
+ print(' This creates an agent account you can claim later with: browser-use cloud signup --claim', file=sys.stderr)
+ sys.exit(1)
+
+
+def _create_cloud_profile_inner(api_key: str) -> str:
+ """Create a new cloud profile and save to config. Returns profile ID.
+
+ Raises RuntimeError on failure — safe to call from daemon context.
+ """
+ body = json.dumps({'name': 'Browser Use CLI'}).encode()
+ status, resp = _http_request('POST', f'{_base_url("v2")}/profiles', body, api_key)
+ if status >= 400:
+ raise RuntimeError(f'Error creating cloud profile: HTTP {status} — {resp}')
+
+ try:
+ data = json.loads(resp)
+ new_id = data['id']
+ except (json.JSONDecodeError, KeyError, TypeError):
+ raise RuntimeError(f'Unexpected response from cloud API: {resp}')
+
+ config = _read_config()
+ config['cloud_connect_profile_id'] = new_id
+ _write_config(config)
+ return new_id
+
+
+def _create_cloud_profile() -> str:
+ """Create a new cloud profile and save to config. Returns profile ID.
+
+ CLI entry point — exits on error.
+ """
+ api_key = _get_api_key()
+ try:
+ return _create_cloud_profile_inner(api_key)
+ except RuntimeError as e:
+ print(str(e), file=sys.stderr)
+ sys.exit(1)
+
+
+def _get_or_create_cloud_profile() -> str:
+ """Return cloud profile ID from config, creating one if missing. No validation HTTP call."""
+ config = _read_config()
+ profile_id = config.get('cloud_connect_profile_id')
+ if profile_id:
+ return profile_id
+ return _create_cloud_profile()
+
+
+def _get_cloud_connect_proxy() -> str | None:
+ """Return the cloud connect proxy country code from config."""
+ from browser_use.skill_cli.config import get_config_value
+
+ val = get_config_value('cloud_connect_proxy')
+ return str(val) if val is not None else None
+
+
+def _get_cloud_connect_timeout() -> int | None:
+ """Return the cloud connect timeout (minutes) from config."""
+ from browser_use.skill_cli.config import get_config_value
+
+ val = get_config_value('cloud_connect_timeout')
+ return int(val) if val is not None else None
+
+
+def _save_api_key(key: str) -> None:
+ config = _read_config()
+ config['api_key'] = key
+ _write_config(config)
+
+
+def _remove_api_key() -> bool:
+ config = _read_config()
+ if 'api_key' not in config:
+ return False
+ del config['api_key']
+ path = _get_config_path()
+ if config:
+ _write_config(config)
+ else:
+ path.unlink(missing_ok=True)
+ return True
+
+
+# ---------------------------------------------------------------------------
+# HTTP helpers
+# ---------------------------------------------------------------------------
+
+
+def _http_request(method: str, url: str, body: bytes | None, api_key: str, timeout: float = 30.0) -> tuple[int, bytes]:
+ """Fire an HTTP request. Returns (status_code, response_body)."""
+ headers = {_AUTH_HEADER: api_key}
+ if body is not None:
+ headers['Content-Type'] = 'application/json'
+
+ req = urllib.request.Request(url, data=body, headers=headers, method=method.upper())
+ try:
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
+ return resp.status, resp.read()
+ except urllib.error.HTTPError as e:
+ return e.code, e.read()
+ except urllib.error.URLError as e:
+ print(f'Error: {e.reason}', file=sys.stderr)
+ sys.exit(1)
+
+
+def _print_json(data: bytes, file: typing.TextIO | None = None) -> None:
+ """Pretty-print JSON, raw fallback."""
+ out = file or sys.stdout
+ try:
+ parsed = json.loads(data)
+ print(json.dumps(parsed, indent=2), file=out)
+ except (json.JSONDecodeError, ValueError):
+ buf = out.buffer if hasattr(out, 'buffer') else sys.stdout.buffer
+ buf.write(data)
+ buf.write(b'\n')
+ buf.flush()
+
+
+# ---------------------------------------------------------------------------
+# OpenAPI help
+# ---------------------------------------------------------------------------
+
+
+def _fetch_spec(version: str) -> bytes | None:
+ url = _spec_url(version)
+ try:
+ req = urllib.request.Request(url)
+ with urllib.request.urlopen(req, timeout=5) as resp:
+ return resp.read()
+ except Exception:
+ return None
+
+
+def _example_value(prop: dict, schemas: dict) -> object:
+ """Generate a placeholder value for an OpenAPI property."""
+ if '$ref' in prop:
+ ref_name = prop['$ref'].rsplit('/', 1)[-1]
+ if ref_name in schemas:
+ return _generate_body_example_dict(ref_name, schemas)
+ return {}
+
+ t = prop.get('type', 'string')
+ fmt = prop.get('format', '')
+ enum = prop.get('enum')
+
+ if enum:
+ return enum[0]
+ if t == 'string':
+ if fmt == 'uri' or fmt == 'url':
+ return 'https://example.com'
+ if fmt == 'date-time':
+ return '2025-01-01T00:00:00Z'
+ if 'email' in fmt:
+ return 'user@example.com'
+ return '...'
+ if t == 'integer':
+ return 0
+ if t == 'number':
+ return 0.0
+ if t == 'boolean':
+ return False
+ if t == 'array':
+ items = prop.get('items', {})
+ return [_example_value(items, schemas)]
+ if t == 'object':
+ props = prop.get('properties', {})
+ return {k: _example_value(v, schemas) for k, v in props.items()}
+ return '...'
+
+
+def _generate_body_example_dict(ref_name: str, schemas: dict) -> dict:
+ """Build a compact example dict from a $ref schema."""
+ schema = schemas.get(ref_name, {})
+ props = schema.get('properties', {})
+ required = set(schema.get('required', []))
+
+ result = {}
+ # Required fields first, then sorted optional
+ for key in sorted(props, key=lambda k: (k not in required, k)):
+ result[key] = _example_value(props[key], schemas)
+ return result
+
+
+def _generate_body_example(ref: str, schemas: dict) -> str:
+ """Return compact JSON string for a $ref."""
+ ref_name = ref.rsplit('/', 1)[-1]
+ obj = _generate_body_example_dict(ref_name, schemas)
+ return json.dumps(obj, separators=(',', ':'))
+
+
+def _find_body_ref(spec: dict, method: str, path: str) -> str | None:
+ """Find the $ref for request body of a given method+path in spec."""
+ paths = spec.get('paths', {})
+ path_obj = paths.get(path, {})
+ method_obj = path_obj.get(method.lower(), {})
+ body = method_obj.get('requestBody', {})
+ content = body.get('content', {})
+ json_media = content.get('application/json', {})
+ schema = json_media.get('schema', {})
+ return schema.get('$ref')
+
+
+def _match_path(spec_path: str, req_path: str) -> bool:
+ """Match an OpenAPI template path against a concrete path.
+
+ E.g. /tasks/{task_id} matches /tasks/abc123
+ """
+ spec_parts = spec_path.strip('/').split('/')
+ req_parts = req_path.strip('/').split('/')
+ if len(spec_parts) != len(req_parts):
+ return False
+ for sp, rp in zip(spec_parts, req_parts):
+ if sp.startswith('{') and sp.endswith('}'):
+ continue
+ if sp != rp:
+ return False
+ return True
+
+
+def _find_body_example(spec: dict, method: str, path: str) -> str | None:
+ """Find a body example for the given method+path, using template matching."""
+ schemas = spec.get('components', {}).get('schemas', {})
+ paths = spec.get('paths', {})
+
+ for spec_path in paths:
+ if _match_path(spec_path, path):
+ ref = _find_body_ref(spec, method, spec_path)
+ if ref:
+ return _generate_body_example(ref, schemas)
+ return None
+
+
+def _format_openapi_help(spec_data: bytes) -> str:
+ """Parse OpenAPI spec and render grouped endpoints."""
+ try:
+ spec = json.loads(spec_data)
+ except (json.JSONDecodeError, ValueError):
+ return ''
+
+ paths = spec.get('paths', {})
+ schemas = spec.get('components', {}).get('schemas', {})
+ info = spec.get('info', {})
+
+ lines: list[str] = []
+ title = info.get('title', 'API')
+ version = info.get('version', '')
+ lines.append(f'{title} {version}'.strip())
+ lines.append('')
+
+ # Group by tag
+ groups: dict[str, list[str]] = {}
+ for path, methods in sorted(paths.items()):
+ for method, details in sorted(methods.items()):
+ if method in ('parameters', 'summary', 'description'):
+ continue
+ tags = details.get('tags', ['Other'])
+ tag = tags[0] if tags else 'Other'
+ summary = details.get('summary', '')
+
+ # Build endpoint line
+ parts = [f' {method.upper():6s} {path}']
+ if summary:
+ parts.append(f' # {summary}')
+
+ # Parameters
+ params = details.get('parameters', [])
+ param_strs = []
+ for p in params:
+ name = p.get('name', '')
+ required = p.get('required', False)
+ marker = '*' if required else ''
+ param_strs.append(f'{name}{marker}')
+ if param_strs:
+ parts.append(f' params: {", ".join(param_strs)}')
+
+ # Body example
+ body_ref = _find_body_ref(spec, method, path)
+ if body_ref:
+ example = _generate_body_example(body_ref, schemas)
+ parts.append(f" body: '{example}'")
+
+ groups.setdefault(tag, []).append('\n'.join(parts) if len(parts) > 1 else parts[0])
+
+ for tag, endpoints in sorted(groups.items()):
+ lines.append(f'[{tag}]')
+ for ep in endpoints:
+ lines.append(ep)
+ lines.append('')
+
+ return '\n'.join(lines)
+
+
+def _static_help(version: str) -> str:
+ """Fallback help when OpenAPI spec is unavailable."""
+ return f"""Browser-Use Cloud API {version}
+
+Usage:
+ browser-use cloud {version} <METHOD> <path> [body]
+ browser-use cloud {version} poll <task-id>
+
+Examples:
+ browser-use cloud {version} GET /browsers
+ browser-use cloud {version} POST /tasks '{{"task":"Search for AI news","url":"https://google.com"}}'
+ browser-use cloud {version} GET /tasks/<task-id>
+ browser-use cloud {version} poll <task-id>
+
+(Could not fetch OpenAPI spec for live endpoint listing)
+"""
+
+
+# ---------------------------------------------------------------------------
+# Command handlers
+# ---------------------------------------------------------------------------
+
+
+def _cloud_login(argv: list[str]) -> int:
+ if not argv:
+ print('Usage: browser-use cloud login <api-key>', file=sys.stderr)
+ return 1
+
+ key = argv[0]
+ _save_api_key(key)
+ print('API key saved')
+ return 0
+
+
+def _cloud_logout() -> int:
+ if _remove_api_key():
+ print('API key removed')
+ else:
+ print('No API key to remove')
+ return 0
+
+
+def _cloud_rest(argv: list[str], version: str) -> int:
+ """Generic REST passthrough."""
+ if len(argv) < 2:
+ print(f'Usage: browser-use cloud {version} <METHOD> <path> [body]', file=sys.stderr)
+ return 1
+
+ method = argv[0].upper()
+ path = argv[1]
+ body_str = argv[2] if len(argv) > 2 else None
+
+ # Normalize path
+ if not path.startswith('/'):
+ path = '/' + path
+
+ url = f'{_base_url(version)}{path}'
+ api_key = _get_api_key()
+
+ body = body_str.encode() if body_str else None
+ status, resp_body = _http_request(method, url, body, api_key)
+
+ if 400 <= status < 500:
+ print(f'HTTP {status}', file=sys.stderr)
+ _print_json(resp_body, file=sys.stderr)
+
+ # Try to suggest correct body from spec
+ spec_data = _fetch_spec(version)
+ if spec_data:
+ try:
+ spec = json.loads(spec_data)
+ example = _find_body_example(spec, method, path)
+ if example:
+ print(f"\nExpected body: '{example}'", file=sys.stderr)
+ except (json.JSONDecodeError, ValueError):
+ pass
+ return 2
+
+ if status >= 500:
+ print(f'HTTP {status}', file=sys.stderr)
+ _print_json(resp_body, file=sys.stderr)
+ return 1
+
+ _print_json(resp_body)
+ return 0
+
+
+def _cloud_poll(argv: list[str], version: str) -> int:
+ """Poll GET /tasks/ until done."""
+ if not argv:
+ print(f'Usage: browser-use cloud {version} poll ', file=sys.stderr)
+ return 1
+
+ task_id = argv[0]
+ url = f'{_base_url(version)}/tasks/{task_id}'
+ api_key = _get_api_key()
+
+ while True:
+ status_code, resp_body = _http_request('GET', url, None, api_key)
+
+ if status_code >= 400:
+ print(f'\nHTTP {status_code}', file=sys.stderr)
+ _print_json(resp_body, file=sys.stderr)
+ return 2
+
+ try:
+ data = json.loads(resp_body)
+ except (json.JSONDecodeError, ValueError):
+ print('\nError: invalid JSON response', file=sys.stderr)
+ return 1
+
+ task_status = data.get('status', 'unknown')
+ cost = data.get('cost', 0)
+ print(f'\rstatus: {task_status} cost: ${cost:.4f}', end='', file=sys.stderr, flush=True)
+
+ if task_status == 'finished':
+ print('', file=sys.stderr) # newline
+ _print_json(resp_body)
+ return 0
+
+ if task_status == 'failed':
+ print('', file=sys.stderr)
+ _print_json(resp_body, file=sys.stderr)
+ return 2
+
+ time.sleep(2)
+
+
+def _cloud_help(version: str) -> int:
+ """Show OpenAPI-driven help for a version."""
+ spec_data = _fetch_spec(version)
+ if spec_data:
+ formatted = _format_openapi_help(spec_data)
+ if formatted:
+ print(formatted)
+ return 0
+
+ print(_static_help(version))
+ return 0
+
+
+def _cloud_versioned(argv: list[str], version: str) -> int:
+ """Route versioned subcommands: poll, help, or REST passthrough."""
+ if not argv:
+ return _cloud_help(version)
+
+ first = argv[0]
+
+ if first in ('--help', 'help', '-h'):
+ return _cloud_help(version)
+
+ if first == 'poll':
+ return _cloud_poll(argv[1:], version)
+
+ # REST passthrough: METHOD path [body]
+ return _cloud_rest(argv, version)
+
+
+# ---------------------------------------------------------------------------
+# Signup (agent self-registration)
+# ---------------------------------------------------------------------------
+
+
+def _signup_challenge() -> int:
+ """Request a signup challenge."""
+ if _get_api_key_or_none():
+ print('You already have an API key configured.', file=sys.stderr)
+ print('Run `browser-use cloud signup --claim` to claim your account.', file=sys.stderr)
+ return 1
+
+ body = json.dumps({}).encode()
+ status, resp = _http_request('POST', f'{_get_base()}/cloud/signup', body, api_key='')
+ if status >= 400:
+ print(f'Error: HTTP {status}', file=sys.stderr)
+ _print_json(resp, file=sys.stderr)
+ return 1
+
+ try:
+ data = json.loads(resp)
+ except (json.JSONDecodeError, ValueError):
+ print('Error: invalid response', file=sys.stderr)
+ return 1
+
+ print(f'Challenge ID: {data["challenge_id"]}')
+ print(f'Challenge: {data["challenge_text"]}')
+ print()
+ print('Verify to create your agent account:')
+ print(' browser-use cloud signup --verify <challenge-id> <answer>')
+ return 0
+
+
+def _signup_verify(challenge_id: str, answer: str) -> int:
+ """Verify a signup challenge and save the API key."""
+ if _get_api_key_or_none():
+ print('You already have an API key configured.', file=sys.stderr)
+ print('Run `browser-use cloud signup --claim` to claim your account.', file=sys.stderr)
+ return 1
+
+ body = json.dumps({'challenge_id': challenge_id, 'answer': answer}).encode()
+ status, resp = _http_request('POST', f'{_get_base()}/cloud/signup/verify', body, api_key='')
+ if status >= 400:
+ print(f'Error: HTTP {status}', file=sys.stderr)
+ _print_json(resp, file=sys.stderr)
+ return 1
+
+ try:
+ data = json.loads(resp)
+ except (json.JSONDecodeError, ValueError):
+ print('Error: invalid response', file=sys.stderr)
+ return 1
+
+ _save_api_key(data['api_key'])
+ print('API key saved')
+ return 0
+
+
+def _signup_claim() -> int:
+ """Generate a claim URL for the current API key."""
+ api_key = _get_api_key()
+ status, resp = _http_request('POST', f'{_get_base()}/cloud/signup/claim', None, api_key)
+ if status >= 400:
+ print(f'Error: HTTP {status}', file=sys.stderr)
+ _print_json(resp, file=sys.stderr)
+ return 1
+
+ try:
+ data = json.loads(resp)
+ except (json.JSONDecodeError, ValueError):
+ print('Error: invalid response', file=sys.stderr)
+ return 1
+
+ print(f'Claim URL: {data["claim_url"]}')
+ print('Share this URL with a human to claim ownership of this account.')
+ return 0
+
+
+# ---------------------------------------------------------------------------
+# Main dispatcher
+# ---------------------------------------------------------------------------
+
+
+def handle_cloud_command(argv: list[str]) -> int:
+ """Main dispatcher for `browser-use cloud ...`."""
+ if not argv:
+ _print_cloud_usage()
+ return 1
+
+ subcmd = argv[0]
+
+ if subcmd == 'login':
+ return _cloud_login(argv[1:])
+
+ if subcmd == 'logout':
+ return _cloud_logout()
+
+ if subcmd in ('v2', 'v3'):
+ return _cloud_versioned(argv[1:], subcmd)
+
+ if subcmd == 'signup':
+ if '--verify' in argv:
+ idx = argv.index('--verify')
+ if idx + 2 >= len(argv):
+ print('Usage: browser-use cloud signup --verify <challenge-id> <answer>', file=sys.stderr)
+ return 1
+ return _signup_verify(argv[idx + 1], argv[idx + 2])
+ if '--claim' in argv:
+ return _signup_claim()
+ return _signup_challenge()
+
+ if subcmd == 'connect':
+ # Normally intercepted by main.py before reaching here
+ print('Error: cloud connect must be run via the main CLI (browser-use cloud connect)', file=sys.stderr)
+ return 1
+
+ if subcmd in ('--help', 'help', '-h'):
+ _print_cloud_usage()
+ return 0
+
+ print(f'Unknown cloud subcommand: {subcmd}', file=sys.stderr)
+ _print_cloud_usage()
+ return 1
+
+
+def _print_cloud_usage() -> None:
+ print('Usage: browser-use cloud <command> [args]')
+ print()
+ print('Commands:')
+ print(' connect Provision cloud browser and connect')
+ print(' signup Create an agent account (challenge-response)')
+ print(' signup --verify <challenge-id> <answer> Verify challenge and save API key')
+ print(' signup --claim Generate URL to claim your agent account')
+ print(' login <api-key> Save API key')
+ print(' logout Remove API key')
+ print(' v2 <METHOD> <path> [body] REST passthrough (API v2)')
+ print(' v3 <METHOD> <path> [body] REST passthrough (API v3)')
+ print(' v2 poll <task-id> Poll task until done')
+ print(' v2 --help Show API v2 endpoints')
+ print(' v3 --help Show API v3 endpoints')
+ print()
+ print('Examples:')
+ print(' browser-use cloud login sk-abc123...')
+ print(' browser-use cloud v2 GET /browsers')
+ print(' browser-use cloud v2 POST /tasks \'{"task":"...","url":"https://..."}\'')
+ print(' browser-use cloud v2 poll <task-id>')
diff --git a/browser_use/skill_cli/commands/cloud_session.py b/browser_use/skill_cli/commands/cloud_session.py
deleted file mode 100644
index 1faf9add3..000000000
--- a/browser_use/skill_cli/commands/cloud_session.py
+++ /dev/null
@@ -1,423 +0,0 @@
-"""Cloud session SDK wrappers and CLI handlers.
-
-This module provides:
-- SDK wrapper functions for the Browser-Use Cloud Session API
-- CLI command handlers for `browser-use session `
-"""
-
-import argparse
-import json
-import logging
-import sys
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any
-
-from browser_use_sdk.types.session_item_view import SessionItemView
-from browser_use_sdk.types.session_view import SessionView
-from browser_use_sdk.types.share_view import ShareView
-
-from browser_use.skill_cli.commands.utils import format_duration, get_sdk_client
-
-logger = logging.getLogger(__name__)
-
-
-# ============ SDK Wrappers ============
-
-
-def create_session(**kwargs: Any) -> SessionItemView:
- """Create a cloud browser session.
-
- Args:
- profile_id: Cloud profile ID for persistent auth/cookies
- proxy_country: Proxy country code (us, gb, de, etc.)
- keep_alive: Keep session alive after task completes
- persist_memory: Share memory between tasks in session
- start_url: URL to navigate to when session starts
- screen_width: Browser screen width in pixels
- screen_height: Browser screen height in pixels
-
- Returns:
- SessionItemView with session details
- """
- # Map our param names to SDK param names
- param_map = {
- 'proxy_country': 'proxy_country_code',
- 'screen_width': 'browser_screen_width',
- 'screen_height': 'browser_screen_height',
- }
- params = {}
- for k, v in kwargs.items():
- if v is not None:
- params[param_map.get(k, k)] = v
-
- return get_sdk_client().sessions.create_session(**params)
-
-
-def list_sessions(limit: int = 10, status: str | None = None) -> list[SessionItemView]:
- """List cloud browser sessions."""
- client = get_sdk_client()
- response = client.sessions.list_sessions(
- page_size=min(limit, 100),
- filter_by=status,
- )
- return list(response.items) if response.items else []
-
-
-def get_session(session_id: str) -> SessionView:
- """Get details of a specific session."""
- return get_sdk_client().sessions.get_session(session_id)
-
-
-def stop_session(session_id: str) -> SessionView:
- """Stop a cloud session."""
- return get_sdk_client().sessions.update_session(session_id, action='stop')
-
-
-def delete_session(session_id: str) -> None:
- """Delete a cloud session and all its tasks."""
- get_sdk_client().sessions.delete_session(session_id)
-
-
-def create_public_share(session_id: str) -> ShareView:
- """Create a public share URL for a session."""
- return get_sdk_client().sessions.create_session_public_share(session_id)
-
-
-def delete_public_share(session_id: str) -> None:
- """Delete the public share for a session."""
- get_sdk_client().sessions.delete_session_public_share(session_id)
-
-
-def stop_sessions_parallel(session_ids: list[str]) -> tuple[list[str], list[dict[str, Any]]]:
- """Stop multiple cloud sessions in parallel."""
- client = get_sdk_client()
- stopped: list[str] = []
- errors: list[dict[str, Any]] = []
-
- def stop_one(sid: str) -> tuple[str, str | None]:
- try:
- client.sessions.update_session(sid, action='stop')
- return (sid, None)
- except Exception as e:
- return (sid, str(e))
-
- with ThreadPoolExecutor(max_workers=10) as executor:
- futures = {executor.submit(stop_one, sid): sid for sid in session_ids}
- for future in as_completed(futures):
- sid, error = future.result()
- if error:
- errors.append({'id': sid, 'error': error})
- else:
- stopped.append(sid)
-
- return stopped, errors
-
-
-# ============ CLI Handlers ============
-
-
-def handle_session_command(args: argparse.Namespace) -> int:
- """Handle session subcommands.
-
- Session commands manage cloud sessions and always require the cloud API.
-
- Args:
- args: Parsed command-line arguments
-
- Returns:
- Exit code (0 for success, 1 for error)
- """
- from browser_use.skill_cli.api_key import APIKeyRequired, require_api_key
- from browser_use.skill_cli.install_config import is_mode_available
-
- # Check if remote mode is available
- if not is_mode_available('remote'):
- print(
- 'Error: Session management requires remote mode.\n'
- 'Remote mode is not installed. Reinstall to enable:\n'
- ' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only\n'
- ' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full',
- file=sys.stderr,
- )
- return 1
-
- # Check API key
- try:
- require_api_key('Cloud sessions')
- except APIKeyRequired as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if args.session_command == 'list':
- return _handle_list(args)
- elif args.session_command == 'get':
- return _handle_get(args)
- elif args.session_command == 'stop':
- return _handle_stop(args)
- elif args.session_command == 'create':
- return _handle_create(args)
- elif args.session_command == 'share':
- return _handle_share(args)
- else:
- print('Usage: browser-use session ')
- print('Commands: list, get , stop , create, share ')
- return 1
-
-
-# ============ CLI Helper Functions ============
-
-
-def _session_to_dict(session: Any) -> dict[str, Any]:
- """Convert SDK session object to dict for JSON output."""
- return {
- 'id': session.id,
- 'status': session.status,
- 'liveUrl': session.live_url,
- 'startedAt': session.started_at.isoformat() if session.started_at else None,
- 'finishedAt': session.finished_at.isoformat() if session.finished_at else None,
- 'keepAlive': session.keep_alive,
- 'persistMemory': getattr(session, 'persist_memory', None),
- 'proxyCost': session.proxy_cost,
- 'publicShareUrl': getattr(session, 'public_share_url', None),
- }
-
-
-def _handle_list(args: argparse.Namespace) -> int:
- """Handle 'session list' command."""
- try:
- status_filter = getattr(args, 'status', None)
- sessions = list_sessions(limit=args.limit, status=status_filter)
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps([_session_to_dict(s) for s in sessions]))
- else:
- if not sessions:
- status_msg = f' with status "{status_filter}"' if status_filter else ''
- print(f'No sessions found{status_msg}')
- else:
- header = f'Sessions ({len(sessions)})'
- if status_filter:
- header = f'{status_filter.capitalize()} sessions ({len(sessions)})'
- print(f'{header}:')
- for s in sessions:
- session_id = s.id or 'unknown'
- status = s.status or 'unknown'
- live_url = s.live_url
- started_at = s.started_at
- finished_at = s.finished_at
- keep_alive = '🔄' if s.keep_alive else ''
-
- # Status emoji
- status_emoji = {
- 'active': '🟢',
- 'stopped': '⏹️',
- }.get(status, '❓')
-
- # Truncate ID for display
- short_id = session_id[:8] + '...' if len(session_id) > 8 else session_id
-
- # Build line with duration
- duration = format_duration(started_at, finished_at)
- line = f' {status_emoji} {short_id} [{status}]'
- if duration:
- line += f' {duration}'
- if keep_alive:
- line += f' {keep_alive}'
- if live_url and status == 'active':
- line += f'\n live: {live_url}'
- print(line)
-
- return 0
-
-
-def _handle_get(args: argparse.Namespace) -> int:
- """Handle 'session get ' command."""
- try:
- session = get_session(args.session_id)
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps(_session_to_dict(session)))
- else:
- session_id = session.id or args.session_id
- status = session.status or 'unknown'
- live_url = session.live_url
- started_at = session.started_at
- finished_at = session.finished_at
- keep_alive = session.keep_alive
- proxy_cost = session.proxy_cost
- public_share_url = getattr(session, 'public_share_url', None)
-
- # Status emoji
- status_emoji = {
- 'active': '🟢',
- 'stopped': '⏹️',
- }.get(status, '❓')
-
- # Build header with duration
- duration = format_duration(started_at, finished_at)
- header_parts = [f'{status_emoji} {session_id[:8]}... [{status}]']
- if duration:
- header_parts.append(duration)
- if proxy_cost:
- # Format proxy cost to 2 decimal places
- try:
- cost_val = float(proxy_cost)
- header_parts.append(f'${cost_val:.2f}')
- except (ValueError, TypeError):
- header_parts.append(f'${proxy_cost}')
- print(' '.join(header_parts))
-
- if keep_alive:
- print(' Keep Alive: Yes')
- if live_url:
- print(f' Live URL: {live_url}')
- if public_share_url:
- print(f' Public Share: {public_share_url}')
-
- return 0
-
-
-def _handle_stop(args: argparse.Namespace) -> int:
- """Handle 'session stop ' command."""
- # Handle --all flag
- if getattr(args, 'all', False):
- return _handle_stop_all(args)
-
- try:
- stop_session(args.session_id)
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps({'stopped': args.session_id}))
- else:
- print(f'Stopped session: {args.session_id}')
-
- return 0
-
-
-def _handle_stop_all(args: argparse.Namespace) -> int:
- """Handle 'session stop --all' command."""
- try:
- # Get all active sessions
- sessions = list_sessions(limit=100, status='active')
- except Exception as e:
- print(f'Error listing sessions: {e}', file=sys.stderr)
- return 1
-
- if not sessions:
- print('No active sessions to stop')
- return 0
-
- # Extract session IDs
- session_ids = [s.id for s in sessions if s.id]
-
- if not session_ids:
- print('No active sessions to stop')
- return 0
-
- # Stop all sessions in parallel
- stopped, errors = stop_sessions_parallel(session_ids)
-
- if getattr(args, 'json', False):
- print(json.dumps({'stopped': stopped, 'errors': errors}))
- else:
- if stopped:
- print(f'Stopped {len(stopped)} session(s):')
- for sid in stopped:
- print(f' ✓ {sid[:8]}...')
- if errors:
- print(f'Failed to stop {len(errors)} session(s):')
- for err in errors:
- print(f' ✗ {err["id"][:8]}...: {err["error"]}')
-
- return 0 if not errors else 1
-
-
-def _handle_create(args: argparse.Namespace) -> int:
- """Handle 'session create' command."""
- # Parse screen size if provided
- screen_width = None
- screen_height = None
- if hasattr(args, 'screen_size') and args.screen_size:
- try:
- w, h = args.screen_size.lower().split('x')
- screen_width = int(w)
- screen_height = int(h)
- except ValueError:
- print('Error: Invalid screen size format. Use WxH (e.g., 1920x1080)', file=sys.stderr)
- return 1
-
- try:
- session = create_session(
- profile_id=getattr(args, 'profile', None),
- proxy_country=getattr(args, 'proxy_country', None),
- keep_alive=getattr(args, 'keep_alive', None),
- persist_memory=getattr(args, 'persist_memory', None),
- start_url=getattr(args, 'start_url', None),
- screen_width=screen_width,
- screen_height=screen_height,
- )
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps(_session_to_dict(session)))
- else:
- print(f'Created session: {session.id}')
- if session.live_url:
- print(f' Live URL: {session.live_url}')
-
- return 0
-
-
-def _handle_share(args: argparse.Namespace) -> int:
- """Handle 'session share ' command."""
- session_id = args.session_id
-
- # Delete share if requested
- if getattr(args, 'delete', False):
- try:
- delete_public_share(session_id)
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps({'deleted': session_id}))
- else:
- print(f'Deleted public share for session: {session_id}')
- return 0
-
- # Create share
- try:
- share = create_public_share(session_id)
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(
- json.dumps(
- {
- 'sessionId': session_id,
- 'url': share.share_url,
- 'shareToken': share.share_token,
- 'viewCount': share.view_count,
- }
- )
- )
- else:
- print(f'Public share created for session: {session_id}')
- if share.share_url:
- print(f' URL: {share.share_url}')
-
- return 0
diff --git a/browser_use/skill_cli/commands/cloud_task.py b/browser_use/skill_cli/commands/cloud_task.py
deleted file mode 100644
index d8cac12a6..000000000
--- a/browser_use/skill_cli/commands/cloud_task.py
+++ /dev/null
@@ -1,413 +0,0 @@
-"""Cloud task SDK wrappers and CLI handlers.
-
-This module provides:
-- SDK wrapper functions for the Browser-Use Cloud Task API
-- CLI command handlers for `browser-use task `
-"""
-
-import argparse
-import json
-import logging
-import sys
-from typing import Any
-
-from browser_use_sdk.types.task_created_response import TaskCreatedResponse
-from browser_use_sdk.types.task_item_view import TaskItemView
-from browser_use_sdk.types.task_log_file_response import TaskLogFileResponse
-from browser_use_sdk.types.task_view import TaskView
-
-from browser_use.skill_cli.commands.utils import format_duration, get_sdk_client
-
-logger = logging.getLogger(__name__)
-
-
-def _filter_none(kwargs: dict[str, Any]) -> dict[str, Any]:
- """Filter out None values from kwargs (SDK passes them as null, API rejects)."""
- return {k: v for k, v in kwargs.items() if v is not None}
-
-
-# ============ SDK Wrappers ============
-
-
-def create_task(task: str, **kwargs: Any) -> TaskCreatedResponse:
- """Create a cloud task via API.
-
- Args:
- task: Task description for the agent
- llm: LLM model identifier
- session_id: Existing session ID to use
- max_steps: Maximum agent steps
- flash_mode: Enable flash mode for faster execution
- thinking: Enable extended reasoning mode
- vision: Enable/disable vision
- start_url: URL to start the task from
- metadata: Task metadata key-value pairs
- secrets: Task secrets key-value pairs
- allowed_domains: Restrict navigation to these domains
- skill_ids: Enable specific skill IDs
- structured_output: JSON schema for structured output
- judge: Enable judge mode
- judge_ground_truth: Expected answer for judge evaluation
-
- Returns:
- TaskCreatedResponse with task ID and session ID
- """
- params = _filter_none(kwargs)
- params['task'] = task
- return get_sdk_client().tasks.create_task(**params)
-
-
-def get_task(task_id: str) -> TaskView:
- """Get full task details including steps."""
- return get_sdk_client().tasks.get_task(task_id)
-
-
-def list_tasks(
- limit: int = 10,
- status: str | None = None,
- session_id: str | None = None,
-) -> list[TaskItemView]:
- """List recent tasks."""
- client = get_sdk_client()
- response = client.tasks.list_tasks(
- page_size=limit,
- **_filter_none({'filter_by': status, 'session_id': session_id}),
- )
- return list(response.items) if response.items else []
-
-
-def stop_task(task_id: str) -> TaskView:
- """Stop a running task."""
- return get_sdk_client().tasks.update_task(task_id, action='stop')
-
-
-def get_task_logs(task_id: str) -> TaskLogFileResponse:
- """Get task execution logs."""
- return get_sdk_client().tasks.get_task_logs(task_id)
-
-
-async def poll_until_complete(
- task_id: str,
- stream: bool = False,
- poll_interval: float = 1.0,
-) -> TaskView:
- """Poll task status until finished."""
- import asyncio
-
- client = get_sdk_client()
- last_status = None
-
- while True:
- # Run blocking SDK call in thread to avoid blocking event loop
- task = await asyncio.to_thread(client.tasks.get_task, task_id)
- current_status = task.status
-
- if stream and current_status != last_status:
- print(f'Status: {current_status}')
- last_status = current_status
-
- if current_status in ('finished', 'stopped', 'failed'):
- return task
-
- await asyncio.sleep(poll_interval)
-
-
-# ============ CLI Handlers ============
-
-
-def handle_task_command(args: argparse.Namespace) -> int:
- """Handle task subcommands.
-
- Task commands manage cloud tasks and always require the cloud API.
-
- Args:
- args: Parsed command-line arguments
-
- Returns:
- Exit code (0 for success, 1 for error)
- """
- from browser_use.skill_cli.api_key import APIKeyRequired, require_api_key
- from browser_use.skill_cli.install_config import is_mode_available
-
- # Check if remote mode is available
- if not is_mode_available('remote'):
- print(
- 'Error: Task management requires remote mode.\n'
- 'Remote mode is not installed. Reinstall to enable:\n'
- ' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only\n'
- ' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full',
- file=sys.stderr,
- )
- return 1
-
- # Check API key
- try:
- require_api_key('Cloud tasks')
- except APIKeyRequired as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if args.task_command == 'list':
- return _handle_list(args)
- elif args.task_command == 'status':
- return _handle_status(args)
- elif args.task_command == 'stop':
- return _handle_stop(args)
- elif args.task_command == 'logs':
- return _handle_logs(args)
- else:
- print('Usage: browser-use task ')
- print('Commands: list, status , stop , logs ')
- return 1
-
-
-# ============ CLI Helper Functions ============
-
-
-def _task_item_to_dict(task: Any) -> dict[str, Any]:
- """Convert SDK TaskItemView to dict for JSON output."""
- return {
- 'id': task.id,
- 'status': task.status,
- 'task': task.task,
- 'sessionId': task.session_id,
- }
-
-
-def _task_to_dict(task: Any) -> dict[str, Any]:
- """Convert SDK TaskView to dict for JSON output."""
- return {
- 'id': task.id,
- 'status': task.status,
- 'task': task.task,
- 'output': task.output,
- 'cost': task.cost,
- 'sessionId': task.session_id,
- 'startedAt': task.started_at.isoformat() if task.started_at else None,
- 'finishedAt': task.finished_at.isoformat() if task.finished_at else None,
- 'steps': [_step_to_dict(s) for s in (task.steps or [])],
- }
-
-
-def _step_to_dict(step: Any) -> dict[str, Any]:
- """Convert SDK step to dict for JSON output."""
- return {
- 'number': step.number,
- 'url': step.url,
- 'memory': step.memory,
- 'actions': step.actions,
- }
-
-
-def _handle_list(args: argparse.Namespace) -> int:
- """Handle 'task list' command."""
- try:
- status_filter = getattr(args, 'status', None)
- session_filter = getattr(args, 'session', None)
- tasks = list_tasks(
- limit=args.limit,
- status=status_filter,
- session_id=session_filter,
- )
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps([_task_item_to_dict(t) for t in tasks]))
- else:
- if not tasks:
- status_msg = f' with status "{status_filter}"' if status_filter else ''
- session_msg = f' in session "{session_filter}"' if session_filter else ''
- print(f'No tasks found{status_msg}{session_msg}')
- else:
- header = f'Tasks ({len(tasks)})'
- if status_filter:
- header = f'{status_filter.capitalize()} tasks ({len(tasks)})'
- print(f'{header}:')
- for t in tasks:
- task_id = t.id or 'unknown'
- status = t.status or 'unknown'
- task_desc = t.task or ''
- # Truncate long task descriptions
- if len(task_desc) > 50:
- task_desc = task_desc[:47] + '...'
-
- # Status emoji
- status_emoji = {
- 'started': '🔄',
- 'running': '🔄',
- 'finished': '✅',
- 'stopped': '⏹️',
- 'failed': '❌',
- }.get(status, '❓')
-
- print(f' {status_emoji} {task_id[:8]}... [{status}] {task_desc}')
-
- return 0
-
-
-def _handle_status(args: argparse.Namespace) -> int:
- """Handle 'task status ' command."""
- try:
- # Use get_task() for full details including steps
- task = get_task(args.task_id)
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps(_task_to_dict(task)))
- else:
- task_id = task.id or args.task_id
- task_status = task.status or 'unknown'
- output = task.output
- cost = task.cost
- steps = task.steps or []
- started_at = task.started_at
- finished_at = task.finished_at
-
- compact = getattr(args, 'compact', False)
- verbose = getattr(args, 'verbose', False)
- last_n = getattr(args, 'last', None)
- reverse = getattr(args, 'reverse', False)
- specific_step = getattr(args, 'step', None)
-
- # Determine display mode:
- # - Default: show only latest step
- # - --compact: show all steps with reasoning
- # - --verbose: show all steps with full details
- show_all_steps = compact or verbose
-
- # Status emoji
- status_emoji = {
- 'started': '🔄',
- 'running': '🔄',
- 'finished': '✅',
- 'stopped': '⏹️',
- 'failed': '❌',
- }.get(task_status, '❓')
-
- # Build header line: status, cost, duration
- parts = [f'{status_emoji} {task_id[:8]}... [{task_status}]']
- if cost is not None:
- parts.append(f'${cost}')
- duration = format_duration(started_at, finished_at)
- if duration:
- parts.append(duration)
- print(' '.join(parts))
-
- # Show steps
- if steps:
- total_steps = len(steps)
-
- # Filter to specific step if requested
- if specific_step is not None:
- steps = [s for s in steps if s.number == specific_step]
- if not steps:
- print(f' Step {specific_step} not found (task has {total_steps} steps)')
- else:
- print(f' (showing step {specific_step} of {total_steps})')
- # Display the specific step
- for step in steps:
- _print_step(step, verbose)
- elif not show_all_steps:
- # Default mode: show only the latest step
- latest_step = steps[-1]
- earlier_count = total_steps - 1
- if earlier_count > 0:
- print(f' ... {earlier_count} earlier steps')
- _print_step(latest_step, verbose=False)
- else:
- # --compact or --verbose: show all steps (with optional filters)
- skipped_earlier = 0
- if last_n is not None and last_n < total_steps:
- skipped_earlier = total_steps - last_n
- steps = steps[-last_n:]
-
- # Apply --reverse
- if reverse:
- steps = list(reversed(steps))
-
- # Show count info
- if skipped_earlier > 0:
- print(f' ... {skipped_earlier} earlier steps')
-
- # Display steps
- for step in steps:
- _print_step(step, verbose)
-
- if output:
- print(f'\nOutput: {output}')
-
- return 0
-
-
-def _print_step(step: Any, verbose: bool) -> None:
- """Print a single step in compact or verbose format."""
- step_num = step.number if step.number is not None else '?'
- memory = step.memory or ''
-
- if verbose:
- url = step.url or ''
- actions = step.actions or []
-
- # Truncate URL for display
- short_url = url[:60] + '...' if len(url) > 60 else url
-
- print(f' [{step_num}] {short_url}')
- if memory:
- # Truncate memory/reasoning for display
- short_memory = memory[:100] + '...' if len(memory) > 100 else memory
- print(f' Reasoning: {short_memory}')
- if actions:
- for action in actions[:2]: # Show max 2 actions per step
- # Truncate action for display
- short_action = action[:70] + '...' if len(action) > 70 else action
- print(f' Action: {short_action}')
- if len(actions) > 2:
- print(f' ... and {len(actions) - 2} more actions')
- else:
- # Compact mode: just step number and reasoning
- if memory:
- # Truncate reasoning for compact display
- short_memory = memory[:80] + '...' if len(memory) > 80 else memory
- print(f' {step_num}. {short_memory}')
- else:
- print(f' {step_num}. (no reasoning)')
-
-
-def _handle_stop(args: argparse.Namespace) -> int:
- """Handle 'task stop ' command."""
- try:
- stop_task(args.task_id)
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps({'stopped': args.task_id}))
- else:
- print(f'Stopped task: {args.task_id}')
-
- return 0
-
-
-def _handle_logs(args: argparse.Namespace) -> int:
- """Handle 'task logs ' command."""
- try:
- result = get_task_logs(args.task_id)
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps({'downloadUrl': result.download_url}))
- else:
- download_url = result.download_url
- if download_url:
- print(f'Download logs: {download_url}')
- else:
- print('No logs available for this task')
-
- return 0
diff --git a/browser_use/skill_cli/commands/doctor.py b/browser_use/skill_cli/commands/doctor.py
index b1cca0cf1..bf6a190c9 100644
--- a/browser_use/skill_cli/commands/doctor.py
+++ b/browser_use/skill_cli/commands/doctor.py
@@ -9,8 +9,6 @@ from typing import Any
logger = logging.getLogger(__name__)
-COMMANDS = {'doctor'}
-
async def handle() -> dict[str, Any]:
"""Run health checks and return results."""
@@ -22,14 +20,14 @@ async def handle() -> dict[str, Any]:
# 2. Browser availability
checks['browser'] = _check_browser()
- # 3. API key configuration
- checks['api_key'] = _check_api_key_config()
+ # 3. Network connectivity (basic check)
+ checks['network'] = await _check_network()
- # 4. Cloudflared availability
+ # 4. Optional: cloudflared (for browser-use tunnel)
checks['cloudflared'] = _check_cloudflared()
- # 5. Network connectivity (basic check)
- checks['network'] = await _check_network()
+ # 5. Optional: profile-use (for browser-use profile)
+ checks['profile_use'] = _check_profile_use()
# Determine overall status
all_ok = all(check.get('status') == 'ok' for check in checks.values())
@@ -64,8 +62,7 @@ def _check_browser() -> dict[str, Any]:
try:
from browser_use.browser.profile import BrowserProfile
- # Just check if we can import and create a profile
- profile = BrowserProfile(headless=True)
+ BrowserProfile(headless=True) # verify import + constructor work
return {
'status': 'ok',
'message': 'Browser profile available',
@@ -78,45 +75,6 @@ def _check_browser() -> dict[str, Any]:
}
-def _check_api_key_config() -> dict[str, Any]:
- """Check if API key is configured."""
- from browser_use.skill_cli.api_key import check_api_key
-
- status = check_api_key()
- if status['available']:
- return {
- 'status': 'ok',
- 'message': f'API key configured ({status["source"]})',
- }
- else:
- return {
- 'status': 'missing',
- 'message': 'No API key configured',
- 'note': 'Required for remote browser. Get one at https://browser-use.com/new-api-key',
- }
-
-
-def _check_cloudflared() -> dict[str, Any]:
- """Check if cloudflared is available."""
- from browser_use.skill_cli.tunnel import get_tunnel_manager
-
- tunnel_mgr = get_tunnel_manager()
- status_info = tunnel_mgr.get_status()
-
- if status_info['available']:
- return {
- 'status': 'ok',
- 'message': f'Cloudflared available ({status_info["source"]})',
- 'note': status_info.get('note'),
- }
- else:
- return {
- 'status': 'missing',
- 'message': 'Cloudflared not available',
- 'note': 'Will be auto-installed on first tunnel use',
- }
-
-
async def _check_network() -> dict[str, Any]:
"""Check basic network connectivity."""
try:
@@ -140,6 +98,40 @@ async def _check_network() -> dict[str, Any]:
}
+def _check_cloudflared() -> dict[str, Any]:
+ """Check if cloudflared is available (needed for browser-use tunnel)."""
+ from browser_use.skill_cli.tunnel import get_tunnel_manager
+
+ status = get_tunnel_manager().get_status()
+ if status['available']:
+ return {
+ 'status': 'ok',
+ 'message': f'cloudflared installed ({status["path"]})',
+ }
+ return {
+ 'status': 'missing',
+ 'message': 'cloudflared not installed (needed for browser-use tunnel)',
+ 'fix': 'Install cloudflared: https://developers.cloudflare.com/cloudflare-one/connections/connect-networks/downloads/',
+ }
+
+
+def _check_profile_use() -> dict[str, Any]:
+ """Check if profile-use binary is available (needed for browser-use profile)."""
+ from browser_use.skill_cli.profile_use import get_profile_use_binary
+
+ binary = get_profile_use_binary()
+ if binary:
+ return {
+ 'status': 'ok',
+ 'message': f'profile-use installed ({binary})',
+ }
+ return {
+ 'status': 'missing',
+ 'message': 'profile-use not installed (needed for browser-use profile)',
+ 'fix': 'browser-use profile update',
+ }
+
+
def _summarize_checks(checks: dict[str, dict[str, Any]]) -> str:
"""Generate a summary of check results."""
ok = sum(1 for c in checks.values() if c.get('status') == 'ok')
diff --git a/browser_use/skill_cli/commands/profile.py b/browser_use/skill_cli/commands/profile.py
deleted file mode 100644
index 5b076fabc..000000000
--- a/browser_use/skill_cli/commands/profile.py
+++ /dev/null
@@ -1,703 +0,0 @@
-"""Profile management command handlers.
-
-Unified profile management that works with both local Chrome profiles and cloud profiles.
-The behavior is determined by the browser mode (-b real or -b remote).
-"""
-
-import argparse
-import json
-import logging
-import sys
-import tempfile
-from pathlib import Path
-from typing import Any, Literal
-
-from browser_use.skill_cli.commands.utils import get_sdk_client
-
-logger = logging.getLogger(__name__)
-
-
-ProfileMode = Literal['real', 'remote']
-
-
-class ProfileModeError(Exception):
- """Raised when profile mode cannot be determined or is invalid."""
-
- pass
-
-
-def get_profile_mode(args: argparse.Namespace) -> ProfileMode:
- """Determine profile mode from -b flag or install config.
-
- Args:
- args: Parsed command-line arguments with browser attribute
-
- Returns:
- 'real' for local Chrome profiles, 'remote' for cloud profiles
-
- Raises:
- ProfileModeError: If mode cannot be determined or chromium mode is used
- """
- from browser_use.skill_cli.install_config import is_mode_available
-
- browser_mode = getattr(args, 'browser', None)
-
- # Explicit mode specified
- if browser_mode == 'real':
- return 'real'
- elif browser_mode == 'remote':
- return 'remote'
- elif browser_mode == 'chromium':
- raise ProfileModeError(
- 'Profile commands are not supported in chromium mode.\n'
- 'Use -b real for local Chrome profiles or -b remote for cloud profiles.'
- )
-
- # No explicit mode - try to infer from install config
- local_available = is_mode_available('real')
- remote_available = is_mode_available('remote')
-
- if local_available and not remote_available:
- return 'real'
- elif remote_available and not local_available:
- return 'remote'
- elif local_available and remote_available:
- raise ProfileModeError(
- 'Both local and remote modes are available.\n'
- 'Specify -b real for local Chrome profiles or -b remote for cloud profiles.'
- )
- else:
- raise ProfileModeError('No profile modes available. Run browser-use setup first.')
-
-
-def handle_profile_command(args: argparse.Namespace) -> int:
- """Handle profile subcommands.
-
- Routes to local or cloud implementation based on browser mode.
- """
- command = args.profile_command
-
- # Commands that don't need mode inference
- if command is None:
- _print_usage()
- return 1
-
- # For sync command, we need special handling (local → cloud)
- if command == 'sync':
- return _handle_sync(args)
-
- # Get profile mode for all other commands
- try:
- mode = get_profile_mode(args)
- except ProfileModeError as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- # Route to appropriate handler
- if command == 'list':
- return _handle_list(args, mode)
- elif command == 'get':
- return _handle_get(args, mode)
- elif command == 'create':
- return _handle_create(args, mode)
- elif command == 'update':
- return _handle_update(args, mode)
- elif command == 'delete':
- return _handle_delete(args, mode)
- elif command == 'cookies':
- return _handle_cookies(args, mode)
- else:
- _print_usage()
- return 1
-
-
-def _print_usage() -> None:
- """Print profile command usage."""
- print('Usage: browser-use [-b real|remote] profile ')
- print()
- print('Commands:')
- print(' list List profiles')
- print(' get Get profile details')
- print(' create Create a new profile (remote only)')
- print(' update Update profile')
- print(' delete Delete profile')
- print(' cookies Show cookies by domain (real only)')
- print(' sync Sync local profile to cloud')
- print()
- print('The -b flag determines which profile system to use:')
- print(' -b real Local Chrome profiles')
- print(' -b remote Cloud profiles (requires API key)')
-
-
-# -----------------------------------------------------------------------------
-# List profiles
-# -----------------------------------------------------------------------------
-
-
-def _handle_list(args: argparse.Namespace, mode: ProfileMode) -> int:
- """Handle 'profile list' command."""
- if mode == 'real':
- return _list_local_profiles(args)
- else:
- return _list_cloud_profiles(args)
-
-
-def _list_local_profiles(args: argparse.Namespace) -> int:
- """List local Chrome profiles."""
- profiles = list_local_chrome_profiles()
-
- if getattr(args, 'json', False):
- print(json.dumps({'profiles': profiles}))
- else:
- if profiles:
- print('Local Chrome profiles:')
- for p in profiles:
- print(f' {p["id"]}: {p["name"]} ({p["email"]})')
- else:
- print('No Chrome profiles found')
-
- return 0
-
-
-def _list_cloud_profiles(args: argparse.Namespace) -> int:
- """List cloud profiles."""
- from browser_use.skill_cli.api_key import APIKeyRequired
-
- page = getattr(args, 'page', 1)
- page_size = getattr(args, 'page_size', 20)
-
- try:
- client = get_sdk_client()
- response = client.profiles.list_profiles(page_number=page, page_size=page_size)
- except APIKeyRequired as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- # Convert to dict for JSON output
- data = {
- 'items': [{'id': p.id, 'name': p.name} for p in response.items],
- 'totalItems': response.total_items,
- 'pageNumber': response.page_number,
- 'pageSize': response.page_size,
- }
- print(json.dumps(data))
- else:
- if response.items:
- print(f'Cloud profiles ({len(response.items)}/{response.total_items}):')
- for p in response.items:
- name = p.name or 'Unnamed'
- print(f' {p.id}: {name}')
- else:
- print('No cloud profiles found')
-
- return 0
-
-
-# -----------------------------------------------------------------------------
-# Get profile
-# -----------------------------------------------------------------------------
-
-
-def _handle_get(args: argparse.Namespace, mode: ProfileMode) -> int:
- """Handle 'profile get ' command."""
- if mode == 'real':
- return _get_local_profile(args)
- else:
- return _get_cloud_profile(args)
-
-
-def _get_local_profile(args: argparse.Namespace) -> int:
- """Get local Chrome profile details."""
- profiles = list_local_chrome_profiles()
- profile_id = args.id
-
- for p in profiles:
- if p['id'] == profile_id or p['name'] == profile_id:
- if getattr(args, 'json', False):
- print(json.dumps(p))
- else:
- print(f'Profile: {p["id"]}')
- print(f' Name: {p["name"]}')
- print(f' Email: {p["email"]}')
- return 0
-
- print(f'Error: Profile "{profile_id}" not found', file=sys.stderr)
- return 1
-
-
-def _get_cloud_profile(args: argparse.Namespace) -> int:
- """Get cloud profile details."""
- from browser_use.skill_cli.api_key import APIKeyRequired
-
- try:
- client = get_sdk_client()
- profile = client.profiles.get_profile(args.id)
- except APIKeyRequired as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- data = {
- 'id': profile.id,
- 'name': profile.name,
- 'createdAt': profile.created_at.isoformat() if profile.created_at else None,
- 'updatedAt': profile.updated_at.isoformat() if profile.updated_at else None,
- }
- print(json.dumps(data))
- else:
- print(f'Profile: {profile.id}')
- if profile.name:
- print(f' Name: {profile.name}')
- if profile.created_at:
- print(f' Created: {profile.created_at.isoformat()}')
- if profile.updated_at:
- print(f' Updated: {profile.updated_at.isoformat()}')
-
- return 0
-
-
-# -----------------------------------------------------------------------------
-# Create profile
-# -----------------------------------------------------------------------------
-
-
-def _handle_create(args: argparse.Namespace, mode: ProfileMode) -> int:
- """Handle 'profile create' command."""
- if mode == 'real':
- print('Error: Cannot create local Chrome profiles via CLI.', file=sys.stderr)
- print('Use Chrome browser to create new profiles.', file=sys.stderr)
- return 1
-
- return _create_cloud_profile(args)
-
-
-def _create_cloud_profile(args: argparse.Namespace) -> int:
- """Create a cloud profile."""
- from browser_use.skill_cli.api_key import APIKeyRequired
-
- try:
- client = get_sdk_client()
- params = {}
- if args.name:
- params['name'] = args.name
- profile = client.profiles.create_profile(**params)
- except APIKeyRequired as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps({'id': profile.id, 'name': profile.name}))
- else:
- print(f'Created profile: {profile.id}')
-
- return 0
-
-
-# -----------------------------------------------------------------------------
-# Update profile
-# -----------------------------------------------------------------------------
-
-
-def _handle_update(args: argparse.Namespace, mode: ProfileMode) -> int:
- """Handle 'profile update ' command."""
- if mode == 'real':
- print('Error: Cannot update local Chrome profiles via CLI.', file=sys.stderr)
- print('Use Chrome browser settings to update profiles.', file=sys.stderr)
- return 1
-
- return _update_cloud_profile(args)
-
-
-def _update_cloud_profile(args: argparse.Namespace) -> int:
- """Update a cloud profile."""
- from browser_use.skill_cli.api_key import APIKeyRequired
-
- try:
- client = get_sdk_client()
- params = {}
- if args.name:
- params['name'] = args.name
- profile = client.profiles.update_profile(args.id, **params)
- except APIKeyRequired as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps({'id': profile.id, 'name': profile.name}))
- else:
- print(f'Updated profile: {profile.id}')
-
- return 0
-
-
-# -----------------------------------------------------------------------------
-# Delete profile
-# -----------------------------------------------------------------------------
-
-
-def _handle_delete(args: argparse.Namespace, mode: ProfileMode) -> int:
- """Handle 'profile delete ' command."""
- if mode == 'real':
- print('Error: Cannot delete local Chrome profiles via CLI.', file=sys.stderr)
- print('Use Chrome browser settings to remove profiles.', file=sys.stderr)
- return 1
-
- return _delete_cloud_profile(args)
-
-
-def _delete_cloud_profile(args: argparse.Namespace) -> int:
- """Delete a cloud profile."""
- from browser_use.skill_cli.api_key import APIKeyRequired
-
- try:
- client = get_sdk_client()
- client.profiles.delete_browser_profile(args.id)
- except APIKeyRequired as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- if getattr(args, 'json', False):
- print(json.dumps({'deleted': args.id}))
- else:
- print(f'Deleted profile: {args.id}')
-
- return 0
-
-
-# -----------------------------------------------------------------------------
-# Cookies (local only)
-# -----------------------------------------------------------------------------
-
-
-def _handle_cookies(args: argparse.Namespace, mode: ProfileMode) -> int:
- """Handle 'profile cookies ' command."""
- if mode == 'remote':
- print('Error: Cookie listing is only available for local Chrome profiles.', file=sys.stderr)
- print('Use -b real to access local profile cookies.', file=sys.stderr)
- return 1
-
- return _list_profile_cookies(args)
-
-
-def _list_profile_cookies(args: argparse.Namespace) -> int:
- """List cookies by domain in a local Chrome profile."""
- import asyncio
-
- from browser_use.skill_cli.sessions import create_browser_session
-
- # Get local profiles
- local_profiles = list_local_chrome_profiles()
- if not local_profiles:
- print('Error: No local Chrome profiles found', file=sys.stderr)
- return 1
-
- # Find the matching profile
- profile_arg = args.id
- selected_profile = None
- for p in local_profiles:
- if p['id'] == profile_arg or p['name'] == profile_arg:
- selected_profile = p
- break
-
- if not selected_profile:
- print(f'Error: Profile "{profile_arg}" not found', file=sys.stderr)
- print('Available profiles:')
- for p in local_profiles:
- print(f' {p["id"]}: {p["name"]}')
- return 1
-
- profile_id = selected_profile['id']
- print(f'Loading cookies from: {selected_profile["name"]} ({selected_profile["email"]})')
-
- async def get_cookies():
- local_session = await create_browser_session('real', headed=False, profile=profile_id)
- await local_session.start()
- try:
- cookies = await local_session._cdp_get_cookies()
- return cookies
- finally:
- await local_session.kill()
-
- try:
- cookies = asyncio.get_event_loop().run_until_complete(get_cookies())
- except RuntimeError:
- cookies = asyncio.run(get_cookies())
-
- # Group cookies by domain
- domains: dict[str, int] = {}
- for cookie in cookies:
- domain = cookie.get('domain', 'unknown')
- # Normalize domain (remove leading dot)
- if domain.startswith('.'):
- domain = domain[1:]
- domains[domain] = domains.get(domain, 0) + 1
-
- # Sort by count descending
- sorted_domains = sorted(domains.items(), key=lambda x: x[1], reverse=True)
-
- if getattr(args, 'json', False):
- print(json.dumps({'domains': dict(sorted_domains), 'total_cookies': len(cookies)}))
- else:
- print(f'\nCookies by domain ({len(cookies)} total):')
- for domain, count in sorted_domains[:20]: # Show top 20
- print(f' {domain}: {count}')
- if len(sorted_domains) > 20:
- print(f' ... and {len(sorted_domains) - 20} more domains')
-
- print('\nTo sync cookies to cloud:')
- print(f' browser-use profile sync --from "{profile_id}" --domain ')
-
- return 0
-
-
-# -----------------------------------------------------------------------------
-# Sync (local → cloud)
-# -----------------------------------------------------------------------------
-
-
-def _handle_sync(args: argparse.Namespace) -> int:
- """Handle 'profile sync' command - sync local profile to cloud."""
- import asyncio
-
- from browser_use.skill_cli.api_key import APIKeyRequired
- from browser_use.skill_cli.sessions import create_browser_session
-
- # Get SDK client (validates API key)
- try:
- client = get_sdk_client()
- except APIKeyRequired as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
-
- # Get local profiles
- local_profiles = list_local_chrome_profiles()
- if not local_profiles:
- print('Error: No local Chrome profiles found', file=sys.stderr)
- return 1
-
- # Determine which profile to sync
- from_profile = args.from_profile
- if not from_profile:
- # Show available profiles and ask user to specify
- print('Available local profiles:')
- for p in local_profiles:
- print(f' {p["id"]}: {p["name"]} ({p["email"]})')
- print()
- print('Use --from to specify a profile:')
- print(' browser-use profile sync --from "Default"')
- print(' browser-use profile sync --from "Profile 1"')
- return 1
-
- # Find the matching profile
- selected_profile = None
- for p in local_profiles:
- if p['id'] == from_profile or p['name'] == from_profile:
- selected_profile = p
- break
-
- if not selected_profile:
- print(f'Error: Profile "{from_profile}" not found', file=sys.stderr)
- print('Available profiles:')
- for p in local_profiles:
- print(f' {p["id"]}: {p["name"]}')
- return 1
-
- profile_id = selected_profile['id']
- profile_name = selected_profile['name']
- domain_filter = getattr(args, 'domain', None)
-
- # Generate cloud profile name
- cloud_name = args.name if args.name else None
- if not cloud_name:
- if domain_filter:
- cloud_name = f'Chrome - {profile_name} ({domain_filter})'
- else:
- cloud_name = f'Chrome - {profile_name}'
-
- # Use stderr for progress when JSON output is requested
- json_output = getattr(args, 'json', False)
- out = sys.stderr if json_output else sys.stdout
-
- def log(msg: str) -> None:
- print(msg, file=out)
-
- if domain_filter:
- log(f'Syncing: {profile_name} → {domain_filter} cookies only')
- else:
- log(f'Syncing: {profile_name} ({selected_profile["email"]})')
-
- # Step 1: Create cloud profile
- log(' Creating cloud profile...')
- try:
- cloud_profile = client.profiles.create_profile(name=cloud_name)
- cloud_profile_id = cloud_profile.id
- except Exception as e:
- print(f'Error creating cloud profile: {e}', file=sys.stderr)
- return 1
-
- log(f' ✓ Created: {cloud_profile_id}')
-
- def cleanup_cloud_profile() -> None:
- """Delete the cloud profile on failure."""
- try:
- client.profiles.delete_browser_profile(cloud_profile_id)
- except Exception:
- pass
-
- # Step 2: Export cookies from local profile
- async def sync_cookies():
- log(' Exporting cookies from local profile...')
- local_session = await create_browser_session('real', headed=False, profile=profile_id)
- await local_session.start()
- try:
- cookies = await local_session._cdp_get_cookies()
- if not cookies:
- return 0, 'No cookies found in local profile'
-
- # Filter by domain if specified
- if domain_filter:
- cookies = [c for c in cookies if domain_filter in c.get('domain', '')]
-
- if not cookies:
- return 0, f'No cookies found for domain: {domain_filter}'
-
- log(f' ✓ Found {len(cookies)} cookies')
-
- # Save to temp file - convert Cookie objects to dicts for JSON serialization
- cookies_file = Path(tempfile.gettempdir()) / f'browser-use-sync-{cloud_profile_id}.json'
- cookies_data = [dict(c) if hasattr(c, '__dict__') else c for c in cookies]
- cookies_file.write_text(json.dumps(cookies_data))
-
- return len(cookies), str(cookies_file)
- finally:
- await local_session.kill()
-
- try:
- loop = asyncio.get_event_loop()
- if loop.is_running():
- import concurrent.futures
-
- with concurrent.futures.ThreadPoolExecutor() as executor:
- future = executor.submit(asyncio.run, sync_cookies())
- cookie_count, cookies_file = future.result()
- else:
- cookie_count, cookies_file = loop.run_until_complete(sync_cookies())
- except RuntimeError:
- cookie_count, cookies_file = asyncio.run(sync_cookies())
-
- if cookie_count == 0:
- log(f' ⚠ {cookies_file}') # cookies_file contains error message
- cleanup_cloud_profile()
- return 1
-
- # Step 3: Import cookies to cloud profile
- async def import_to_cloud():
- log(' Importing cookies to cloud profile...')
- remote_session = await create_browser_session('remote', headed=False, profile=cloud_profile_id)
- await remote_session.start()
- try:
- cookies = json.loads(Path(cookies_file).read_text())
- await remote_session._cdp_set_cookies(cookies)
- return True
- finally:
- await remote_session.kill()
-
- try:
- loop = asyncio.get_event_loop()
- if loop.is_running():
- import concurrent.futures
-
- with concurrent.futures.ThreadPoolExecutor() as executor:
- future = executor.submit(asyncio.run, import_to_cloud())
- future.result()
- else:
- loop.run_until_complete(import_to_cloud())
- except RuntimeError:
- asyncio.run(import_to_cloud())
- except Exception as e:
- log(f' ⚠ Failed to import cookies: {e}')
- cleanup_cloud_profile()
- return 1
-
- # Cleanup temp file
- try:
- Path(cookies_file).unlink()
- except Exception:
- pass
-
- log('✓ Profile synced successfully!')
- log(f' Cloud profile ID: {cloud_profile_id}')
- log('')
- log('To use this profile:')
- log(f' browser-use -b remote --profile {cloud_profile_id} open ')
-
- if json_output:
- print(
- json.dumps(
- {
- 'success': True,
- 'profile_id': cloud_profile_id,
- 'cookies_synced': cookie_count,
- }
- )
- )
-
- return 0
-
-
-# -----------------------------------------------------------------------------
-# Helpers
-# -----------------------------------------------------------------------------
-
-
-def list_local_chrome_profiles() -> list[dict[str, Any]]:
- """List local Chrome profiles from the Local State file."""
- import platform
-
- # Find Chrome Local State file
- system = platform.system()
- if system == 'Darwin':
- local_state = Path.home() / 'Library/Application Support/Google/Chrome/Local State'
- elif system == 'Windows':
- local_state = Path.home() / 'AppData/Local/Google/Chrome/User Data/Local State'
- else:
- local_state = Path.home() / '.config/google-chrome/Local State'
-
- if not local_state.exists():
- return []
-
- try:
- data = json.loads(local_state.read_text())
- profiles_info = data.get('profile', {}).get('info_cache', {})
-
- profiles = []
- for profile_id, info in profiles_info.items():
- profiles.append(
- {
- 'id': profile_id,
- 'name': info.get('name', profile_id),
- 'email': info.get('user_name', ''),
- }
- )
- return profiles
- except Exception:
- return []
diff --git a/browser_use/skill_cli/commands/python_exec.py b/browser_use/skill_cli/commands/python_exec.py
index 770ff761e..5bb001527 100644
--- a/browser_use/skill_cli/commands/python_exec.py
+++ b/browser_use/skill_cli/commands/python_exec.py
@@ -49,7 +49,7 @@ async def handle(session: SessionInfo, params: dict[str, Any]) -> Any:
# Execute code in a thread pool so browser operations can schedule back to the event loop
loop = asyncio.get_running_loop()
- result = await loop.run_in_executor(None, python_session.execute, code, browser_session, loop)
+ result = await loop.run_in_executor(None, python_session.execute, code, browser_session, loop, session.actions)
if result.success:
# Return raw text output for clean display
diff --git a/browser_use/skill_cli/commands/session.py b/browser_use/skill_cli/commands/session.py
deleted file mode 100644
index eb05d4ecc..000000000
--- a/browser_use/skill_cli/commands/session.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""Session management command handlers."""
-
-import logging
-from typing import TYPE_CHECKING, Any
-
-if TYPE_CHECKING:
- from browser_use.skill_cli.sessions import SessionRegistry
-
-logger = logging.getLogger(__name__)
-
-COMMANDS = {'sessions', 'close'}
-
-
-async def handle(action: str, session_name: str, registry: 'SessionRegistry', params: dict[str, Any]) -> Any:
- """Handle session management command."""
- if action == 'sessions':
- sessions = registry.list_sessions()
- return {
- 'sessions': sessions,
- 'count': len(sessions),
- }
-
- elif action == 'close':
- if params.get('all'):
- # Close all sessions and signal shutdown
- sessions = registry.list_sessions()
- await registry.close_all()
- return {
- 'closed': [s['name'] for s in sessions],
- 'count': len(sessions),
- '_shutdown': True, # Signal to stop server
- }
- else:
- # Close this server's session and shutdown
- await registry.close_session(session_name)
- return {'closed': session_name, '_shutdown': True}
-
- raise ValueError(f'Unknown session action: {action}')
diff --git a/browser_use/skill_cli/commands/setup.py b/browser_use/skill_cli/commands/setup.py
index e60f7bbf5..c56d11a82 100644
--- a/browser_use/skill_cli/commands/setup.py
+++ b/browser_use/skill_cli/commands/setup.py
@@ -1,330 +1,253 @@
-"""Setup command - configure browser-use for first-time use.
+"""Setup command — post-install setup for browser-use CLI.
-Handles dependency installation and configuration with mode-based
-setup (local/remote/full) and optional automatic fixes.
+Covers everything install.sh does after the package is installed:
+home directory, config file, Chromium, profile-use, cloudflared.
+Interactive by default, --yes for CI.
"""
-import logging
-from typing import Any, Literal
-
-logger = logging.getLogger(__name__)
-
-COMMANDS = {'setup'}
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
-async def handle(
- action: str,
- params: dict[str, Any],
-) -> dict[str, Any]:
- """Handle setup command."""
- assert action == 'setup'
-
- mode: Literal['local', 'remote', 'full'] = params.get('mode', 'local')
- yes: bool = params.get('yes', False)
- api_key: str | None = params.get('api_key')
- json_output: bool = params.get('json', False)
-
- # Validate mode
- if mode not in ('local', 'remote', 'full'):
- return {'error': f'Invalid mode: {mode}. Must be local, remote, or full'}
-
- # Run setup flow
+def _prompt(message: str, yes: bool) -> bool:
+ """Prompt user for confirmation. Returns True if --yes or user says yes."""
+ if yes:
+ return True
try:
- checks = await run_checks(mode)
-
- if not json_output:
- _log_checks(checks)
-
- # Plan actions
- actions = plan_actions(checks, mode, yes, api_key)
-
- if not json_output:
- _log_actions(actions)
-
- # Execute actions
- await execute_actions(actions, mode, api_key, json_output)
-
- # Validate
- validation = await validate_setup(mode)
-
- if not json_output:
- _log_validation(validation)
-
- return {
- 'status': 'success',
- 'mode': mode,
- 'checks': checks,
- 'validation': validation,
- }
-
- except Exception as e:
- logger.exception(f'Setup failed: {e}')
- error_msg = str(e)
- if json_output:
- return {'error': error_msg}
- return {'error': error_msg}
+ reply = input(f' {message} [Y/n] ').strip().lower()
+ return reply in ('', 'y', 'yes')
+ except (EOFError, KeyboardInterrupt):
+ print()
+ return False
-async def run_checks(mode: Literal['local', 'remote', 'full']) -> dict[str, Any]:
- """Run pre-flight checks without making changes.
+def handle(yes: bool = False) -> dict:
+ """Run interactive setup."""
+ from browser_use.skill_cli.utils import get_home_dir
- Returns:
- Dict mapping check names to their status
- """
- checks: dict[str, Any] = {}
+ home_dir = get_home_dir()
+ results: dict = {}
+ step = 0
+ total = 6
- # Package check
- try:
- import browser_use
+ print('\nBrowser-Use Setup')
+ print('━━━━━━━━━━━━━━━━━\n')
- checks['browser_use_package'] = {
- 'status': 'ok',
- 'message': f'browser-use {browser_use.__version__}'
- if hasattr(browser_use, '__version__')
- else 'browser-use installed',
- }
- except ImportError:
- checks['browser_use_package'] = {
- 'status': 'error',
- 'message': 'browser-use not installed',
- }
+ # Step 1: Home directory
+ step += 1
+ print(f'Step {step}/{total}: Home directory')
+ if home_dir.exists():
+ print(f' ✓ {home_dir} exists')
+ else:
+ home_dir.mkdir(parents=True, exist_ok=True)
+ print(f' ✓ {home_dir} created')
+ results['home_dir'] = 'ok'
- # Browser check (local and full modes)
- if mode in ('local', 'full'):
- checks['browser'] = await _check_browser()
-
- # API key check (remote and full modes)
- if mode in ('remote', 'full'):
- from browser_use.skill_cli.api_key import check_api_key
-
- api_status = check_api_key()
- if api_status['available']:
- checks['api_key'] = {
- 'status': 'ok',
- 'message': f'Configured via {api_status["source"]} ({api_status["key_prefix"]}...)',
- }
- else:
- checks['api_key'] = {
- 'status': 'missing',
- 'message': 'Not configured',
- }
-
- # Cloudflared check (remote and full modes)
- if mode in ('remote', 'full'):
- from browser_use.skill_cli.tunnel import get_tunnel_manager
-
- tunnel_mgr = get_tunnel_manager()
- status = tunnel_mgr.get_status()
- checks['cloudflared'] = {
- 'status': 'ok' if status['available'] else 'missing',
- 'message': status['note'],
- }
-
- return checks
-
-
-async def _check_browser() -> dict[str, Any]:
- """Check if browser is available."""
- try:
- from browser_use.browser.profile import BrowserProfile
-
- profile = BrowserProfile(headless=True)
- # Just check if we can create a session without actually launching
- return {
- 'status': 'ok',
- 'message': 'Browser available',
- }
- except Exception as e:
- return {
- 'status': 'error',
- 'message': f'Browser check failed: {e}',
- }
-
-
-def plan_actions(
- checks: dict[str, Any],
- mode: Literal['local', 'remote', 'full'],
- yes: bool,
- api_key: str | None,
-) -> list[dict[str, Any]]:
- """Plan which actions to take based on checks.
-
- Returns:
- List of actions to execute
- """
- actions: list[dict[str, Any]] = []
-
- # Browser installation (local/full)
- if mode in ('local', 'full'):
- browser_check = checks.get('browser', {})
- if browser_check.get('status') != 'ok':
- actions.append(
- {
- 'type': 'install_browser',
- 'description': 'Install browser (Chromium)',
- 'required': True,
- }
- )
-
- # API key configuration (remote/full)
- if mode in ('remote', 'full'):
- api_check = checks.get('api_key', {})
- if api_check.get('status') != 'ok':
- if api_key:
- actions.append(
- {
- 'type': 'configure_api_key',
- 'description': 'Configure API key',
- 'required': True,
- 'api_key': api_key,
- }
- )
- elif not yes:
- actions.append(
- {
- 'type': 'prompt_api_key',
- 'description': 'Prompt for API key',
- 'required': False,
- }
- )
-
- # Cloudflared (remote/full)
- if mode in ('remote', 'full'):
- cloudflared_check = checks.get('cloudflared', {})
- if cloudflared_check.get('status') != 'ok':
- actions.append(
- {
- 'type': 'install_cloudflared',
- 'description': 'Install cloudflared (for tunneling)',
- 'required': True,
- }
- )
-
- return actions
-
-
-async def execute_actions(
- actions: list[dict[str, Any]],
- mode: Literal['local', 'remote', 'full'],
- api_key: str | None,
- json_output: bool,
-) -> None:
- """Execute planned actions.
-
- Args:
- actions: List of actions to execute
- mode: Setup mode (local/remote/full)
- api_key: Optional API key to configure
- json_output: Whether to output JSON
- """
- for action in actions:
- action_type = action['type']
-
- if action_type == 'install_browser':
- if not json_output:
- print('📦 Installing Chromium browser (~300MB)...')
- # Browser will be installed on first use by Playwright
- if not json_output:
- print('✓ Browser available (will be installed on first use)')
-
- elif action_type == 'configure_api_key':
- if not json_output:
- print('🔑 Configuring API key...')
- from browser_use.skill_cli.api_key import save_api_key
-
- if api_key:
- save_api_key(api_key)
- if not json_output:
- print('✓ API key configured')
-
- elif action_type == 'prompt_api_key':
- if not json_output:
- print('🔑 API key not configured')
- print(' Set via: export BROWSER_USE_API_KEY=your_key')
- print(' Or: browser-use setup --api-key ')
-
- elif action_type == 'install_cloudflared':
- if not json_output:
- print('⚠ cloudflared not installed')
- print(' Install via:')
- print(' macOS: brew install cloudflared')
- print(
- ' Linux: curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -o ~/.local/bin/cloudflared && chmod +x ~/.local/bin/cloudflared'
- )
- print(' Windows: winget install Cloudflare.cloudflared')
- print()
- print(' Or re-run install.sh which installs cloudflared automatically.')
-
-
-async def validate_setup(
- mode: Literal['local', 'remote', 'full'],
-) -> dict[str, Any]:
- """Validate that setup worked.
-
- Returns:
- Dict with validation results
- """
- results: dict[str, Any] = {}
-
- # Check imports
- try:
- import browser_use # noqa: F401
-
- results['browser_use_import'] = 'ok'
- except ImportError:
- results['browser_use_import'] = 'failed'
-
- # Validate mode requirements
- if mode in ('local', 'full'):
+ # Step 2: Config file
+ step += 1
+ config_path = home_dir / 'config.json'
+ print(f'\nStep {step}/{total}: Config file')
+ if config_path.exists():
+ print(f' ✓ {config_path} exists')
+ else:
+ config_path.write_text('{}\n')
try:
- from browser_use.browser.profile import BrowserProfile
+ config_path.chmod(0o600)
+ except OSError:
+ pass
+ print(f' ✓ {config_path} created')
+ results['config'] = 'ok'
- browser_profile = BrowserProfile(headless=True)
- results['browser_available'] = 'ok'
- except Exception as e:
- results['browser_available'] = f'failed: {e}'
+ # Step 3: Chromium browser
+ step += 1
+ print(f'\nStep {step}/{total}: Chromium browser')
+ chromium_installed = _check_chromium()
+ if chromium_installed:
+ print(' ✓ Chromium already installed')
+ results['chromium'] = 'ok'
+ else:
+ if _prompt('Chromium is not installed (~300MB download). Install now?', yes):
+ print(' ℹ Installing Chromium...')
+ if _install_chromium():
+ print(' ✓ Chromium installed')
+ results['chromium'] = 'ok'
+ else:
+ print(' ✗ Chromium installation failed')
+ results['chromium'] = 'failed'
+ else:
+ print(' ○ Skipped')
+ results['chromium'] = 'skipped'
- if mode in ('remote', 'full'):
- from browser_use.skill_cli.api_key import check_api_key
- from browser_use.skill_cli.tunnel import get_tunnel_manager
+ # Step 4: Profile-use binary
+ step += 1
+ print(f'\nStep {step}/{total}: Profile-use binary')
+ from browser_use.skill_cli.profile_use import get_profile_use_binary
- api_check = check_api_key()
- results['api_key_available'] = api_check['available']
+ if get_profile_use_binary():
+ print(' ✓ profile-use already installed')
+ results['profile_use'] = 'ok'
+ else:
+ if _prompt('profile-use is not installed (needed for browser-use profile). Install now?', yes):
+ print(' ℹ Downloading profile-use...')
+ if _install_profile_use():
+ print(' ✓ profile-use installed')
+ results['profile_use'] = 'ok'
+ else:
+ print(' ✗ profile-use installation failed')
+ results['profile_use'] = 'failed'
+ else:
+ print(' ○ Skipped')
+ results['profile_use'] = 'skipped'
- tunnel_mgr = get_tunnel_manager()
- results['cloudflared_available'] = tunnel_mgr.is_available()
+ # Step 5: Cloudflared
+ step += 1
+ print(f'\nStep {step}/{total}: Cloudflare tunnel (cloudflared)')
+ if shutil.which('cloudflared'):
+ print(' ✓ cloudflared already installed')
+ results['cloudflared'] = 'ok'
+ else:
+ if _prompt('cloudflared is not installed (needed for browser-use tunnel). Install now?', yes):
+ print(' ℹ Installing cloudflared...')
+ if _install_cloudflared():
+ print(' ✓ cloudflared installed')
+ results['cloudflared'] = 'ok'
+ else:
+ print(' ✗ cloudflared installation failed')
+ results['cloudflared'] = 'failed'
+ else:
+ print(' ○ Skipped')
+ results['cloudflared'] = 'skipped'
+ # Step 6: Validation
+ step += 1
+ print(f'\nStep {step}/{total}: Validation')
+ from browser_use.skill_cli.config import CLI_DOCS_URL, get_config_display
+
+ # Quick checks
+ checks = {
+ 'package': _check_package(),
+ 'browser': 'ok' if _check_chromium() else 'missing',
+ 'profile_use': 'ok' if get_profile_use_binary() else 'missing',
+ 'cloudflared': 'ok' if shutil.which('cloudflared') else 'missing',
+ }
+ for name, status in checks.items():
+ icon = '✓' if status == 'ok' else '○'
+ print(f' {icon} {name}: {status}')
+
+ # Config display
+ entries = get_config_display()
+ print(f'\nConfig ({config_path}):')
+ for entry in entries:
+ if entry['is_set']:
+ icon = '✓'
+ val = 'set' if entry['sensitive'] else entry['value']
+ else:
+ icon = '○'
+ val = entry['value'] if entry['value'] else 'not set'
+ print(f' {icon} {entry["key"]}: {val}')
+ print(f' Docs: {CLI_DOCS_URL}')
+
+ print('\n━━━━━━━━━━━━━━━━━')
+ print('Setup complete! Next: browser-use open https://example.com\n')
+
+ results['status'] = 'success'
return results
-def _log_checks(checks: dict[str, Any]) -> None:
- """Log check results."""
- print('\n✓ Running checks...\n')
- for name, check in checks.items():
- status = check.get('status', 'unknown')
- message = check.get('message', '')
- icon = '✓' if status == 'ok' else '⚠' if status == 'missing' else '✗'
- print(f' {icon} {name.replace("_", " ")}: {message}')
- print()
+def _check_package() -> str:
+ """Check if browser-use package is importable."""
+ try:
+ import browser_use
+
+ version = getattr(browser_use, '__version__', 'unknown')
+ return f'browser-use {version}'
+ except ImportError:
+ return 'not installed'
-def _log_actions(actions: list[dict[str, Any]]) -> None:
- """Log planned actions."""
- if not actions:
- print('✓ No additional setup needed!\n')
- return
+def _check_chromium() -> bool:
+ """Check if playwright chromium is installed."""
+ try:
+ from browser_use.browser.profile import BrowserProfile
- print('\n📋 Setup actions:\n')
- for i, action in enumerate(actions, 1):
- required = '(required)' if action.get('required') else '(optional)'
- print(f' {i}. {action["description"]} {required}')
- print()
+ BrowserProfile(headless=True)
+ return True
+ except Exception:
+ return False
-def _log_validation(validation: dict[str, Any]) -> None:
- """Log validation results."""
- print('\n✓ Validation:\n')
- for name, result in validation.items():
- icon = '✓' if result == 'ok' else '✗'
- print(f' {icon} {name.replace("_", " ")}: {result}')
- print()
+def _install_chromium() -> bool:
+ """Install Chromium via playwright."""
+ try:
+ cmd = [sys.executable, '-m', 'playwright', 'install', 'chromium']
+ if sys.platform == 'linux':
+ cmd.append('--with-deps')
+ result = subprocess.run(cmd, timeout=300)
+ return result.returncode == 0
+ except Exception:
+ return False
+
+
+def _install_profile_use() -> bool:
+ """Download profile-use binary."""
+ try:
+ from browser_use.skill_cli.profile_use import download_profile_use
+
+ download_profile_use()
+ return True
+ except Exception:
+ return False
+
+
+def _install_cloudflared() -> bool:
+ """Install cloudflared."""
+ try:
+ if sys.platform == 'darwin':
+ result = subprocess.run(['brew', 'install', 'cloudflared'], timeout=120)
+ return result.returncode == 0
+ elif sys.platform == 'win32':
+ result = subprocess.run(['winget', 'install', 'Cloudflare.cloudflared'], timeout=120)
+ return result.returncode == 0
+ else:
+ # Linux: download binary + verify SHA256 checksum before installing
+ import hashlib
+ import platform
+ import shutil
+ import tempfile
+ import urllib.request
+
+ arch = 'arm64' if platform.machine() in ('aarch64', 'arm64') else 'amd64'
+ base_url = f'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-{arch}'
+
+ # Download to a temp file so we can verify before installing
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.tmp') as tmp:
+ tmp_path = Path(tmp.name)
+ try:
+ urllib.request.urlretrieve(base_url, tmp_path)
+
+ # Fetch checksum file published alongside the binary
+ with urllib.request.urlopen(f'{base_url}.sha256sum') as resp:
+ expected_sha256 = resp.read().decode().split()[0]
+
+ # Verify integrity before touching the install destination
+ actual_sha256 = hashlib.sha256(tmp_path.read_bytes()).hexdigest()
+ if actual_sha256 != expected_sha256:
+ raise RuntimeError(
+ f'cloudflared checksum mismatch — expected {expected_sha256}, got {actual_sha256}. '
+ 'The download may be corrupt or tampered with.'
+ )
+
+ dest = Path('/usr/local/bin/cloudflared')
+ if not os.access('/usr/local/bin', os.W_OK):
+ dest = Path.home() / '.local' / 'bin' / 'cloudflared'
+ dest.parent.mkdir(parents=True, exist_ok=True)
+ shutil.move(str(tmp_path), dest)
+ dest.chmod(0o755)
+ finally:
+ tmp_path.unlink(missing_ok=True)
+ return True
+ except Exception:
+ return False
diff --git a/browser_use/skill_cli/commands/utils.py b/browser_use/skill_cli/commands/utils.py
deleted file mode 100644
index 88e61cb67..000000000
--- a/browser_use/skill_cli/commands/utils.py
+++ /dev/null
@@ -1,46 +0,0 @@
-"""Shared utilities for CLI command handlers."""
-
-from datetime import datetime, timezone
-
-from browser_use_sdk import BrowserUse
-
-_client: BrowserUse | None = None
-
-
-def get_sdk_client() -> BrowserUse:
- """Get authenticated SDK client (singleton)."""
- global _client
- if _client is None:
- from browser_use.skill_cli.api_key import require_api_key
-
- api_key = require_api_key('Cloud API')
- _client = BrowserUse(api_key=api_key)
- return _client
-
-
-def format_duration(started_at: datetime | None, finished_at: datetime | None) -> str:
- """Format duration between two timestamps, or elapsed time if still running."""
- if not started_at:
- return ''
-
- try:
- if finished_at:
- end = finished_at
- else:
- end = datetime.now(timezone.utc)
-
- delta = end - started_at
- total_seconds = int(delta.total_seconds())
-
- if total_seconds < 60:
- return f'{total_seconds}s'
- elif total_seconds < 3600:
- minutes = total_seconds // 60
- seconds = total_seconds % 60
- return f'{minutes}m {seconds}s'
- else:
- hours = total_seconds // 3600
- minutes = (total_seconds % 3600) // 60
- return f'{hours}h {minutes}m'
- except Exception:
- return ''
diff --git a/browser_use/skill_cli/config.py b/browser_use/skill_cli/config.py
new file mode 100644
index 000000000..47106209f
--- /dev/null
+++ b/browser_use/skill_cli/config.py
@@ -0,0 +1,171 @@
+"""CLI configuration schema and helpers.
+
+Single source of truth for all CLI config keys. Doctor, setup, and
+getter functions all reference CONFIG_KEYS.
+"""
+
+import json
+from pathlib import Path
+
+CLI_DOCS_URL = 'https://docs.browser-use.com/open-source/browser-use-cli'
+
+CONFIG_KEYS: dict = {
+ 'api_key': {
+ 'type': str,
+ 'sensitive': True,
+ 'description': 'Browser Use Cloud API key',
+ },
+ 'cloud_connect_profile_id': {
+ 'type': str,
+ 'description': 'Cloud browser profile ID (auto-created)',
+ },
+ 'cloud_connect_proxy': {
+ 'type': str,
+ 'default': 'us',
+ 'description': 'Cloud proxy country code',
+ },
+ 'cloud_connect_timeout': {
+ 'type': int,
+ 'description': 'Cloud browser timeout (minutes)',
+ },
+ 'cloud_connect_recording': {
+ 'type': bool,
+ 'default': True,
+ 'description': 'Enable session recording in cloud browser',
+ },
+}
+
+
+def _get_config_path() -> Path:
+ from browser_use.skill_cli.utils import get_config_path
+
+ return get_config_path()
+
+
+def read_config() -> dict:
+ """Read CLI config file. Returns empty dict if missing or corrupt."""
+ path = _get_config_path()
+ if path.exists():
+ try:
+ return json.loads(path.read_text())
+ except (json.JSONDecodeError, OSError):
+ return {}
+ return {}
+
+
+def write_config(data: dict) -> None:
+ """Write CLI config file with 0o600 permissions, atomically via tmp+rename.
+
+ Writing directly to config.json risks truncation if the process is killed
+ mid-write, which read_config() would silently treat as {} (empty config),
+ wiping the API key and all other settings.
+ """
+ import os
+ import tempfile
+
+ path = _get_config_path()
+ path.parent.mkdir(parents=True, exist_ok=True)
+ content = json.dumps(data, indent=2) + '\n'
+
+ # Write to a temp file in the same directory so os.replace() is atomic
+ # (same filesystem guaranteed — cross-device rename raises OSError).
+ fd, tmp_str = tempfile.mkstemp(dir=path.parent, prefix='.config_tmp_')
+ tmp_path = Path(tmp_str)
+ try:
+ with os.fdopen(fd, 'w') as f:
+ f.write(content)
+ f.flush()
+ os.fsync(f.fileno())
+ try:
+ tmp_path.chmod(0o600)
+ except OSError:
+ pass
+ os.replace(tmp_path, path)
+ except Exception:
+ tmp_path.unlink(missing_ok=True)
+ raise
+
+
+def get_config_value(key: str) -> str | int | None:
+ """Read a config value, applying schema defaults.
+
+ Priority: config file → schema default → None.
+ """
+ schema = CONFIG_KEYS.get(key)
+ if schema is None:
+ return None
+
+ config = read_config()
+ val = config.get(key)
+ if val is not None:
+ return val
+
+ return schema.get('default')
+
+
+def set_config_value(key: str, value: str) -> None:
+ """Set a config value. Validates key and coerces type."""
+ schema = CONFIG_KEYS.get(key)
+ if schema is None:
+ raise ValueError(f'Unknown config key: {key}. Valid keys: {", ".join(CONFIG_KEYS)}')
+
+ # Coerce type
+ expected_type = schema.get('type', str)
+ try:
+ if expected_type is int:
+ coerced = int(value)
+ elif expected_type is bool:
+ if value.lower() in ('true', '1', 'yes'):
+ coerced = True
+ elif value.lower() in ('false', '0', 'no'):
+ coerced = False
+ else:
+ raise ValueError(f'Invalid value for {key}: expected true/false, got {value!r}')
+ else:
+ coerced = str(value)
+ except (ValueError, TypeError):
+ raise ValueError(f'Invalid value for {key}: expected {expected_type.__name__}, got {value!r}')
+
+ config = read_config()
+ config[key] = coerced
+ write_config(config)
+
+
+def unset_config_value(key: str) -> None:
+ """Remove a config key from the file."""
+ schema = CONFIG_KEYS.get(key)
+ if schema is None:
+ raise ValueError(f'Unknown config key: {key}. Valid keys: {", ".join(CONFIG_KEYS)}')
+
+ config = read_config()
+ if key in config:
+ del config[key]
+ write_config(config)
+
+
+def get_config_display() -> list[dict]:
+ """Return config state for display (doctor, setup).
+
+ Each entry: {key, value, is_set, sensitive, description}
+ """
+ config = read_config()
+ entries = []
+ for key, schema in CONFIG_KEYS.items():
+ val = config.get(key)
+ is_set = val is not None
+
+ # Apply default for display
+ display_val = val
+ if not is_set and 'default' in schema:
+ display_val = f'{schema["default"]} (default)'
+
+ entries.append(
+ {
+ 'key': key,
+ 'value': display_val,
+ 'is_set': is_set,
+ 'sensitive': schema.get('sensitive', False),
+ 'description': schema.get('description', ''),
+ }
+ )
+ return entries
diff --git a/browser_use/skill_cli/daemon.py b/browser_use/skill_cli/daemon.py
new file mode 100644
index 000000000..95c8da5a0
--- /dev/null
+++ b/browser_use/skill_cli/daemon.py
@@ -0,0 +1,537 @@
+"""Background daemon - keeps a single BrowserSession alive.
+
+Each daemon owns one session, identified by a session name (default: 'default').
+Isolation is per-session: each gets its own socket and PID file.
+Auto-exits when browser dies (polls is_cdp_connected).
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import signal
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from browser_use.skill_cli.sessions import SessionInfo
+
+# Configure logging before imports
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
+ handlers=[logging.StreamHandler()],
+)
+logger = logging.getLogger('browser_use.skill_cli.daemon')
+
+
+class Daemon:
+ """Single-session daemon that manages a browser and handles CLI commands."""
+
+ def __init__(
+ self,
+ headed: bool,
+ profile: str | None,
+ cdp_url: str | None = None,
+ use_cloud: bool = False,
+ cloud_profile_id: str | None = None,
+ cloud_proxy_country_code: str | None = None,
+ cloud_timeout: int | None = None,
+ session: str = 'default',
+ ) -> None:
+ from browser_use.skill_cli.utils import validate_session_name
+
+ validate_session_name(session)
+ self.session = session
+ self.headed = headed
+ self.profile = profile
+ self.cdp_url = cdp_url
+ self.use_cloud = use_cloud
+ self.cloud_profile_id = cloud_profile_id
+ self.cloud_proxy_country_code = cloud_proxy_country_code
+ self.cloud_timeout = cloud_timeout
+ self.running = True
+ self._server: asyncio.Server | None = None
+ self._shutdown_event = asyncio.Event()
+ self._session: SessionInfo | None = None
+ self._shutdown_task: asyncio.Task | None = None
+ self._browser_watchdog_task: asyncio.Task | None = None
+ self._session_lock = asyncio.Lock()
+ self._last_command_time: float = 0.0
+ self._idle_timeout: float = 30 * 60.0 # 30 minutes
+ self._idle_watchdog_task: asyncio.Task | None = None
+ self._is_shutting_down: bool = False
+ self._auth_token: str = ''
+
+ def _write_state(self, phase: str) -> None:
+ """Atomically write session state file for CLI observability."""
+ import time
+
+ from browser_use.skill_cli.utils import get_home_dir
+
+ state = {
+ 'phase': phase,
+ 'pid': os.getpid(),
+ 'updated_at': time.time(),
+ 'config': {
+ 'headed': self.headed,
+ 'profile': self.profile,
+ 'cdp_url': self.cdp_url,
+ 'use_cloud': self.use_cloud,
+ },
+ }
+ state_path = get_home_dir() / f'{self.session}.state.json'
+ tmp_path = state_path.with_suffix('.state.json.tmp')
+ try:
+ with open(tmp_path, 'w') as f:
+ json.dump(state, f)
+ f.flush()
+ os.fsync(f.fileno())
+ os.replace(tmp_path, state_path)
+ except OSError as e:
+ logger.debug(f'Failed to write state file: {e}')
+
+ def _request_shutdown(self) -> None:
+ """Request shutdown exactly once. Safe from any context."""
+ if self._is_shutting_down:
+ return
+ self._is_shutting_down = True
+ self._shutdown_task = asyncio.create_task(self._shutdown())
+
+ async def _get_or_create_session(self) -> SessionInfo:
+ """Lazy-create the single session on first command."""
+ if self._session is not None:
+ return self._session
+
+ async with self._session_lock:
+ # Double-check after acquiring lock
+ if self._session is not None:
+ return self._session
+
+ from browser_use.skill_cli.sessions import SessionInfo, create_browser_session
+
+ logger.info(
+ f'Creating session (headed={self.headed}, profile={self.profile}, cdp_url={self.cdp_url}, use_cloud={self.use_cloud})'
+ )
+
+ self._write_state('starting')
+
+ bs = await create_browser_session(
+ self.headed,
+ self.profile,
+ self.cdp_url,
+ use_cloud=self.use_cloud,
+ cloud_profile_id=self.cloud_profile_id,
+ cloud_proxy_country_code=self.cloud_proxy_country_code,
+ cloud_timeout=self.cloud_timeout,
+ )
+
+ try:
+ await bs.start()
+ self._write_state('starting') # refresh updated_at after bs.start() returns
+
+ # Wait for Chrome to stabilize after CDP setup before accepting commands
+ try:
+ await bs.get_browser_state_summary()
+ except Exception:
+ pass
+
+ # Create action handler for direct command execution (no event bus)
+ from browser_use.skill_cli.actions import ActionHandler
+
+ actions = ActionHandler(bs)
+
+ self._session = SessionInfo(
+ name=self.session,
+ headed=self.headed,
+ profile=self.profile,
+ cdp_url=self.cdp_url,
+ browser_session=bs,
+ actions=actions,
+ use_cloud=self.use_cloud,
+ )
+ self._browser_watchdog_task = asyncio.create_task(self._watch_browser())
+
+ # Start idle timeout watchdog
+ self._idle_watchdog_task = asyncio.create_task(self._watch_idle())
+
+ except Exception:
+ # Startup failed — rollback browser resources
+ logger.exception('Session startup failed, rolling back')
+ self._write_state('failed')
+ try:
+ if self.use_cloud and hasattr(bs, '_cloud_browser_client') and bs._cloud_browser_client.current_session_id:
+ await asyncio.wait_for(bs._cloud_browser_client.stop_browser(), timeout=10.0)
+ elif not self.cdp_url and not self.use_cloud:
+ await asyncio.wait_for(bs.kill(), timeout=10.0)
+ else:
+ await asyncio.wait_for(bs.stop(), timeout=10.0)
+ except Exception as cleanup_err:
+ logger.debug(f'Rollback cleanup error: {cleanup_err}')
+ raise
+
+ self._write_state('running')
+ return self._session
+
+ async def _watch_browser(self) -> None:
+ """Poll BrowserSession.is_cdp_connected every 2s. Shut down when the browser dies.
+
+ Skips checks while the BrowserSession is reconnecting. If reconnection fails,
+ next poll will see is_cdp_connected=False and trigger shutdown.
+ """
+ while self.running:
+ await asyncio.sleep(2.0)
+ if not self._session:
+ continue
+ bs = self._session.browser_session
+ # Don't shut down while a reconnection attempt is in progress
+ if bs.is_reconnecting:
+ continue
+ if not bs.is_cdp_connected:
+ logger.info('Browser disconnected, shutting down daemon')
+ self._request_shutdown()
+ return
+
+ async def _watch_idle(self) -> None:
+ """Shut down the daemon after idle_timeout seconds of no commands."""
+ while self.running:
+ await asyncio.sleep(60.0)
+ if self._last_command_time > 0:
+ import time
+
+ idle = time.monotonic() - self._last_command_time
+ if idle >= self._idle_timeout:
+ logger.info(f'Daemon idle for {idle:.0f}s, shutting down')
+ self._request_shutdown()
+ return
+
+ async def handle_connection(
+ self,
+ reader: asyncio.StreamReader,
+ writer: asyncio.StreamWriter,
+ ) -> None:
+ """Handle a single client request (one command per connection)."""
+ try:
+ line = await asyncio.wait_for(reader.readline(), timeout=300)
+ if not line:
+ return
+
+ request = {}
+ try:
+ import hmac
+
+ request = json.loads(line.decode())
+ req_id = request.get('id', '')
+ # Reject requests that don't carry the correct auth token.
+ # Use hmac.compare_digest to prevent timing-oracle attacks.
+ if self._auth_token and not hmac.compare_digest(
+ request.get('token', ''),
+ self._auth_token,
+ ):
+ response = {'id': req_id, 'success': False, 'error': 'Unauthorized'}
+ else:
+ response = await self.dispatch(request)
+ except json.JSONDecodeError as e:
+ response = {'id': '', 'success': False, 'error': f'Invalid JSON: {e}'}
+ except Exception as e:
+ logger.exception(f'Error handling request: {e}')
+ response = {'id': '', 'success': False, 'error': str(e)}
+
+ writer.write((json.dumps(response) + '\n').encode())
+ await writer.drain()
+
+ if response.get('success') and request.get('action') == 'shutdown':
+ self._request_shutdown()
+
+ except TimeoutError:
+ logger.debug('Connection timeout')
+ except Exception as e:
+ logger.exception(f'Connection error: {e}')
+ finally:
+ writer.close()
+ try:
+ await writer.wait_closed()
+ except Exception:
+ pass
+
+ async def dispatch(self, request: dict) -> dict:
+ """Route to command handlers."""
+ import time
+
+ self._last_command_time = time.monotonic()
+
+ action = request.get('action', '')
+ params = request.get('params', {})
+ req_id = request.get('id', '')
+
+ logger.info(f'Dispatch: {action} (id={req_id})')
+
+ try:
+ # Handle shutdown
+ if action == 'shutdown':
+ return {'id': req_id, 'success': True, 'data': {'shutdown': True}}
+
+ # Handle ping — returns daemon config for mismatch detection
+ if action == 'ping':
+ # Return live CDP URL (may differ from constructor arg for cloud sessions)
+ live_cdp_url = self.cdp_url
+ if self._session and self._session.browser_session.cdp_url:
+ live_cdp_url = self._session.browser_session.cdp_url
+ return {
+ 'id': req_id,
+ 'success': True,
+ 'data': {
+ 'session': self.session,
+ 'pid': os.getpid(),
+ 'headed': self.headed,
+ 'profile': self.profile,
+ 'cdp_url': live_cdp_url,
+ 'use_cloud': self.use_cloud,
+ },
+ }
+
+ # Handle connect — forces immediate session creation (used by cloud connect)
+ if action == 'connect':
+ session = await self._get_or_create_session()
+ bs = session.browser_session
+ result_data: dict = {'status': 'connected'}
+ if bs.cdp_url:
+ result_data['cdp_url'] = bs.cdp_url
+ if self.use_cloud and bs.cdp_url:
+ from urllib.parse import quote
+
+ result_data['live_url'] = f'https://live.browser-use.com/?wss={quote(bs.cdp_url, safe="")}'
+ return {'id': req_id, 'success': True, 'data': result_data}
+
+ from browser_use.skill_cli.commands import browser, python_exec
+
+ # Get or create the single session
+ session = await self._get_or_create_session()
+
+ # Dispatch to handler
+ if action in browser.COMMANDS:
+ result = await browser.handle(action, session, params)
+ elif action == 'python':
+ result = await python_exec.handle(session, params)
+ else:
+ return {'id': req_id, 'success': False, 'error': f'Unknown action: {action}'}
+
+ return {'id': req_id, 'success': True, 'data': result}
+
+ except Exception as e:
+ logger.exception(f'Error dispatching {action}: {e}')
+ return {'id': req_id, 'success': False, 'error': str(e)}
+
+ async def run(self) -> None:
+ """Listen on Unix socket (or TCP on Windows) with PID file.
+
+ Note: we do NOT unlink the socket in our finally block. If a replacement
+ daemon was spawned during our shutdown, it already bound a new socket at
+ the same path — unlinking here would delete *its* socket, orphaning it.
+ Stale sockets are cleaned up by is_daemon_alive() and by the next
+ daemon's startup (unlink before bind).
+ """
+ import secrets
+
+ from browser_use.skill_cli.utils import get_auth_token_path, get_pid_path, get_socket_path
+
+ self._write_state('initializing')
+
+ # Generate and persist a per-session auth token.
+ # The client reads this file to authenticate its requests, preventing
+ # any other local process from sending commands to the daemon socket.
+ # Create the temp file with 0o600 at open() time to avoid a permission
+ # race window where the file exists but is not yet restricted.
+ # Raise on failure — running without a readable token file leaves the
+ # daemon permanently unauthorized for all clients.
+ self._auth_token = secrets.token_hex(32)
+ token_path = get_auth_token_path(self.session)
+ tmp_token = token_path.with_suffix('.token.tmp')
+ fd = os.open(str(tmp_token), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+ try:
+ with os.fdopen(fd, 'w') as f:
+ f.write(self._auth_token)
+ except OSError:
+ try:
+ tmp_token.unlink(missing_ok=True)
+ except OSError:
+ pass
+ raise
+ os.replace(tmp_token, token_path)
+
+ # Setup signal handlers
+ loop = asyncio.get_running_loop()
+
+ def signal_handler():
+ self._request_shutdown()
+
+ for sig in (signal.SIGINT, signal.SIGTERM):
+ try:
+ loop.add_signal_handler(sig, signal_handler)
+ except NotImplementedError:
+ pass # Windows doesn't support add_signal_handler
+
+ if hasattr(signal, 'SIGHUP'):
+ try:
+ loop.add_signal_handler(signal.SIGHUP, signal_handler)
+ except NotImplementedError:
+ pass
+
+ sock_path = get_socket_path(self.session)
+ pid_path = get_pid_path(self.session)
+ logger.info(f'Session: {self.session}, Socket: {sock_path}')
+
+ if sock_path.startswith('tcp://'):
+ # Windows: TCP server
+ _, hostport = sock_path.split('://', 1)
+ host, port = hostport.split(':')
+ self._server = await asyncio.start_server(
+ self.handle_connection,
+ host,
+ int(port),
+ reuse_address=True,
+ )
+ logger.info(f'Listening on TCP {host}:{port}')
+ else:
+ # Unix: socket server
+ Path(sock_path).unlink(missing_ok=True)
+ self._server = await asyncio.start_unix_server(
+ self.handle_connection,
+ sock_path,
+ )
+ logger.info(f'Listening on Unix socket {sock_path}')
+
+ # Write PID file after server is bound
+ my_pid = str(os.getpid())
+ pid_path.write_text(my_pid)
+ self._write_state('ready')
+
+ try:
+ async with self._server:
+ await self._shutdown_event.wait()
+ # Wait for shutdown to finish browser cleanup before exiting
+ if self._shutdown_task:
+ await self._shutdown_task
+ except asyncio.CancelledError:
+ pass
+ finally:
+ # Conditionally delete PID file only if it still contains our PID
+ try:
+ if pid_path.read_text().strip() == my_pid:
+ pid_path.unlink(missing_ok=True)
+ except (OSError, ValueError):
+ pass
+ logger.info('Daemon stopped')
+
+ async def _shutdown(self) -> None:
+ """Graceful shutdown. Only called via _request_shutdown().
+
+ Order matters: close the server first to release the socket/port
+ immediately, so a replacement daemon can bind without waiting for
+ browser cleanup. Then kill the browser session.
+ """
+ logger.info('Shutting down daemon...')
+ self._write_state('shutting_down')
+ self.running = False
+ self._shutdown_event.set()
+
+ if self._browser_watchdog_task:
+ self._browser_watchdog_task.cancel()
+
+ if self._idle_watchdog_task:
+ self._idle_watchdog_task.cancel()
+
+ if self._server:
+ self._server.close()
+
+ if self._session:
+ try:
+ # Only kill the browser if the daemon launched it.
+ # For external connections (--connect, --cdp-url, cloud), just disconnect.
+ # Timeout ensures daemon exits even if CDP calls hang on a dead connection
+ if self.cdp_url or self.use_cloud:
+ await asyncio.wait_for(self._session.browser_session.stop(), timeout=10.0)
+ else:
+ await asyncio.wait_for(self._session.browser_session.kill(), timeout=10.0)
+ except TimeoutError:
+ logger.warning('Browser cleanup timed out after 10s, forcing exit')
+ except Exception as e:
+ logger.warning(f'Error closing session: {e}')
+ self._session = None
+
+ # Delete PID and auth token files last, right before exit.
+ import os
+
+ from browser_use.skill_cli.utils import get_auth_token_path, get_pid_path
+
+ pid_path = get_pid_path(self.session)
+ try:
+ if pid_path.exists() and pid_path.read_text().strip() == str(os.getpid()):
+ pid_path.unlink(missing_ok=True)
+ except (OSError, ValueError):
+ pass
+
+ get_auth_token_path(self.session).unlink(missing_ok=True)
+
+ self._write_state('stopped')
+
+ # Force exit — the asyncio server's __aexit__ hangs waiting for the
+ # handle_connection() call that triggered this shutdown to return.
+ logger.info('Daemon process exiting')
+ os._exit(0)
+
+
+def main() -> None:
+ """Main entry point for daemon process."""
+ parser = argparse.ArgumentParser(description='Browser-use daemon')
+ parser.add_argument('--session', default='default', help='Session name (default: "default")')
+ parser.add_argument('--headed', action='store_true', help='Show browser window')
+ parser.add_argument('--profile', help='Chrome profile (triggers real Chrome mode)')
+ parser.add_argument('--cdp-url', help='CDP URL to connect to')
+ parser.add_argument('--use-cloud', action='store_true', help='Use cloud browser')
+ parser.add_argument('--cloud-profile-id', help='Cloud browser profile ID')
+ parser.add_argument('--cloud-proxy-country', help='Cloud browser proxy country code')
+ parser.add_argument('--cloud-timeout', type=int, help='Cloud browser timeout in minutes')
+ args = parser.parse_args()
+
+ logger.info(
+ f'Starting daemon: session={args.session}, headed={args.headed}, profile={args.profile}, cdp_url={args.cdp_url}, use_cloud={args.use_cloud}'
+ )
+
+ daemon = Daemon(
+ headed=args.headed,
+ profile=args.profile,
+ cdp_url=args.cdp_url,
+ use_cloud=args.use_cloud,
+ cloud_profile_id=args.cloud_profile_id,
+ cloud_proxy_country_code=args.cloud_proxy_country,
+ cloud_timeout=args.cloud_timeout,
+ session=args.session,
+ )
+
+ exit_code = 0
+ try:
+ asyncio.run(daemon.run())
+ except KeyboardInterrupt:
+ logger.info('Interrupted')
+ except Exception as e:
+ logger.exception(f'Daemon error: {e}')
+ exit_code = 1
+ finally:
+ # Write failed state if we crashed without a clean shutdown
+ if not daemon._is_shutting_down:
+ try:
+ daemon._write_state('failed')
+ except Exception:
+ pass
+ # asyncio.run() may hang trying to cancel lingering tasks
+ # Force-exit to prevent the daemon from becoming an orphan
+ logger.info('Daemon process exiting')
+ os._exit(exit_code)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/browser_use/skill_cli/install.sh b/browser_use/skill_cli/install.sh
index 23847d7a2..41bb40713 100755
--- a/browser_use/skill_cli/install.sh
+++ b/browser_use/skill_cli/install.sh
@@ -2,17 +2,8 @@
# Browser-Use Bootstrap Installer
#
# Usage:
-# # Interactive install (shows mode selection TUI)
# curl -fsSL https://browser-use.com/cli/install.sh | bash
#
-# # Non-interactive install with flags
-# curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full
-# curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only
-# curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --local-only
-#
-# # With API key
-# curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only --api-key bu_xxx
-#
# For development testing:
# curl -fsSL | BROWSER_USE_BRANCH= bash
#
@@ -24,7 +15,7 @@
# winget install Git.Git
#
# Then run from PowerShell:
-# & "C:\Program Files\Git\bin\bash.exe" -c 'curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full'
+# & "C:\Program Files\Git\bin\bash.exe" -c 'curl -fsSL https://browser-use.com/cli/install.sh | bash'
#
# KNOWN ISSUES AND SOLUTIONS:
#
@@ -76,10 +67,10 @@
# - Always kill stale processes before retrying
# - Or kill all Python: taskkill /IM python.exe /F
#
-# 7. Debugging server issues
-# To see actual error messages instead of "Failed to start session server":
-# & "$env:USERPROFILE\.browser-use-env\Scripts\python.exe" -m browser_use.skill_cli.server --session default --browser chromium
-# This runs the server in foreground and shows all errors.
+# 7. Debugging daemon issues
+# To see actual error messages instead of "Failed to start daemon":
+# & "$env:USERPROFILE\.browser-use-env\Scripts\python.exe" -m browser_use.skill_cli.daemon
+# This runs the daemon in foreground and shows all errors.
#
# =============================================================================
@@ -89,12 +80,6 @@ set -e
# Configuration
# =============================================================================
-# Mode flags (set by parse_args or TUI)
-INSTALL_LOCAL=false
-INSTALL_REMOTE=false
-SKIP_INTERACTIVE=false
-API_KEY=""
-
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
@@ -130,43 +115,15 @@ log_error() {
parse_args() {
while [[ $# -gt 0 ]]; do
case $1 in
- --full|--all)
- INSTALL_LOCAL=true
- INSTALL_REMOTE=true
- SKIP_INTERACTIVE=true
- shift
- ;;
- --remote-only)
- INSTALL_REMOTE=true
- SKIP_INTERACTIVE=true
- shift
- ;;
- --local-only)
- INSTALL_LOCAL=true
- SKIP_INTERACTIVE=true
- shift
- ;;
- --api-key)
- if [ -z "$2" ] || [[ "$2" == --* ]]; then
- log_error "--api-key requires a value"
- exit 1
- fi
- API_KEY="$2"
- shift 2
- ;;
--help|-h)
echo "Browser-Use Installer"
echo ""
echo "Usage: install.sh [OPTIONS]"
echo ""
echo "Options:"
- echo " --full, --all Install all modes (local + remote)"
- echo " --remote-only Install remote mode only (no Chromium)"
- echo " --local-only Install local modes only (no cloudflared)"
- echo " --api-key KEY Set Browser-Use API key"
echo " --help, -h Show this help"
echo ""
- echo "Without options, shows interactive mode selection."
+ echo "Installs Python 3.11+ (if needed), uv, browser-use, and Chromium."
exit 0
;;
*)
@@ -331,6 +288,10 @@ install_python() {
install_uv() {
log_info "Installing uv package manager..."
+ # Add common uv install locations to PATH for current session
+ # (covers both curl-based and Homebrew installs)
+ export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
+
if command -v uv &> /dev/null; then
log_success "uv already installed"
return 0
@@ -339,9 +300,6 @@ install_uv() {
# Use official uv installer
curl -LsSf https://astral.sh/uv/install.sh | sh
- # Add common uv install locations to PATH for current session
- export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
-
if command -v uv &> /dev/null; then
log_success "uv installed successfully"
else
@@ -350,121 +308,6 @@ install_uv() {
fi
}
-# =============================================================================
-# Gum TUI installation
-# =============================================================================
-
-install_gum() {
- # Install gum for beautiful TUI - silent and fast
- if command -v gum &> /dev/null; then
- return 0
- fi
-
- local arch=$(uname -m)
- local gum_version="0.14.5"
- local gum_dir=""
-
- mkdir -p "$HOME/.local/bin"
- export PATH="$HOME/.local/bin:$PATH"
-
- case "$PLATFORM" in
- macos)
- if [ "$arch" = "arm64" ]; then
- gum_dir="gum_${gum_version}_Darwin_arm64"
- curl -sL "https://github.com/charmbracelet/gum/releases/download/v${gum_version}/gum_${gum_version}_Darwin_arm64.tar.gz" | tar -xz -C /tmp
- else
- gum_dir="gum_${gum_version}_Darwin_x86_64"
- curl -sL "https://github.com/charmbracelet/gum/releases/download/v${gum_version}/gum_${gum_version}_Darwin_x86_64.tar.gz" | tar -xz -C /tmp
- fi
- mv "/tmp/${gum_dir}/gum" "$HOME/.local/bin/" 2>/dev/null || return 1
- rm -rf "/tmp/${gum_dir}" 2>/dev/null
- ;;
- linux)
- if [ "$arch" = "aarch64" ] || [ "$arch" = "arm64" ]; then
- gum_dir="gum_${gum_version}_Linux_arm64"
- curl -sL "https://github.com/charmbracelet/gum/releases/download/v${gum_version}/gum_${gum_version}_Linux_arm64.tar.gz" | tar -xz -C /tmp
- else
- gum_dir="gum_${gum_version}_Linux_x86_64"
- curl -sL "https://github.com/charmbracelet/gum/releases/download/v${gum_version}/gum_${gum_version}_Linux_x86_64.tar.gz" | tar -xz -C /tmp
- fi
- mv "/tmp/${gum_dir}/gum" "$HOME/.local/bin/" 2>/dev/null || return 1
- rm -rf "/tmp/${gum_dir}" 2>/dev/null
- ;;
- windows)
- # Download and extract Windows binary
- curl -sL "https://github.com/charmbracelet/gum/releases/download/v${gum_version}/gum_${gum_version}_Windows_x86_64.zip" -o /tmp/gum.zip
- unzip -q /tmp/gum.zip -d /tmp/gum_windows 2>/dev/null || return 1
- # Binary is inside a subdirectory: gum_x.x.x_Windows_x86_64/gum.exe
- mv "/tmp/gum_windows/gum_${gum_version}_Windows_x86_64/gum.exe" "$HOME/.local/bin/" 2>/dev/null || return 1
- rm -rf /tmp/gum.zip /tmp/gum_windows 2>/dev/null
- ;;
- *)
- return 1
- ;;
- esac
-
- command -v gum &> /dev/null
-}
-
-# =============================================================================
-# Interactive mode selection TUI
-# =============================================================================
-
-show_mode_menu() {
- # Try to install gum for nice TUI
- if install_gum; then
- show_gum_menu
- else
- show_bash_menu
- fi
-}
-
-show_gum_menu() {
- echo ""
-
- # Styled header
- gum style --foreground 212 --bold "Select browser modes to install"
- gum style --foreground 240 "Use arrow keys to navigate, space to select, enter to confirm"
- echo ""
-
- # Checkbox selection with gum choose
- set +e
- SELECTED=$(gum choose --no-limit --height 10 \
- --cursor-prefix "[ ] " --selected-prefix "[✓] " --unselected-prefix "[ ] " \
- --header "" \
- --cursor.foreground 212 \
- --selected.foreground 212 \
- "Local browser (chromium/real - requires Chromium)" \
- "Remote browser (cloud - requires API key)" < /dev/tty)
- set -e
-
- # Parse selections
- if [[ "$SELECTED" == *"Local"* ]]; then INSTALL_LOCAL=true; fi
- if [[ "$SELECTED" == *"Remote"* ]]; then INSTALL_REMOTE=true; fi
-}
-
-show_bash_menu() {
- echo ""
- echo "Select browser modes to install (space-separated numbers):"
- echo ""
- echo " 1) Local browser (chromium/real - requires Chromium download)"
- echo " 2) Remote browser (cloud - requires API key)"
- echo ""
- echo "Press Enter for default [1]"
- echo ""
- echo -n "> "
-
- # Read from /dev/tty to work even when script is piped
- # Keep set +e for the whole function to avoid issues with pattern matching
- set +e
- read -r choices < /dev/tty
- choices=${choices:-1}
-
- if [[ "$choices" == *"1"* ]]; then INSTALL_LOCAL=true; fi
- if [[ "$choices" == *"2"* ]]; then INSTALL_REMOTE=true; fi
- set -e
-}
-
# =============================================================================
# Browser-Use installation
# =============================================================================
@@ -515,119 +358,19 @@ install_chromium() {
log_success "Chromium installed"
}
-install_cloudflared() {
- log_info "Installing cloudflared..."
+install_profile_use() {
+ log_info "Installing profile-use..."
- if command -v cloudflared &> /dev/null; then
- log_success "cloudflared already installed"
- return 0
- fi
+ mkdir -p "$HOME/.browser-use/bin"
+ curl -fsSL https://browser-use.com/profile/cli/install.sh | PROFILE_USE_VERSION=v1.0.2 INSTALL_DIR="$HOME/.browser-use/bin" sh
- local arch=$(uname -m)
-
- case "$PLATFORM" in
- macos)
- if command -v brew &> /dev/null; then
- brew install cloudflared
- else
- # Direct download for macOS without Homebrew
- mkdir -p "$HOME/.local/bin"
- if [ "$arch" = "arm64" ]; then
- curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-darwin-arm64.tgz -o /tmp/cloudflared.tgz
- else
- curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-darwin-amd64.tgz -o /tmp/cloudflared.tgz
- fi
- tar -xzf /tmp/cloudflared.tgz -C "$HOME/.local/bin/"
- rm /tmp/cloudflared.tgz
- fi
- ;;
- linux)
- mkdir -p "$HOME/.local/bin"
- if [ "$arch" = "aarch64" ] || [ "$arch" = "arm64" ]; then
- curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-arm64 -o "$HOME/.local/bin/cloudflared"
- else
- curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -o "$HOME/.local/bin/cloudflared"
- fi
- chmod +x "$HOME/.local/bin/cloudflared"
- ;;
- windows)
- # Auto-install via winget (comes pre-installed on Windows 10/11)
- if command -v winget.exe &> /dev/null; then
- winget.exe install --id Cloudflare.cloudflared --accept-source-agreements --accept-package-agreements --silent
- else
- log_warn "winget not found. Install cloudflared manually:"
- log_warn " Download from: https://github.com/cloudflare/cloudflared/releases"
- return 0
- fi
- ;;
- esac
-
- # Add ~/.local/bin to PATH for current session
- export PATH="$HOME/.local/bin:$PATH"
-
- if command -v cloudflared &> /dev/null; then
- log_success "cloudflared installed successfully"
+ if [ -x "$HOME/.browser-use/bin/profile-use" ]; then
+ log_success "profile-use installed"
else
- log_warn "cloudflared installation failed. You can install it manually later."
+ log_warn "profile-use installation failed (will auto-download on first use)"
fi
}
-# =============================================================================
-# Install dependencies based on selected modes
-# =============================================================================
-
-install_dependencies() {
- # Install base package (always needed)
- install_browser_use
-
- # Install Chromium only if local mode selected
- if [ "$INSTALL_LOCAL" = true ]; then
- install_chromium
- else
- log_info "Skipping Chromium (remote-only mode)"
- fi
-
- # Install cloudflared only if remote mode selected
- if [ "$INSTALL_REMOTE" = true ]; then
- install_cloudflared
- else
- log_info "Skipping cloudflared (local-only mode)"
- fi
-}
-
-# =============================================================================
-# Write install configuration
-# =============================================================================
-
-write_install_config() {
- # Determine installed modes and default
- local modes=""
- local default_mode=""
-
- if [ "$INSTALL_LOCAL" = true ] && [ "$INSTALL_REMOTE" = true ]; then
- modes='["chromium", "real", "remote"]'
- default_mode="chromium"
- elif [ "$INSTALL_REMOTE" = true ]; then
- modes='["remote"]'
- default_mode="remote"
- else
- modes='["chromium", "real"]'
- default_mode="chromium"
- fi
-
- # Write config file
- mkdir -p "$HOME/.browser-use"
- cat > "$HOME/.browser-use/install-config.json" << EOF
-{
- "installed_modes": $modes,
- "default_mode": "$default_mode"
-}
-EOF
-
- local mode_names=$(echo $modes | tr -d '[]"' | tr ',' ' ')
- log_success "Configured: $mode_names"
-}
-
# =============================================================================
# PATH configuration
# =============================================================================
@@ -637,20 +380,19 @@ configure_path() {
local bin_path=$(get_venv_bin_dir)
local local_bin="$HOME/.local/bin"
- # Detect shell
- if [ -n "$BASH_VERSION" ]; then
- shell_rc="$HOME/.bashrc"
- elif [ -n "$ZSH_VERSION" ]; then
- shell_rc="$HOME/.zshrc"
- else
- shell_rc="$HOME/.profile"
- fi
+ # Detect user's login shell (not the running shell, since this script
+ # is typically executed via "curl ... | bash" which always sets BASH_VERSION)
+ case "$(basename "$SHELL")" in
+ zsh) shell_rc="$HOME/.zshrc" ;;
+ bash) shell_rc="$HOME/.bashrc" ;;
+ *) shell_rc="$HOME/.profile" ;;
+ esac
# Check if already in PATH (browser-use-env matches both /bin and /Scripts)
if grep -q "browser-use-env" "$shell_rc" 2>/dev/null; then
log_info "PATH already configured in $shell_rc"
else
- # Add to shell config (includes ~/.local/bin for cloudflared)
+ # Add to shell config (includes ~/.local/bin for tools)
echo "" >> "$shell_rc"
echo "# Browser-Use" >> "$shell_rc"
echo "export PATH=\"$bin_path:$local_bin:\$PATH\"" >> "$shell_rc"
@@ -689,32 +431,6 @@ configure_powershell_path() {
fi
}
-# =============================================================================
-# Setup wizard
-# =============================================================================
-
-run_setup() {
- log_info "Running setup wizard..."
-
- # Activate venv
- activate_venv
-
- # Determine profile based on mode selections
- local profile="local"
- if [ "$INSTALL_REMOTE" = true ] && [ "$INSTALL_LOCAL" = true ]; then
- profile="full"
- elif [ "$INSTALL_REMOTE" = true ]; then
- profile="remote"
- fi
-
- # Run setup with API key if provided
- if [ -n "$API_KEY" ]; then
- browser-use setup --mode "$profile" --api-key "$API_KEY" --yes
- else
- browser-use setup --mode "$profile" --yes
- fi
-}
-
# =============================================================================
# Validation
# =============================================================================
@@ -738,34 +454,18 @@ validate() {
# =============================================================================
print_next_steps() {
- # Detect shell for source command
- local shell_rc=".bashrc"
- if [ -n "$ZSH_VERSION" ]; then
- shell_rc=".zshrc"
- fi
+ # Detect shell for source command (must match configure_path logic)
+ case "$(basename "$SHELL")" in
+ zsh) local shell_rc=".zshrc" ;;
+ bash) local shell_rc=".bashrc" ;;
+ *) local shell_rc=".profile" ;;
+ esac
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
log_success "Browser-Use installed successfully!"
echo ""
- echo "Installed modes:"
- [ "$INSTALL_LOCAL" = true ] && echo " ✓ Local (chromium, real)"
- [ "$INSTALL_REMOTE" = true ] && echo " ✓ Remote (cloud)"
- echo ""
-
- # Show API key instructions if remote selected but no key provided
- if [ "$INSTALL_REMOTE" = true ] && [ -z "$API_KEY" ]; then
- echo "⚠ API key required for remote mode:"
- if [ "$PLATFORM" = "windows" ]; then
- echo " \$env:BROWSER_USE_API_KEY=\"\""
- else
- echo " export BROWSER_USE_API_KEY="
- fi
- echo ""
- echo " Get your API key at: https://browser-use.com"
- echo ""
- fi
echo "Next steps:"
if [ "$PLATFORM" = "windows" ]; then
@@ -773,13 +473,7 @@ print_next_steps() {
else
echo " 1. Restart your shell or run: source ~/$shell_rc"
fi
-
- if [ "$INSTALL_REMOTE" = true ] && [ -z "$API_KEY" ]; then
- echo " 2. Set your API key (see above)"
- echo " 3. Try: browser-use open https://example.com"
- else
- echo " 2. Try: browser-use open https://example.com"
- fi
+ echo " 2. Try: browser-use open https://example.com"
echo ""
echo "Documentation: https://docs.browser-use.com"
@@ -801,25 +495,13 @@ main() {
# Parse command-line flags
parse_args "$@"
- # Show install mode if flags provided
- if [ "$SKIP_INTERACTIVE" = true ]; then
- if [ "$INSTALL_LOCAL" = true ] && [ "$INSTALL_REMOTE" = true ]; then
- log_info "Install mode: full (local + remote)"
- elif [ "$INSTALL_REMOTE" = true ]; then
- log_info "Install mode: remote-only"
- else
- log_info "Install mode: local-only"
- fi
- echo ""
- fi
-
# Step 1: Detect platform
detect_platform
# Step 2: Check/install Python
if ! check_python; then
# In CI or non-interactive mode (no tty), auto-install Python
- if [ ! -t 0 ] || [ "$SKIP_INTERACTIVE" = true ]; then
+ if [ ! -t 0 ]; then
log_info "Python 3.11+ not found. Installing automatically..."
install_python
else
@@ -837,35 +519,29 @@ main() {
# Step 3: Install uv
install_uv
- # Step 4: Show mode selection TUI (unless skipped via flags)
- if [ "$SKIP_INTERACTIVE" = false ]; then
- show_mode_menu
+ # Step 4: Install browser-use package
+ install_browser_use
+
+ # Step 5: Install Chromium
+ install_chromium
+
+ # Step 6: Install profile-use
+ install_profile_use
+
+ # Step 6.5: Create config.json if it doesn't exist
+ config_file="$HOME/.browser-use/config.json"
+ if [ ! -f "$config_file" ]; then
+ echo '{}' > "$config_file"
+ chmod 600 "$config_file"
fi
- # Default to local-only if nothing selected
- if [ "$INSTALL_LOCAL" = false ] && [ "$INSTALL_REMOTE" = false ]; then
- log_warn "No modes selected, defaulting to local"
- INSTALL_LOCAL=true
- fi
-
- echo ""
-
- # Step 5: Install dependencies
- install_dependencies
-
- # Step 6: Write install config
- write_install_config
-
# Step 7: Configure PATH
configure_path
- # Step 8: Run setup wizard
- run_setup
-
- # Step 9: Validate
+ # Step 8: Validate
validate
- # Step 10: Show next steps
+ # Step 9: Print next steps
print_next_steps
}
diff --git a/browser_use/skill_cli/install_config.py b/browser_use/skill_cli/install_config.py
deleted file mode 100644
index ff2ad4c18..000000000
--- a/browser_use/skill_cli/install_config.py
+++ /dev/null
@@ -1,111 +0,0 @@
-"""Install configuration - tracks which browser modes are available.
-
-This module manages the installation configuration that determines which browser
-modes (chromium, real, remote) are available based on how browser-use was installed.
-
-Config file: ~/.browser-use/install-config.json
-
-When no config file exists (e.g., pip install users), all modes are available by default.
-"""
-
-import json
-from pathlib import Path
-from typing import Literal
-
-CONFIG_PATH = Path.home() / '.browser-use' / 'install-config.json'
-
-ModeType = Literal['chromium', 'real', 'remote']
-
-# Local modes (both require Chromium to be installed)
-LOCAL_MODES: set[str] = {'chromium', 'real'}
-
-
-def get_config() -> dict:
- """Read install config. Returns default if not found.
-
- Default config enables all modes (for pip install users).
- """
- if not CONFIG_PATH.exists():
- return {
- 'installed_modes': ['chromium', 'real', 'remote'],
- 'default_mode': 'chromium',
- }
-
- try:
- return json.loads(CONFIG_PATH.read_text())
- except (json.JSONDecodeError, OSError):
- # Config file corrupt, return default
- return {
- 'installed_modes': ['chromium', 'real', 'remote'],
- 'default_mode': 'chromium',
- }
-
-
-def save_config(installed_modes: list[str], default_mode: str) -> None:
- """Save install config."""
- CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
- CONFIG_PATH.write_text(
- json.dumps(
- {
- 'installed_modes': installed_modes,
- 'default_mode': default_mode,
- },
- indent=2,
- )
- )
-
-
-def is_mode_available(mode: str) -> bool:
- """Check if a browser mode is available based on installation config.
-
- Args:
- mode: The browser mode to check ('chromium', 'real', or 'remote')
-
- Returns:
- True if the mode is available, False otherwise
- """
- config = get_config()
- installed = config.get('installed_modes', [])
-
- # Map 'real' to same category as 'chromium' (both are local)
- # If either local mode is installed, both are available
- if mode in LOCAL_MODES:
- return bool(LOCAL_MODES & set(installed))
-
- return mode in installed
-
-
-def get_default_mode() -> str:
- """Get the default browser mode based on installation config."""
- return get_config().get('default_mode', 'chromium')
-
-
-def get_available_modes() -> list[str]:
- """Get list of available browser modes."""
- return get_config().get('installed_modes', ['chromium', 'real', 'remote'])
-
-
-def get_mode_unavailable_error(mode: str) -> str:
- """Generate a helpful error message when a mode is not available.
-
- Args:
- mode: The unavailable mode that was requested
-
- Returns:
- A formatted error message with instructions for reinstalling
- """
- available = get_available_modes()
-
- if mode in LOCAL_MODES:
- install_flag = '--full'
- mode_desc = 'Local browser mode'
- else:
- install_flag = '--full'
- mode_desc = 'Remote browser mode'
-
- return (
- f"Error: {mode_desc} '{mode}' not installed.\n"
- f'Available modes: {", ".join(available)}\n\n'
- f'To install all modes, reinstall with:\n'
- f' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- {install_flag}'
- )
diff --git a/browser_use/skill_cli/install_lite.sh b/browser_use/skill_cli/install_lite.sh
new file mode 100755
index 000000000..d38cd7330
--- /dev/null
+++ b/browser_use/skill_cli/install_lite.sh
@@ -0,0 +1,510 @@
+#!/usr/bin/env bash
+# Browser-Use Lightweight CLI Installer
+#
+# Installs only the minimal dependencies needed for the CLI (~10 packages
+# instead of ~50). Use this if you only need the browser-use CLI commands
+# and don't need the Python library (Agent, LLM integrations, etc.).
+#
+# Usage:
+#   curl -fsSL https://browser-use.com/cli/install_lite.sh | bash
+#
+# For development testing:
+#   curl -fsSL https://browser-use.com/cli/install_lite.sh | BROWSER_USE_BRANCH=<branch> bash
+#
+# To install the full library instead, use install.sh.
+#
+# =============================================================================
+
+set -e
+
+# =============================================================================
+# Prerequisites
+# =============================================================================
+
+if ! command -v curl &> /dev/null; then
+ echo "Error: curl is required but not installed."
+ echo "Install it and try again."
+ exit 1
+fi
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+BOLD='\033[1m'
+NC='\033[0m' # No Color
+
+# =============================================================================
+# Logging functions
+# =============================================================================
+
+log_info() {
+ echo -e "${BLUE}ℹ${NC} $1"
+}
+
+log_success() {
+ echo -e "${GREEN}✓${NC} $1"
+}
+
+log_warn() {
+ echo -e "${YELLOW}⚠${NC} $1"
+}
+
+log_error() {
+ echo -e "${RED}✗${NC} $1"
+}
+
+# =============================================================================
+# Argument parsing
+# =============================================================================
+
+parse_args() {
+ while [[ $# -gt 0 ]]; do
+ case $1 in
+ --help|-h)
+ echo "Browser-Use Lightweight CLI Installer"
+ echo ""
+ echo "Usage: install_lite.sh [OPTIONS]"
+ echo ""
+ echo "Options:"
+ echo " --help, -h Show this help"
+ echo ""
+ echo "Installs Python 3.11+ (if needed), uv, browser-use CLI (minimal deps), and Chromium."
+ exit 0
+ ;;
+ *)
+ log_warn "Unknown argument: $1 (ignored)"
+ shift
+ ;;
+ esac
+ done
+}
+
+# =============================================================================
+# Platform detection
+# =============================================================================
+
+detect_platform() {
+ local os=$(uname -s | tr '[:upper:]' '[:lower:]')
+ local arch=$(uname -m)
+
+ case "$os" in
+ linux*)
+ PLATFORM="linux"
+ ;;
+ darwin*)
+ PLATFORM="macos"
+ ;;
+ msys*|mingw*|cygwin*)
+ PLATFORM="windows"
+ ;;
+ *)
+ log_error "Unsupported OS: $os"
+ exit 1
+ ;;
+ esac
+
+ log_info "Detected platform: $PLATFORM ($arch)"
+}
+
+# =============================================================================
+# Virtual environment helpers
+# =============================================================================
+
+# Get the correct venv bin directory (Scripts on Windows, bin on Unix)
+get_venv_bin_dir() {
+ if [ "$PLATFORM" = "windows" ]; then
+ echo "$HOME/.browser-use-env/Scripts"
+ else
+ echo "$HOME/.browser-use-env/bin"
+ fi
+}
+
+# Activate the virtual environment (handles Windows vs Unix paths)
+activate_venv() {
+ local venv_bin=$(get_venv_bin_dir)
+ if [ -f "$venv_bin/activate" ]; then
+ source "$venv_bin/activate"
+ else
+ log_error "Virtual environment not found at $venv_bin"
+ exit 1
+ fi
+}
+
+# =============================================================================
+# Python management
+# =============================================================================
+
+check_python() {
+ log_info "Checking Python installation..."
+
+ # Check versioned python commands first (python3.13, python3.12, python3.11)
+ # This handles Ubuntu/Debian where python3 may point to older version
+ # Also check common install locations directly in case PATH isn't updated
+ local py_candidates="python3.13 python3.12 python3.11 python3 python"
+ local py_paths="/usr/bin/python3.11 /usr/local/bin/python3.11"
+
+ for py_cmd in $py_candidates; do
+ if command -v "$py_cmd" &> /dev/null; then
+ local version=$($py_cmd --version 2>&1 | awk '{print $2}')
+ local major=$(echo $version | cut -d. -f1)
+ local minor=$(echo $version | cut -d. -f2)
+
+			if [ "$major" -gt 3 ] || { [ "$major" -eq 3 ] && [ "$minor" -ge 11 ]; }; then
+ PYTHON_CMD="$py_cmd"
+ log_success "Python $version found ($py_cmd)"
+ return 0
+ fi
+ fi
+ done
+
+ # Also check common paths directly (in case command -v doesn't find them)
+ for py_path in $py_paths; do
+ if [ -x "$py_path" ]; then
+ local version=$($py_path --version 2>&1 | awk '{print $2}')
+ local major=$(echo $version | cut -d. -f1)
+ local minor=$(echo $version | cut -d. -f2)
+
+			if [ "$major" -gt 3 ] || { [ "$major" -eq 3 ] && [ "$minor" -ge 11 ]; }; then
+ PYTHON_CMD="$py_path"
+ log_success "Python $version found ($py_path)"
+ return 0
+ fi
+ fi
+ done
+
+ # No suitable Python found
+ if command -v python3 &> /dev/null; then
+ local version=$(python3 --version 2>&1 | awk '{print $2}')
+ log_warn "Python $version found, but 3.11+ required"
+ else
+ log_warn "Python not found"
+ fi
+ return 1
+}
+
+install_python() {
+ log_info "Installing Python 3.11+..."
+
+ # Use sudo only if not root and sudo is available
+ SUDO=""
+ if [ "$(id -u)" -ne 0 ] && command -v sudo &> /dev/null; then
+ SUDO="sudo"
+ fi
+
+ case "$PLATFORM" in
+ macos)
+ if command -v brew &> /dev/null; then
+ brew install python@3.11
+ else
+ log_error "Homebrew not found. Install from: https://brew.sh"
+ exit 1
+ fi
+ ;;
+ linux)
+ if command -v apt-get &> /dev/null; then
+ $SUDO apt-get update
+ $SUDO apt-get install -y python3.11 python3.11-venv python3-pip
+ elif command -v yum &> /dev/null; then
+ $SUDO yum install -y python311 python311-pip
+ else
+ log_error "Unsupported package manager. Install Python 3.11+ manually."
+ exit 1
+ fi
+ ;;
+ windows)
+ log_error "Please install Python 3.11+ from: https://www.python.org/downloads/"
+ exit 1
+ ;;
+ esac
+
+ # Verify installation
+ if check_python; then
+ log_success "Python installed successfully"
+ else
+ log_error "Python installation failed"
+ exit 1
+ fi
+}
+
+# =============================================================================
+# uv package manager
+# =============================================================================
+
+install_uv() {
+ log_info "Installing uv package manager..."
+
+ # Add common uv install locations to PATH for current session
+ # (covers both curl-based and Homebrew installs)
+ export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
+
+ if command -v uv &> /dev/null; then
+ log_success "uv already installed"
+ return 0
+ fi
+
+ # Use official uv installer
+ if ! command -v curl &> /dev/null; then
+ log_error "curl is required but not found. Install curl and try again."
+ exit 1
+ fi
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+
+ if command -v uv &> /dev/null; then
+ log_success "uv installed successfully"
+ else
+ log_error "uv installation failed. Try restarting your shell and run the installer again."
+ exit 1
+ fi
+}
+
+# =============================================================================
+# Browser-Use installation (lightweight - CLI deps only)
+# =============================================================================
+
+install_browser_use() {
+ log_info "Installing browser-use (lightweight CLI)..."
+
+ # Create or use existing virtual environment
+ if [ ! -d "$HOME/.browser-use-env" ]; then
+ # Use discovered Python command (e.g., python3.11) or fall back to version spec
+ if [ -n "$PYTHON_CMD" ]; then
+ uv venv "$HOME/.browser-use-env" --python "$PYTHON_CMD"
+ else
+ uv venv "$HOME/.browser-use-env" --python 3.11
+ fi
+ fi
+
+ # Activate venv and install
+ activate_venv
+
+ # Install from GitHub (main branch by default, or custom branch for testing)
+ BROWSER_USE_BRANCH="${BROWSER_USE_BRANCH:-main}"
+ BROWSER_USE_REPO="${BROWSER_USE_REPO:-browser-use/browser-use}"
+ log_info "Installing from GitHub: $BROWSER_USE_REPO@$BROWSER_USE_BRANCH"
+ # Clone and install the package without its declared dependencies,
+ # then install only the minimal deps the CLI actually needs at runtime.
+ # This avoids pulling ~50 packages (LLM clients, PDF tools, etc.) that
+ # the CLI never imports.
+ local tmp_dir=$(mktemp -d)
+ git clone --depth 1 --branch "$BROWSER_USE_BRANCH" "https://github.com/$BROWSER_USE_REPO.git" "$tmp_dir"
+ uv pip install "$tmp_dir" --no-deps
+
+ # Install only the dependencies the CLI actually needs (~10 packages).
+ # The list lives in requirements-cli.txt so it's discoverable and testable.
+ # Transitive deps (e.g. websockets via cdp-use) are resolved automatically.
+ log_info "Installing minimal CLI dependencies..."
+ uv pip install -r "$tmp_dir/browser_use/skill_cli/requirements-cli.txt"
+
+ rm -rf "$tmp_dir"
+
+ log_success "browser-use CLI installed (lightweight)"
+}
+
+install_chromium() {
+ log_info "Installing Chromium browser..."
+
+ activate_venv
+
+ # Build command - only use --with-deps on Linux (it fails on Windows/macOS)
+ local cmd="uvx playwright install chromium"
+ if [ "$PLATFORM" = "linux" ]; then
+ cmd="$cmd --with-deps"
+ fi
+ cmd="$cmd --no-shell"
+
+ eval $cmd
+
+ log_success "Chromium installed"
+}
+
+install_profile_use() {
+ log_info "Installing profile-use..."
+
+ mkdir -p "$HOME/.browser-use/bin"
+ curl -fsSL https://browser-use.com/profile/cli/install.sh | PROFILE_USE_VERSION=v1.0.2 INSTALL_DIR="$HOME/.browser-use/bin" sh
+
+ if [ -x "$HOME/.browser-use/bin/profile-use" ]; then
+ log_success "profile-use installed"
+ else
+ log_warn "profile-use installation failed (will auto-download on first use)"
+ fi
+}
+
+# =============================================================================
+# PATH configuration
+# =============================================================================
+
+configure_path() {
+ local shell_rc=""
+ local bin_path=$(get_venv_bin_dir)
+ local local_bin="$HOME/.local/bin"
+
+	# Detect the user's login shell rather than the running shell: this
+	# script is typically executed via "curl ... | bash", which always
+	# sets BASH_VERSION regardless of the user's actual shell.
+	case "$(basename "$SHELL")" in
+		zsh) shell_rc="$HOME/.zshrc" ;;
+		bash) shell_rc="$HOME/.bashrc" ;;
+		*) shell_rc="$HOME/.profile" ;;
+	esac
+
+ # Check if already in PATH (browser-use-env matches both /bin and /Scripts)
+ if grep -q "browser-use-env" "$shell_rc" 2>/dev/null; then
+ log_info "PATH already configured in $shell_rc"
+ else
+ # Add to shell config (includes ~/.local/bin for tools)
+ echo "" >> "$shell_rc"
+ echo "# Browser-Use" >> "$shell_rc"
+ echo "export PATH=\"$bin_path:$local_bin:\$PATH\"" >> "$shell_rc"
+ log_success "Added to PATH in $shell_rc"
+ fi
+
+ # On Windows, also configure PowerShell profile
+ if [ "$PLATFORM" = "windows" ]; then
+ configure_powershell_path
+ fi
+}
+
+configure_powershell_path() {
+ # Use PowerShell to modify user PATH in registry (no execution policy needed)
+ # This persists across sessions without requiring profile script execution
+
+ local scripts_path='\\.browser-use-env\\Scripts'
+ local local_bin='\\.local\\bin'
+
+ # Check if already in user PATH
+ local current_path=$(powershell.exe -Command "[Environment]::GetEnvironmentVariable('Path', 'User')" 2>/dev/null | tr -d '\r')
+
+ if echo "$current_path" | grep -q "browser-use-env"; then
+ log_info "PATH already configured"
+ return 0
+ fi
+
+ # Append to user PATH via registry (safe, no truncation, no execution policy needed)
+	# Run the command directly as the "if" condition so a nonzero exit
+	# status does not abort the whole script under "set -e".
+	if powershell.exe -Command "[Environment]::SetEnvironmentVariable('Path', [Environment]::GetEnvironmentVariable('Path', 'User') + ';' + \$env:USERPROFILE + '$scripts_path;' + \$env:USERPROFILE + '$local_bin', 'User')" 2>/dev/null; then
+ log_success "Added to Windows PATH: %USERPROFILE%\\.browser-use-env\\Scripts"
+ else
+ log_warn "Could not update PATH automatically. Add manually:"
+ log_warn " \$env:PATH += \";\$env:USERPROFILE\\.browser-use-env\\Scripts\""
+ fi
+}
+
+# =============================================================================
+# Validation
+# =============================================================================
+
+validate() {
+ log_info "Validating installation..."
+
+ activate_venv
+
+ if browser-use doctor; then
+ log_success "Installation validated successfully!"
+ return 0
+ else
+ log_warn "Some checks failed. Run 'browser-use doctor' for details."
+ return 1
+ fi
+}
+
+# =============================================================================
+# Print completion message
+# =============================================================================
+
+print_next_steps() {
+	case "$(basename "$SHELL")" in
+		zsh) local shell_rc=".zshrc" ;;
+		bash) local shell_rc=".bashrc" ;;
+		*) local shell_rc=".profile" ;;
+	esac
+
+ echo ""
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ echo ""
+ log_success "Browser-Use CLI installed successfully! (lightweight)"
+ echo ""
+
+ echo "Next steps:"
+ if [ "$PLATFORM" = "windows" ]; then
+ echo " 1. Restart PowerShell (PATH is now configured automatically)"
+ else
+ echo " 1. Restart your shell or run: source ~/$shell_rc"
+ fi
+ echo " 2. Try: browser-use open https://example.com"
+ echo ""
+ echo "To install the full library (Agent, LLMs, etc.):"
+ echo " uv pip install browser-use"
+
+ echo ""
+ echo "Documentation: https://docs.browser-use.com"
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ echo ""
+}
+
+# =============================================================================
+# Main installation flow
+# =============================================================================
+
+main() {
+ echo ""
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ echo " Browser-Use Lightweight CLI Installer"
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ echo ""
+
+ # Parse command-line flags
+ parse_args "$@"
+
+ # Step 1: Detect platform
+ detect_platform
+
+ # Step 2: Check/install Python
+ if ! check_python; then
+ # In CI or non-interactive mode (no tty), auto-install Python
+ if [ ! -t 0 ]; then
+ log_info "Python 3.11+ not found. Installing automatically..."
+ install_python
+ else
+ read -p "Python 3.11+ not found. Install now? [y/N] " -n 1 -r < /dev/tty
+ echo
+ if [[ $REPLY =~ ^[Yy]$ ]]; then
+ install_python
+ else
+ log_error "Python 3.11+ required. Exiting."
+ exit 1
+ fi
+ fi
+ fi
+
+ # Step 3: Install uv
+ install_uv
+
+ # Step 4: Install browser-use package (minimal deps only)
+ install_browser_use
+
+ # Step 5: Install Chromium
+ install_chromium
+
+ # Step 6: Install profile-use
+ install_profile_use
+
+ # Step 7: Configure PATH
+ configure_path
+
+ # Step 8: Validate (non-fatal — warnings shouldn't block next-step instructions)
+ validate || true
+
+ # Step 9: Print next steps
+ print_next_steps
+}
+
+# Run main function with all arguments
+main "$@"
diff --git a/browser_use/skill_cli/main.py b/browser_use/skill_cli/main.py
index 0d4b1e84f..cbf4c36a0 100755
--- a/browser_use/skill_cli/main.py
+++ b/browser_use/skill_cli/main.py
@@ -2,25 +2,27 @@
"""Fast CLI for browser-use. STDLIB ONLY - must start in <50ms.
This is the main entry point for the browser-use CLI. It uses only stdlib
-imports to ensure fast startup, delegating heavy operations to the session
-server which loads once and stays running.
+imports to ensure fast startup, delegating heavy operations to the daemon
+which loads once and stays running.
"""
import argparse
import asyncio
-import hashlib
import json
import os
+import re
+import signal
import socket
import subprocess
import sys
import tempfile
import time
+import zlib
from pathlib import Path
# =============================================================================
# Early command interception (before heavy imports)
-# These commands don't need the session server infrastructure
+# These commands don't need the daemon infrastructure
# =============================================================================
# Handle --mcp flag early to prevent logging initialization
@@ -124,161 +126,414 @@ if '--template' in sys.argv:
# Keep --force/-f and --list/-l flags
elif arg in ('--force', '-f', '--list', '-l'):
new_argv.append(arg)
- # Skip other flags (--session, --browser, --headed, etc.)
+ # Skip other flags (--headed, etc.)
i += 1
sys.argv = new_argv
init_main()
sys.exit(0)
+# Handle 'cloud --help' / 'cloud -h' early — argparse intercepts --help before
+# REMAINDER can capture it, so we route to our custom usage printer directly.
+# Only intercept when --help is immediately after 'cloud' (not 'cloud v2 --help').
+if _get_subcommand() == 'cloud':
+ cloud_idx = sys.argv.index('cloud')
+ if cloud_idx + 1 < len(sys.argv) and sys.argv[cloud_idx + 1] in ('--help', '-h'):
+ from browser_use.skill_cli.commands.cloud import handle_cloud_command
+
+ sys.exit(handle_cloud_command(['--help']))
+
# =============================================================================
# Utility functions (inlined to avoid imports)
# =============================================================================
-def get_socket_path(session: str) -> str:
- """Get socket path for session."""
- if sys.platform == 'win32':
- # Use 127.0.0.1 explicitly (not localhost) to avoid IPv6 binding issues
- port = 49152 + (int(hashlib.md5(session.encode()).hexdigest()[:4], 16) % 16383)
- return f'tcp://127.0.0.1:{port}'
- return str(Path(tempfile.gettempdir()) / f'browser-use-{session}.sock')
+def _get_home_dir() -> Path:
+ """Get browser-use home directory.
+
+ Must match utils.get_home_dir().
+ """
+ env = os.environ.get('BROWSER_USE_HOME')
+ if env:
+ d = Path(env).expanduser()
+ else:
+ d = Path.home() / '.browser-use'
+ d.mkdir(parents=True, exist_ok=True)
+ return d
-def get_pid_path(session: str) -> Path:
- """Get PID file path for session."""
- return Path(tempfile.gettempdir()) / f'browser-use-{session}.pid'
+def _get_socket_path(session: str = 'default') -> str:
+ """Get daemon socket path for a session.
-
-def _pid_exists(pid: int) -> bool:
- """Check if a process with given PID exists.
-
- On Windows, uses ctypes to call OpenProcess (os.kill doesn't work reliably).
- On Unix, uses os.kill(pid, 0) which is the standard approach.
+ Must match utils.get_socket_path().
"""
if sys.platform == 'win32':
- import ctypes
-
- PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
- handle = ctypes.windll.kernel32.OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, False, pid)
- if handle:
- ctypes.windll.kernel32.CloseHandle(handle)
- return True
- return False
- else:
- try:
- os.kill(pid, 0)
- return True
- except OSError:
- return False
+ port = 49152 + zlib.adler32(session.encode()) % 16383
+ return f'tcp://127.0.0.1:{port}'
+ return str(_get_home_dir() / f'{session}.sock')
-def is_server_running(session: str) -> bool:
- """Check if server is running for session."""
- pid_path = get_pid_path(session)
- if not pid_path.exists():
- return False
+def _get_pid_path(session: str = 'default') -> Path:
+ """Get PID file path for a session.
+
+ Must match utils.get_pid_path().
+ """
+ return _get_home_dir() / f'{session}.pid'
+
+
+def _read_auth_token(session: str = 'default') -> str:
+ """Read per-session auth token written by the daemon.
+
+ Must match utils.get_auth_token_path().
+ Returns empty string if the token file is missing (pre-auth daemon).
+ """
+ token_path = _get_home_dir() / f'{session}.token'
try:
- pid = int(pid_path.read_text().strip())
- return _pid_exists(pid)
- except (OSError, ValueError):
- # Can't read PID file or invalid PID
- return False
+ return token_path.read_text().strip()
+ except OSError:
+ return ''
-def connect_to_server(session: str, timeout: float = 60.0) -> socket.socket:
- """Connect to session server."""
- sock_path = get_socket_path(session)
+def _connect_to_daemon(timeout: float = 60.0, session: str = 'default') -> socket.socket:
+ """Connect to daemon socket."""
+ sock_path = _get_socket_path(session)
if sock_path.startswith('tcp://'):
- # Windows: TCP connection
_, hostport = sock_path.split('://', 1)
host, port = hostport.split(':')
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- sock.settimeout(timeout)
- sock.connect((host, int(port)))
+ addr: str | tuple[str, int] = (host, int(port))
else:
- # Unix socket
sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ addr = sock_path
+
+ try:
sock.settimeout(timeout)
- sock.connect(sock_path)
+ sock.connect(addr)
+ except Exception:
+ sock.close()
+ raise
return sock
-def get_session_metadata_path(session: str) -> Path:
- """Get path to session metadata file (stores browser_mode, headed, profile)."""
- return Path(tempfile.gettempdir()) / f'browser-use-{session}.meta'
+def _is_pid_alive(pid: int) -> bool:
+ """Check if a process with the given PID exists. Cross-platform."""
+ if sys.platform == 'win32':
+ import ctypes
+
+ _PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
+ handle = ctypes.windll.kernel32.OpenProcess(_PROCESS_QUERY_LIMITED_INFORMATION, False, pid)
+ if handle:
+ ctypes.windll.kernel32.CloseHandle(handle)
+ return True
+ return False
+ try:
+ os.kill(pid, 0)
+ return True
+ except (OSError, ProcessLookupError):
+ return False
-def ensure_server(session: str, browser: str, headed: bool, profile: str | None, api_key: str | None) -> bool:
- """Start server if not running. Returns True if started."""
- from browser_use.skill_cli.utils import is_session_locked, kill_orphaned_server
+def _is_daemon_process(pid: int) -> bool:
+ """Check if the process at PID is a browser-use daemon. Cross-platform."""
+ _marker = 'browser_use.skill_cli.daemon'
+ try:
+ if sys.platform == 'linux':
+ cmdline = Path(f'/proc/{pid}/cmdline').read_bytes().decode(errors='replace')
+ return _marker in cmdline
+ elif sys.platform == 'win32':
+ # Use wmic to get the command line on Windows
+ import subprocess as _sp
- meta_path = get_session_metadata_path(session)
+ result = _sp.run(
+ ['wmic', 'process', 'where', f'ProcessId={pid}', 'get', 'CommandLine', '/format:list'],
+ capture_output=True,
+ text=True,
+ timeout=5,
+ )
+ return _marker in result.stdout
+ else:
+ # macOS and other POSIX
+ import subprocess as _sp
- # Check if server is already running AND holding its lock (healthy server)
- if is_server_running(session) and is_session_locked(session):
+ result = _sp.run(['ps', '-p', str(pid), '-o', 'command='], capture_output=True, text=True, timeout=5)
+ return _marker in result.stdout
+ except Exception:
+ return False
+
+
+def _terminate_pid(pid: int) -> bool:
+ """Best-effort terminate a process. Returns True if confirmed dead.
+
+ POSIX: SIGTERM, poll 5s, escalate to SIGKILL.
+ Windows: TerminateProcess (hard kill, skips all daemon cleanup).
+ """
+ if sys.platform == 'win32':
+ import ctypes
+
+ _PROCESS_TERMINATE = 0x0001
+ handle = ctypes.windll.kernel32.OpenProcess(_PROCESS_TERMINATE, False, pid)
+ if handle:
+ ctypes.windll.kernel32.TerminateProcess(handle, 1)
+ ctypes.windll.kernel32.CloseHandle(handle)
+ return not _is_pid_alive(pid)
+
+ try:
+ os.kill(pid, signal.SIGTERM)
+ except (OSError, ProcessLookupError):
+ return True
+
+ # Poll for exit
+ for _ in range(50): # 5s at 100ms intervals
+ time.sleep(0.1)
+ if not _is_pid_alive(pid):
+ return True
+
+ # Escalate to SIGKILL
+ try:
+ os.kill(pid, signal.SIGKILL)
+ except (OSError, ProcessLookupError):
+ return True
+ time.sleep(0.2)
+ return not _is_pid_alive(pid)
+
+
+def _read_session_state(session: str) -> dict | None:
+ """Read session state file. Returns None if missing or corrupt."""
+ state_path = _get_home_dir() / f'{session}.state.json'
+ if not state_path.exists():
+ return None
+ try:
+ return json.loads(state_path.read_text())
+ except (json.JSONDecodeError, OSError):
+ return None
+
+
+def _get_state_path(session: str) -> Path:
+ return _get_home_dir() / f'{session}.state.json'
+
+
+class _SessionProbe:
+ """Snapshot of a session's health. Never deletes anything — callers decide cleanup."""
+
+ __slots__ = ('name', 'phase', 'updated_at', 'pid', 'pid_alive', 'socket_reachable', 'socket_pid')
+
+ def __init__(
+ self,
+ name: str,
+ phase: str | None = None,
+ updated_at: float | None = None,
+ pid: int | None = None,
+ pid_alive: bool = False,
+ socket_reachable: bool = False,
+ socket_pid: int | None = None,
+ ):
+ self.name = name
+ self.phase = phase
+ self.updated_at = updated_at
+ self.pid = pid
+ self.pid_alive = pid_alive
+ self.socket_reachable = socket_reachable
+ self.socket_pid = socket_pid
+
+
+def _probe_session(session: str) -> _SessionProbe:
+ """Non-destructive probe of a session's state. Never deletes files."""
+ probe = _SessionProbe(name=session)
+
+ # 1. Read state file
+ state = _read_session_state(session)
+ state_pid: int | None = None
+ if state:
+ probe.phase = state.get('phase')
+ probe.updated_at = state.get('updated_at')
+ state_pid = state.get('pid')
+
+ # 2. Read PID file
+ pid_file_pid: int | None = None
+ pid_path = _get_pid_path(session)
+ if pid_path.exists():
try:
- sock = connect_to_server(session, timeout=0.5) # Increased from 0.1s
- sock.close()
+ pid_file_pid = int(pid_path.read_text().strip())
+ except (OSError, ValueError):
+ pass
- # Check browser mode matches existing session
- if meta_path.exists():
- try:
- meta = json.loads(meta_path.read_text())
- existing_mode = meta.get('browser_mode', 'chromium')
- if existing_mode != browser:
- # Only error if user explicitly requested 'remote' but session is local
- # This prevents losing cloud features (live_url, etc.)
- # The reverse case (requesting local but having remote) is fine -
- # user still gets a working browser, just with more features
- if browser == 'remote' and existing_mode != 'remote':
- print(
- f"Error: Session '{session}' is running with --browser {existing_mode}, "
- f'but --browser remote was requested.\n\n'
- f'Cloud browser features (live_url) require a remote session.\n\n'
- f'Options:\n'
- f' 1. Close and restart: browser-use close && browser-use --browser remote open \n'
- f' 2. Use different session: browser-use --browser remote --session other \n'
- f' 3. Use existing local browser: browser-use --browser {existing_mode} ',
- file=sys.stderr,
- )
- sys.exit(1)
- except (json.JSONDecodeError, OSError):
- pass # Metadata file corrupt, ignore
-
- return False # Already running with correct mode
+ # 3. Try socket connect + ping for PID (before reconciliation)
+ try:
+ sock = _connect_to_daemon(timeout=0.5, session=session)
+ sock.close()
+ probe.socket_reachable = True
+ try:
+ resp = send_command('ping', {}, session=session)
+ if resp.get('success'):
+ probe.socket_pid = resp.get('data', {}).get('pid')
except Exception:
- pass # Server not responsive, continue to restart logic
+ pass
+ except OSError:
+ probe.socket_reachable = False
- # Kill any orphaned server (has PID file but no lock)
- kill_orphaned_server(session)
+ # 4. Reconcile PIDs
+ state_alive = bool(state_pid and _is_pid_alive(state_pid))
+ pidfile_alive = bool(pid_file_pid and _is_pid_alive(pid_file_pid))
- # Build server command
+ if state_alive and pidfile_alive and state_pid != pid_file_pid:
+ # Split-brain: both PIDs alive but different.
+ # Use socket_pid to break the tie.
+ if probe.socket_pid == state_pid:
+ probe.pid = state_pid
+ elif probe.socket_pid == pid_file_pid:
+ probe.pid = pid_file_pid
+ else:
+ # Socket unreachable or answers with unknown PID — can't resolve
+ probe.pid = pid_file_pid # .pid file is written later, so prefer it
+ probe.pid_alive = True
+ elif state_alive:
+ probe.pid = state_pid
+ probe.pid_alive = True
+ elif pidfile_alive:
+ probe.pid = pid_file_pid
+ probe.pid_alive = True
+ else:
+ probe.pid = state_pid or pid_file_pid
+ probe.pid_alive = False
+
+ return probe
+
+
+def _clean_session_files(session: str) -> None:
+ """Remove all files for a session (state, PID, socket)."""
+ _get_state_path(session).unlink(missing_ok=True)
+ _get_pid_path(session).unlink(missing_ok=True)
+ sock_path = _get_socket_path(session)
+ if not sock_path.startswith('tcp://'):
+ Path(sock_path).unlink(missing_ok=True)
+
+
+def _is_daemon_alive(session: str = 'default') -> bool:
+ """Check if daemon is alive by socket reachability."""
+ return _probe_session(session).socket_reachable
+
+
+def ensure_daemon(
+ headed: bool,
+ profile: str | None,
+ cdp_url: str | None = None,
+ *,
+ session: str = 'default',
+ explicit_config: bool = False,
+ use_cloud: bool = False,
+ cloud_profile_id: str | None = None,
+ cloud_proxy_country_code: str | None = None,
+ cloud_timeout: int | None = None,
+) -> None:
+ """Start daemon if not running. Uses state file for phase-aware decisions."""
+ probe = _probe_session(session)
+
+ # Socket reachable — daemon is alive and responding
+ if probe.socket_reachable:
+ if not explicit_config:
+ return # Reuse it
+
+ # User explicitly set --headed/--profile/--cdp-url — check config matches
+ try:
+ response = send_command('ping', {}, session=session)
+ if response.get('success'):
+ data = response.get('data', {})
+ if (
+ data.get('headed') == headed
+ and data.get('profile') == profile
+ and data.get('cdp_url') == cdp_url
+ and data.get('use_cloud') == use_cloud
+ ):
+ return # Already running with correct config
+
+ # Config mismatch — error, don't auto-restart (avoids orphan cascades)
+ print(
+ f'Error: Session {session!r} is already running with different config.\n'
+ f'Run `browser-use{" --session " + session if session != "default" else ""} close` first.',
+ file=sys.stderr,
+ )
+ sys.exit(1)
+ return # Ping returned failure — daemon alive but can't verify config, reuse it
+ except Exception:
+ return # Daemon alive but not responsive — reuse it, can't safely restart
+
+ # Socket unreachable but process alive — phase-aware decisions
+ if probe.pid_alive and probe.phase:
+ now = time.time()
+ age = now - probe.updated_at if probe.updated_at else float('inf')
+
+ if probe.phase == 'initializing' and age < 15:
+ # Daemon is booting, wait for socket
+ for _ in range(30):
+ time.sleep(0.5)
+ if _is_daemon_alive(session):
+ return
+ # Still not reachable — fall through to error
+
+ elif probe.phase in ('starting', 'ready', 'running') and age < 60:
+ # Daemon is alive but socket broke, or starting browser
+ print(
+ f'Error: Session {session!r} is alive (phase={probe.phase}) but socket unreachable.\n'
+ f'Run `browser-use{" --session " + session if session != "default" else ""} close` first.',
+ file=sys.stderr,
+ )
+ sys.exit(1)
+
+ elif probe.phase == 'shutting_down' and age < 15:
+ # Daemon is shutting down, wait for it to finish
+ for _ in range(30):
+ time.sleep(0.5)
+ if not probe.pid or not _is_pid_alive(probe.pid):
+ break
+ # Fall through to spawn
+
+ # Stale phase — daemon stuck or crashed without terminal state
+ elif probe.pid and _is_daemon_process(probe.pid):
+ _terminate_pid(probe.pid)
+
+ # Clean up stale files before spawning
+ _clean_session_files(session)
+
+ # Build daemon command
cmd = [
sys.executable,
'-m',
- 'browser_use.skill_cli.server',
+ 'browser_use.skill_cli.daemon',
'--session',
session,
- '--browser',
- browser,
]
if headed:
cmd.append('--headed')
if profile:
cmd.extend(['--profile', profile])
+ if cdp_url:
+ cmd.extend(['--cdp-url', cdp_url])
+ if use_cloud:
+ cmd.append('--use-cloud')
+ if cloud_profile_id is not None:
+ cmd.extend(['--cloud-profile-id', cloud_profile_id])
+ if cloud_proxy_country_code is not None:
+ cmd.extend(['--cloud-proxy-country', cloud_proxy_country_code])
+ if cloud_timeout is not None:
+ cmd.extend(['--cloud-timeout', str(cloud_timeout)])
# Set up environment
env = os.environ.copy()
- if api_key:
- env['BROWSER_USE_API_KEY'] = api_key
- # Start server as background process
+ # For cloud mode, inject API key from config.json into daemon env.
+ # The library's CloudBrowserClient reads BROWSER_USE_API_KEY env var directly,
+ # so we inject it to prevent fallback to ~/.config/browseruse/cloud_auth.json.
+ if use_cloud:
+ from browser_use.skill_cli.config import get_config_value
+
+ cli_api_key = get_config_value('api_key')
+ if cli_api_key:
+ env['BROWSER_USE_API_KEY'] = str(cli_api_key)
+
+ # Start daemon as background process
if sys.platform == 'win32':
- # Windows: CREATE_NO_WINDOW prevents console window from appearing
- # CREATE_NEW_PROCESS_GROUP allows the process to survive parent exit
subprocess.Popen(
cmd,
env=env,
@@ -287,7 +542,6 @@ def ensure_server(session: str, browser: str, headed: bool, profile: str | None,
stderr=subprocess.DEVNULL,
)
else:
- # Unix: use start_new_session
subprocess.Popen(
cmd,
env=env,
@@ -296,43 +550,36 @@ def ensure_server(session: str, browser: str, headed: bool, profile: str | None,
stderr=subprocess.DEVNULL,
)
- # Wait for server to be ready (must have PID, lock, and responsive socket)
- for _ in range(100): # 5 seconds max
- if is_server_running(session) and is_session_locked(session):
- try:
- sock = connect_to_server(session, timeout=0.5)
- sock.close()
+ # Wait for daemon to be ready — use state file for phase-aware waiting
+ deadline = time.time() + 15
+ while time.time() < deadline:
+ probe = _probe_session(session)
+ if probe.socket_reachable:
+ return
+ # Daemon wrote state and PID is alive — still booting, keep waiting
+ if probe.pid_alive and probe.phase in ('initializing', 'ready', 'starting', 'running'):
+ time.sleep(0.2)
+ continue
+ # Daemon wrote terminal state — startup failed
+ if probe.phase in ('failed', 'stopped'):
+ break
+ time.sleep(0.2)
- # Write metadata file to track session config
- meta_path.write_text(
- json.dumps(
- {
- 'browser_mode': browser,
- 'headed': headed,
- 'profile': profile,
- }
- )
- )
-
- return True
- except Exception:
- pass
- time.sleep(0.05)
-
- print('Error: Failed to start session server', file=sys.stderr)
+ print('Error: Failed to start daemon', file=sys.stderr)
sys.exit(1)
-def send_command(session: str, action: str, params: dict) -> dict:
- """Send command to server and get response."""
+def send_command(action: str, params: dict, *, session: str = 'default', agent_id: str = '__shared__') -> dict:
+ """Send command to daemon and get response."""
request = {
'id': f'r{int(time.time() * 1000000) % 1000000}',
'action': action,
- 'session': session,
'params': params,
+ 'agent_id': agent_id,
+ 'token': _read_auth_token(session),
}
- sock = connect_to_server(session)
+ sock = _connect_to_daemon(session=session)
try:
# Send request
sock.sendall((json.dumps(request) + '\n').encode())
@@ -346,7 +593,7 @@ def send_command(session: str, action: str, params: dict) -> dict:
data += chunk
if not data:
- return {'id': request['id'], 'success': False, 'error': 'No response from server'}
+ return {'id': request['id'], 'success': False, 'error': 'No response from daemon'}
return json.loads(data.decode())
finally:
@@ -360,55 +607,20 @@ def send_command(session: str, action: str, params: dict) -> dict:
def build_parser() -> argparse.ArgumentParser:
"""Build argument parser with all commands."""
- # Import install config to get available modes and default
- from browser_use.skill_cli.install_config import get_available_modes, get_default_mode
-
- available_modes = get_available_modes()
- default_mode = get_default_mode()
-
- # Build epilog dynamically based on available modes
+ # Build epilog
epilog_parts = []
- if 'chromium' in available_modes or 'real' in available_modes:
- epilog_parts.append("""Local Mode (default):
- browser-use run "Fill the form" # Uses local browser + your API keys
- browser-use run "task" --llm gpt-4o # Specify model (requires API key)
- browser-use open https://example.com""")
-
- if 'remote' in available_modes:
- if 'chromium' in available_modes:
- # Full install - show how to switch to remote
- epilog_parts.append("""
-Remote Mode (--browser remote):
- browser-use -b remote run "task" # Cloud execution (US proxy default)
- browser-use -b remote run "task" --llm gpt-4o # Specify cloud model
- browser-use -b remote --profile run "task" # Use cloud profile
- browser-use -b remote run "task" --proxy-country gb # UK proxy
- browser-use -b remote run "task" --session-id # Reuse session
- browser-use -b remote run "task" --wait # Wait for completion
-
-Task Management:
- browser-use task list # List recent cloud tasks
- browser-use task status # Check task status
- browser-use task stop # Stop running task""")
- else:
- # Remote-only install
- epilog_parts.append("""
-Examples:
- browser-use run "task" # Cloud execution (US proxy default)
- browser-use run "task" --llm gpt-4o # Specify model
- browser-use --profile run "task" # Use cloud profile
- browser-use run "task" --proxy-country gb # UK proxy
- browser-use run "task" --session-id # Reuse existing session
- browser-use run "task" --wait # Wait for completion
-
-Task Management:
- browser-use task list # List recent cloud tasks
- browser-use task status # Check task status
- browser-use task stop # Stop running task""")
+ epilog_parts.append("""Cloud API:
+ browser-use cloud login # Save API key
+ browser-use cloud connect # Provision cloud browser
+ browser-use cloud v2 GET /browsers # List browsers
+ browser-use cloud v2 POST /tasks '{...}' # Create task
+ browser-use cloud v2 poll # Poll task until done
+ browser-use cloud v2 --help # Show API endpoints""")
epilog_parts.append("""
Setup:
+ browser-use open https://example.com # Navigate to URL
browser-use install # Install Chromium browser
browser-use init # Generate template file""")
@@ -420,18 +632,27 @@ Setup:
)
# Global flags
- parser.add_argument('--session', '-s', default='default', help='Session name (default: default)')
- parser.add_argument(
- '--browser',
- '-b',
- choices=available_modes,
- default=default_mode,
- help=f'Browser mode (available: {", ".join(available_modes)})',
- )
parser.add_argument('--headed', action='store_true', help='Show browser window')
- parser.add_argument('--profile', help='Browser profile (local name or cloud ID)')
+ parser.add_argument(
+ '--profile',
+ nargs='?',
+ const='Default',
+ default=None,
+ help='Use real Chrome with profile (bare --profile uses "Default")',
+ )
+ parser.add_argument(
+ '--cdp-url',
+ default=None,
+ help='Connect to existing browser via CDP URL (http:// or ws://)',
+ )
+ parser.add_argument(
+ '--connect',
+ action='store_true',
+ default=False,
+ help='(Deprecated) Use "browser-use connect" instead',
+ )
+ parser.add_argument('--session', default=None, help='Session name (default: "default")')
parser.add_argument('--json', action='store_true', help='Output as JSON')
- parser.add_argument('--api-key', help='Browser-Use API key')
parser.add_argument('--mcp', action='store_true', help='Run as MCP server (JSON-RPC via stdin/stdout)')
parser.add_argument('--template', help='Generate template file (use with --output for custom path)')
@@ -444,6 +665,8 @@ Setup:
# install
subparsers.add_parser('install', help='Install Chromium browser + system dependencies')
+ # register
+
# init
p = subparsers.add_parser('init', help='Generate browser-use template file')
p.add_argument('--template', '-t', help='Template name (interactive if not specified)')
@@ -453,13 +676,26 @@ Setup:
# setup
p = subparsers.add_parser('setup', help='Configure browser-use for first-time use')
- p.add_argument('--mode', choices=['local', 'remote', 'full'], default='local', help='Setup mode (local/remote/full)')
- p.add_argument('--api-key', help='Browser-Use API key')
p.add_argument('--yes', '-y', action='store_true', help='Skip interactive prompts')
# doctor
subparsers.add_parser('doctor', help='Check browser-use installation and dependencies')
+ # connect (to local Chrome)
+ subparsers.add_parser('connect', help='Connect to running Chrome via CDP')
+
+ # config
+ config_p = subparsers.add_parser('config', help='Manage CLI configuration')
+ config_sub = config_p.add_subparsers(dest='config_command')
+ p = config_sub.add_parser('set', help='Set a config value')
+ p.add_argument('key', help='Config key')
+ p.add_argument('value', help='Config value')
+ p = config_sub.add_parser('get', help='Get a config value')
+ p.add_argument('key', help='Config key')
+ config_sub.add_parser('list', help='List all config values')
+ p = config_sub.add_parser('unset', help='Remove a config value')
+ p.add_argument('key', help='Config key')
+
# -------------------------------------------------------------------------
# Browser Control Commands
# -------------------------------------------------------------------------
@@ -468,16 +704,16 @@ Setup:
p = subparsers.add_parser('open', help='Navigate to URL')
p.add_argument('url', help='URL to navigate to')
- # click
- p = subparsers.add_parser('click', help='Click element by index')
- p.add_argument('index', type=int, help='Element index from state')
+ # click OR click
+ p = subparsers.add_parser('click', help='Click element by index or coordinates (x y)')
+ p.add_argument('args', nargs='+', type=int, help='Element index OR x y coordinates')
# type
p = subparsers.add_parser('type', help='Type text')
p.add_argument('text', help='Text to type')
# input
- p = subparsers.add_parser('input', help='Type text into specific element')
+ p = subparsers.add_parser('input', help='Clear-then-type into specific element; pass "" to clear only')
p.add_argument('index', type=int, help='Element index')
p.add_argument('text', help='Text to type')
@@ -497,13 +733,20 @@ Setup:
# state
subparsers.add_parser('state', help='Get browser state (URL, title, elements)')
- # switch
- p = subparsers.add_parser('switch', help='Switch to tab')
+ # tab (list, switch, close)
+ tab_p = subparsers.add_parser('tab', help='Tab management (list, switch, close)')
+ tab_sub = tab_p.add_subparsers(dest='tab_command')
+
+ tab_sub.add_parser('list', help='List all tabs with lock status')
+
+ p = tab_sub.add_parser('new', help='Open a new blank tab')
+ p.add_argument('url', nargs='?', default='about:blank', help='URL to open (default: about:blank)')
+
+ p = tab_sub.add_parser('switch', help='Switch to tab')
p.add_argument('tab', type=int, help='Tab index')
- # close-tab [tab]
- p = subparsers.add_parser('close-tab', help='Close tab')
- p.add_argument('tab', type=int, nargs='?', help='Tab index (current if not specified)')
+ p = tab_sub.add_parser('close', help='Close tab(s)')
+ p.add_argument('tabs', type=int, nargs='*', help='Tab indices to close (current if none)')
# keys
p = subparsers.add_parser('keys', help='Send keyboard keys')
@@ -514,6 +757,11 @@ Setup:
p.add_argument('index', type=int, help='Element index')
p.add_argument('value', help='Value to select')
+ # upload
+ p = subparsers.add_parser('upload', help='Upload file to file input element')
+ p.add_argument('index', type=int, help='Element index of file input')
+ p.add_argument('path', help='Path to file to upload')
+
# eval
p = subparsers.add_parser('eval', help='Execute JavaScript')
p.add_argument('js', help='JavaScript code to execute')
@@ -627,127 +875,6 @@ Setup:
p.add_argument('--reset', action='store_true', help='Reset Python namespace')
p.add_argument('--vars', action='store_true', help='Show defined variables')
- # -------------------------------------------------------------------------
- # Agent Tasks
- # -------------------------------------------------------------------------
-
- from browser_use.skill_cli.install_config import is_mode_available
-
- remote_available = is_mode_available('remote')
- local_available = is_mode_available('chromium')
-
- p = subparsers.add_parser('run', help='Run agent task (requires API key)')
- p.add_argument('task', help='Task description')
- p.add_argument('--max-steps', type=int, help='Maximum steps')
- # Model selection (works both locally and remotely)
- p.add_argument('--llm', help='LLM model (gpt-4o, claude-sonnet-4-20250514, gemini-2.0-flash)')
-
- # Cloud-only flags - only show if remote mode is available
- if remote_available:
- # Add [remote] hint only if both modes are available (--full install)
- remote_hint = '[remote] ' if local_available else ''
- p.add_argument('--session-id', help=f'{remote_hint}Reuse existing cloud session ID')
- p.add_argument('--proxy-country', help=f'{remote_hint}Proxy country code')
- p.add_argument('--stream', action='store_true', help=f'{remote_hint}Stream output in real-time')
- p.add_argument('--wait', action='store_true', help=f'{remote_hint}Wait for task to complete (default: async)')
- p.add_argument('--flash', action='store_true', help=f'{remote_hint}Enable flash mode')
- p.add_argument('--keep-alive', action='store_true', help=f'{remote_hint}Keep session alive after task')
- p.add_argument('--thinking', action='store_true', help=f'{remote_hint}Enable extended reasoning')
- p.add_argument('--vision', action='store_true', default=None, help=f'{remote_hint}Enable vision')
- p.add_argument('--no-vision', action='store_true', help=f'{remote_hint}Disable vision')
- # New SDK features
- p.add_argument('--start-url', help=f'{remote_hint}URL to start the task from')
- p.add_argument('--metadata', action='append', metavar='KEY=VALUE', help=f'{remote_hint}Task metadata (can repeat)')
- p.add_argument('--secret', action='append', metavar='KEY=VALUE', help=f'{remote_hint}Task secrets (can repeat)')
- p.add_argument(
- '--allowed-domain',
- action='append',
- metavar='DOMAIN',
- help=f'{remote_hint}Restrict navigation to domains (can repeat)',
- )
- p.add_argument('--skill-id', action='append', metavar='ID', help=f'{remote_hint}Enable skill IDs (can repeat)')
- p.add_argument('--structured-output', metavar='SCHEMA', help=f'{remote_hint}JSON schema for structured output')
- p.add_argument('--judge', action='store_true', help=f'{remote_hint}Enable judge mode')
- p.add_argument('--judge-ground-truth', metavar='TEXT', help=f'{remote_hint}Expected answer for judge evaluation')
-
- # -------------------------------------------------------------------------
- # Task Management (Cloud) - only available if remote mode is installed
- # -------------------------------------------------------------------------
-
- if remote_available:
- task_p = subparsers.add_parser('task', help='Manage cloud tasks')
- task_sub = task_p.add_subparsers(dest='task_command')
-
- # task list
- p = task_sub.add_parser('list', help='List recent tasks')
- p.add_argument('--limit', type=int, default=10, help='Maximum number of tasks to list')
- p.add_argument('--status', choices=['running', 'finished', 'stopped', 'failed'], help='Filter by status')
- p.add_argument('--session', help='Filter by session ID')
- p.add_argument('--json', action='store_true', help='Output as JSON')
-
- # task status
- p = task_sub.add_parser('status', help='Get task status')
- p.add_argument('task_id', help='Task ID')
- p.add_argument('--compact', '-c', action='store_true', help='Show all steps with reasoning')
- p.add_argument('--verbose', '-v', action='store_true', help='Show all steps with full details (URLs, actions)')
- p.add_argument('--last', '-n', type=int, metavar='N', help='Show only the last N steps')
- p.add_argument('--reverse', '-r', action='store_true', help='Show steps newest first (100, 99, 98...)')
- p.add_argument('--step', '-s', type=int, metavar='N', help='Show specific step number')
- p.add_argument('--json', action='store_true', help='Output as JSON')
-
- # task stop
- p = task_sub.add_parser('stop', help='Stop running task')
- p.add_argument('task_id', help='Task ID')
- p.add_argument('--json', action='store_true', help='Output as JSON')
-
- # task logs
- p = task_sub.add_parser('logs', help='Get task logs')
- p.add_argument('task_id', help='Task ID')
- p.add_argument('--json', action='store_true', help='Output as JSON')
-
- # -------------------------------------------------------------------------
- # Cloud Session Management - only available if remote mode is installed
- # -------------------------------------------------------------------------
-
- if remote_available:
- session_p = subparsers.add_parser('session', help='Manage cloud sessions')
- session_sub = session_p.add_subparsers(dest='session_command')
-
- # session list
- p = session_sub.add_parser('list', help='List cloud sessions')
- p.add_argument('--limit', type=int, default=10, help='Maximum number of sessions to list')
- p.add_argument('--status', choices=['active', 'stopped'], help='Filter by status')
- p.add_argument('--json', action='store_true', help='Output as JSON')
-
- # session get
- p = session_sub.add_parser('get', help='Get session details')
- p.add_argument('session_id', help='Session ID')
- p.add_argument('--json', action='store_true', help='Output as JSON')
-
- # session stop or session stop --all
- p = session_sub.add_parser('stop', help='Stop cloud session(s)')
- p.add_argument('session_id', nargs='?', help='Session ID (or use --all)')
- p.add_argument('--all', action='store_true', help='Stop all active sessions')
- p.add_argument('--json', action='store_true', help='Output as JSON')
-
- # session create - Create session without task
- p = session_sub.add_parser('create', help='Create a new cloud session')
- p.add_argument('--profile', help='Cloud profile ID')
- p.add_argument('--proxy-country', help='Proxy country code')
- p.add_argument('--start-url', help='Initial URL to navigate to')
- p.add_argument('--screen-size', metavar='WxH', help='Screen size (e.g., 1920x1080)')
- p.add_argument('--keep-alive', action='store_true', default=None, help='Keep session alive')
- p.add_argument('--no-keep-alive', dest='keep_alive', action='store_false', help='Do not keep session alive')
- p.add_argument('--persist-memory', action='store_true', default=None, help='Persist memory between tasks')
- p.add_argument('--no-persist-memory', dest='persist_memory', action='store_false', help='Do not persist memory')
- p.add_argument('--json', action='store_true', help='Output as JSON')
-
- # session share - Create or delete public share
- p = session_sub.add_parser('share', help='Manage public share URL')
- p.add_argument('session_id', help='Session ID')
- p.add_argument('--delete', action='store_true', help='Delete the public share')
- p.add_argument('--json', action='store_true', help='Output as JSON')
-
# -------------------------------------------------------------------------
# Tunnel Commands
# -------------------------------------------------------------------------
@@ -766,190 +893,277 @@ Setup:
# Session Management
# -------------------------------------------------------------------------
- # sessions
- subparsers.add_parser('sessions', help='List active sessions')
-
# close
- p = subparsers.add_parser('close', help='Close session')
- p.add_argument('--all', action='store_true', help='Close all sessions')
+ close_p = subparsers.add_parser('close', help='Close browser and stop daemon')
+ close_p.add_argument('--all', action='store_true', help='Close all sessions')
+
+ # sessions
+ subparsers.add_parser('sessions', help='List active browser sessions')
# -------------------------------------------------------------------------
- # Server Control
+ # Cloud API (Generic REST passthrough)
# -------------------------------------------------------------------------
- server_p = subparsers.add_parser('server', help='Server control')
- server_sub = server_p.add_subparsers(dest='server_command')
- server_sub.add_parser('status', help='Check server status')
- server_sub.add_parser('stop', help='Stop server')
- server_sub.add_parser('logs', help='View server logs')
+ cloud_p = subparsers.add_parser('cloud', help='Browser-Use Cloud API')
+ cloud_p.add_argument('cloud_args', nargs=argparse.REMAINDER, help='cloud subcommand args')
# -------------------------------------------------------------------------
- # Profile Management (mode-aware: use -b real or -b remote)
+ # Profile Management
# -------------------------------------------------------------------------
- profile_p = subparsers.add_parser('profile', help='Manage browser profiles (use -b real or -b remote)')
- profile_sub = profile_p.add_subparsers(dest='profile_command')
-
- # profile list - lists local or cloud profiles based on -b flag
- p = profile_sub.add_parser('list', help='List profiles (local with -b real, cloud with -b remote)')
- p.add_argument('--page', type=int, default=1, help='Page number (cloud only)')
- p.add_argument('--page-size', type=int, default=20, help='Items per page (cloud only)')
-
- # profile get
- p = profile_sub.add_parser('get', help='Get profile details')
- p.add_argument('id', help='Profile ID or name')
-
- # profile create (cloud only)
- p = profile_sub.add_parser('create', help='Create profile (cloud only)')
- p.add_argument('--name', help='Profile name')
-
- # profile update (cloud only)
- p = profile_sub.add_parser('update', help='Update profile (cloud only)')
- p.add_argument('id', help='Profile ID')
- p.add_argument('--name', required=True, help='New profile name')
-
- # profile delete (cloud only)
- p = profile_sub.add_parser('delete', help='Delete profile (cloud only)')
- p.add_argument('id', help='Profile ID')
-
- # profile cookies - list cookies by domain (local only)
- p = profile_sub.add_parser('cookies', help='List cookies by domain (local only, requires -b real)')
- p.add_argument('id', help='Profile ID or name (e.g. "Default", "Profile 1")')
-
- # profile sync - sync local profile to cloud
- p = profile_sub.add_parser('sync', help='Sync local Chrome profile to cloud')
- p.add_argument('--from', dest='from_profile', help='Local profile name (e.g. "Default", "Profile 1")')
- p.add_argument('--name', help='Cloud profile name (default: auto-generated)')
- p.add_argument('--domain', help='Only sync cookies for this domain (e.g. "youtube.com")')
+ profile_p = subparsers.add_parser('profile', help='Manage browser profiles (profile-use)')
+ profile_p.add_argument('profile_args', nargs=argparse.REMAINDER, help='profile-use arguments')
return parser
-def handle_server_command(args: argparse.Namespace) -> int:
- """Handle server subcommands."""
- if args.server_command == 'status':
- if is_server_running(args.session):
- print(f'Server for session "{args.session}" is running')
- return 0
- else:
- print(f'Server for session "{args.session}" is not running')
- return 1
+def _handle_cloud_connect(cloud_args: list[str], args: argparse.Namespace, session: str) -> int:
+ """Handle `browser-use cloud connect` — zero-config cloud browser provisioning."""
+ # Mutual exclusivity checks
+ if getattr(args, 'connect', False):
+ print('Error: --connect and cloud connect are mutually exclusive', file=sys.stderr)
+ return 1
+ if args.cdp_url:
+ print('Error: --cdp-url and cloud connect are mutually exclusive', file=sys.stderr)
+ return 1
+ if args.profile:
+ print('Error: --profile and cloud connect are mutually exclusive', file=sys.stderr)
+ return 1
- elif args.server_command == 'stop':
- if not is_server_running(args.session):
- print(f'Server for session "{args.session}" is not running')
- return 0
- response = send_command(args.session, 'shutdown', {})
+ # Validate API key exists before spawning daemon (shows our CLI error, not library's)
+ from browser_use.skill_cli.commands.cloud import (
+ _get_api_key,
+ _get_cloud_connect_proxy,
+ _get_cloud_connect_timeout,
+ _get_or_create_cloud_profile,
+ )
+
+ _get_api_key() # exits with helpful message if no key
+
+ cloud_profile_id = _get_or_create_cloud_profile()
+
+ # Start daemon with cloud config
+ if not args.json:
+ print('Connecting...', end='', flush=True)
+ ensure_daemon(
+ args.headed,
+ None,
+ session=session,
+ explicit_config=True,
+ use_cloud=True,
+ cloud_profile_id=cloud_profile_id,
+ cloud_proxy_country_code=_get_cloud_connect_proxy(),
+ cloud_timeout=_get_cloud_connect_timeout(),
+ )
+
+ # Send connect command to force immediate session creation
+ response = send_command('connect', {}, session=session)
+
+ if args.json:
+ print(json.dumps(response))
+ else:
+ print('\r' + ' ' * 20 + '\r', end='') # clear "Connecting..."
if response.get('success'):
- print(f'Server for session "{args.session}" stopped')
- return 0
+ data = response.get('data', {})
+ print(f'status: {data.get("status", "unknown")}')
+ if 'live_url' in data:
+ print(f'live_url: {data["live_url"]}')
+ if 'cdp_url' in data:
+ print(f'cdp_url: {data["cdp_url"]}')
else:
print(f'Error: {response.get("error")}', file=sys.stderr)
return 1
- elif args.server_command == 'logs':
- log_path = Path(tempfile.gettempdir()) / f'browser-use-{args.session}.log'
- if log_path.exists():
- print(log_path.read_text())
+ return 0
+
+
+def _handle_sessions(args: argparse.Namespace) -> int:
+ """List active daemon sessions."""
+ home_dir = _get_home_dir()
+ sessions: list[dict] = []
+
+ # Discover sessions from union of PID files + state files
+ session_names: set[str] = set()
+ for pid_file in home_dir.glob('*.pid'):
+ if pid_file.stem:
+ session_names.add(pid_file.stem)
+ for state_file in home_dir.glob('*.state.json'):
+ name = state_file.name.removesuffix('.state.json')
+ if name:
+ session_names.add(name)
+
+ for name in sorted(session_names):
+ probe = _probe_session(name)
+
+ if not probe.pid_alive:
+ # Don't delete if socket is still reachable — daemon alive despite stale PID
+ if not probe.socket_reachable:
+ _clean_session_files(name)
+ continue
+
+ # Terminal state + dead PID already handled above.
+ # If phase is terminal but PID is alive, the daemon restarted and
+ # the stale state file belongs to a previous instance — only clean
+ # the state file, not the PID/socket which the live daemon owns.
+ if probe.phase in ('stopped', 'failed'):
+ _get_state_path(name).unlink(missing_ok=True)
+ # Fall through to show the live session
+
+ entry: dict = {'name': name, 'pid': probe.pid or 0, 'phase': probe.phase or '?'}
+
+ # Try to ping for config info
+ if probe.socket_reachable:
+ try:
+ resp = send_command('ping', {}, session=name)
+ if resp.get('success'):
+ data = resp.get('data', {})
+ config_parts = []
+ if data.get('headed'):
+ config_parts.append('headed')
+ if data.get('profile'):
+ config_parts.append(f'profile={data["profile"]}')
+ if data.get('cdp_url'):
+ entry['cdp_url'] = data['cdp_url']
+ if not data.get('use_cloud'):
+ config_parts.append('cdp')
+ if data.get('use_cloud'):
+ config_parts.append('cloud')
+ entry['config'] = ', '.join(config_parts) if config_parts else 'headless'
+ except Exception:
+ entry['config'] = '?'
else:
- print('No logs found')
- return 0
+ entry['config'] = '?'
+
+ sessions.append(entry)
+
+ # Sweep orphaned sockets that have no corresponding live session
+ live_names = {s['name'] for s in sessions}
+ for sock_file in home_dir.glob('*.sock'):
+ if sock_file.stem not in live_names:
+ sock_file.unlink(missing_ok=True)
+
+ if args.json:
+ print(json.dumps({'sessions': sessions}))
+ else:
+ if sessions:
+ print(f'{"SESSION":<16} {"PHASE":<14} {"PID":<8} CONFIG')
+ for s in sessions:
+ print(f'{s["name"]:<16} {s.get("phase", "?"):<14} {s["pid"]:<8} {s.get("config", "")}')
+ else:
+ print('No active sessions')
return 0
-def _parse_key_value_list(items: list[str] | None) -> dict[str, str | None] | None:
- """Parse a list of 'key=value' strings into a dict."""
- if not items:
- return None
- result: dict[str, str | None] = {}
- for item in items:
- if '=' in item:
- key, value = item.split('=', 1)
- result[key] = value
- return result if result else None
+def _close_session(session: str) -> bool:
+ """Close a single session. Returns True if something was closed/killed.
+ Only cleans up files after the daemon process is confirmed dead.
+ """
+ probe = _probe_session(session)
-def _handle_remote_run_with_wait(args: argparse.Namespace) -> int:
- """Handle remote run with --wait directly (prints task info immediately, then waits)."""
- import asyncio
-
- from browser_use.skill_cli.commands import cloud_session, cloud_task
-
- if not args.task:
- print('Error: No task provided', file=sys.stderr)
- return 1
-
- try:
- # Handle vision flag (--vision vs --no-vision)
- vision: bool | None = None
- if getattr(args, 'vision', False):
- vision = True
- elif getattr(args, 'no_vision', False):
- vision = False
-
- # Parse key=value list params
- metadata = _parse_key_value_list(getattr(args, 'metadata', None))
- secrets = _parse_key_value_list(getattr(args, 'secret', None))
-
- # Build session params
- session_id = getattr(args, 'session_id', None)
- profile_id = getattr(args, 'profile', None)
- proxy_country = getattr(args, 'proxy_country', None)
-
- # Create session first if profile or proxy specified and no session_id
- if (profile_id or proxy_country) and not session_id:
- session = cloud_session.create_session(
- profile_id=profile_id,
- proxy_country=proxy_country,
- keep_alive=getattr(args, 'keep_alive', None),
- )
- session_id = session.id
-
- # Create task with all cloud-only flags
- task_response = cloud_task.create_task(
- task=args.task,
- llm=args.llm,
- session_id=session_id,
- max_steps=args.max_steps,
- flash_mode=getattr(args, 'flash', None),
- thinking=getattr(args, 'thinking', None),
- vision=vision,
- start_url=getattr(args, 'start_url', None),
- metadata=metadata,
- secrets=secrets,
- allowed_domains=getattr(args, 'allowed_domain', None),
- skill_ids=getattr(args, 'skill_id', None),
- structured_output=getattr(args, 'structured_output', None),
- judge=getattr(args, 'judge', None),
- judge_ground_truth=getattr(args, 'judge_ground_truth', None),
- )
-
- # Print initial info immediately
- print(f'mode: {args.browser}')
- print(f'task_id: {task_response.id}')
- print(f'session_id: {task_response.session_id}')
- print('waiting...', end='', flush=True)
-
- # Wait for completion
+ if probe.socket_reachable:
+ print('Closing...', end='', flush=True)
try:
- result = asyncio.run(cloud_task.poll_until_complete(task_response.id))
- except KeyboardInterrupt:
- print(f'\nInterrupted. Task {task_response.id} continues remotely.')
- return 0
+ send_command('shutdown', {}, session=session)
+ except Exception:
+ pass # Shutdown may have been accepted even if response failed
+ # Poll for PID disappearance (up to 15s: 10s browser cleanup + margin)
+ confirmed_dead = not probe.pid # No PID to check = assume success
+ if probe.pid:
+ for _ in range(150):
+ time.sleep(0.1)
+ if not _is_pid_alive(probe.pid):
+ confirmed_dead = True
+ break
+ if confirmed_dead:
+ _clean_session_files(session)
+ return True
- # Print final result
- print(' done.')
- print(f'status: {result.status}')
- print(f'output: {result.output}')
- if result.cost:
- print(f'cost: {result.cost}')
+ if probe.pid_alive and probe.pid and _is_daemon_process(probe.pid):
+ dead = _terminate_pid(probe.pid)
+ if dead:
+ _clean_session_files(session)
+ return dead
- return 0
+ # Nothing alive — clean up stale files if any exist
+ if probe.pid or probe.phase:
+ _clean_session_files(session)
+ return False
- except Exception as e:
- print(f'Error: {e}', file=sys.stderr)
- return 1
+
+def _handle_close_all(args: argparse.Namespace) -> int:
+ """Close all active sessions."""
+ home_dir = _get_home_dir()
+
+ # Discover sessions from union of PID files + state files
+ session_names: set[str] = set()
+ for pid_file in home_dir.glob('*.pid'):
+ if pid_file.stem:
+ session_names.add(pid_file.stem)
+ for state_file in home_dir.glob('*.state.json'):
+ name = state_file.name.removesuffix('.state.json')
+ if name:
+ session_names.add(name)
+
+ closed = 0
+ for name in sorted(session_names):
+ if _close_session(name):
+ closed += 1
+
+ if args.json:
+ print(json.dumps({'closed': closed}))
+ else:
+ if closed:
+ print(f'Closed {closed} session(s)')
+ else:
+ print('No active sessions')
+
+ return 0
+
+
+def _migrate_legacy_files() -> None:
+ """One-time cleanup of old daemon files and config migration."""
+ # Migrate config from old XDG location
+ from browser_use.skill_cli.utils import migrate_legacy_paths
+
+ migrate_legacy_paths()
+
+ # Clean up old single-socket daemon (pre-multi-session)
+ legacy_path = Path(tempfile.gettempdir()) / 'browser-use-cli.sock'
+ if sys.platform == 'win32':
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ try:
+ sock.settimeout(0.5)
+ sock.connect(('127.0.0.1', 49200))
+ req = json.dumps({'id': 'legacy', 'action': 'shutdown', 'params': {}}) + '\n'
+ sock.sendall(req.encode())
+ except OSError:
+ pass
+ finally:
+ sock.close()
+ elif legacy_path.exists():
+ sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ try:
+ sock.settimeout(0.5)
+ sock.connect(str(legacy_path))
+ req = json.dumps({'id': 'legacy', 'action': 'shutdown', 'params': {}}) + '\n'
+ sock.sendall(req.encode())
+ except OSError:
+ legacy_path.unlink(missing_ok=True)
+ finally:
+ sock.close()
+
+ # Clean up old ~/.browser-use/run/ directory (stale PID/socket files)
+ old_run_dir = Path.home() / '.browser-use' / 'run'
+ if old_run_dir.is_dir():
+ for stale_file in old_run_dir.glob('browser-use-*'):
+ stale_file.unlink(missing_ok=True)
+ # Remove the directory if empty
+ try:
+ old_run_dir.rmdir()
+ except OSError:
+ pass
def main() -> int:
@@ -961,95 +1175,54 @@ def main() -> int:
parser.print_help()
return 0
- # Handle server subcommands without starting server
- if args.command == 'server':
- return handle_server_command(args)
+ # Resolve session name
+ session = args.session or os.environ.get('BROWSER_USE_SESSION', 'default')
+ if not re.match(r'^[a-zA-Z0-9_-]+$', session):
+ print(f'Error: Invalid session name {session!r}: only letters, digits, hyphens, underscores', file=sys.stderr)
+ return 1
- # Handle profile subcommands without starting server
- if args.command == 'profile':
- from browser_use.skill_cli.commands.profile import handle_profile_command
-
- return handle_profile_command(args)
-
- # Handle sessions list - find all running sessions
+ # Handle sessions command (before daemon interaction)
if args.command == 'sessions':
- from browser_use.skill_cli.utils import find_all_sessions
+ return _handle_sessions(args)
- session_names = find_all_sessions()
- sessions = [{'name': name, 'status': 'running'} for name in session_names]
+ # Handle cloud subcommands
+ if args.command == 'cloud':
+ cloud_args = getattr(args, 'cloud_args', [])
- if args.json:
- print(json.dumps(sessions))
- else:
- if sessions:
- for s in sessions:
- print(f' {s["name"]}: {s["status"]}')
- else:
- print('No active sessions')
- return 0
+ # Intercept 'cloud connect' — needs daemon, not REST passthrough
+ if cloud_args and cloud_args[0] == 'connect':
+ return _handle_cloud_connect(cloud_args[1:], args, session)
- # Handle close --all by closing all running sessions
- if args.command == 'close' and getattr(args, 'all', False):
- from browser_use.skill_cli.utils import find_all_sessions
+ # All other cloud subcommands are stateless REST passthroughs
+ from browser_use.skill_cli.commands.cloud import handle_cloud_command
- session_names = find_all_sessions()
- closed = []
- for name in session_names:
- try:
- response = send_command(name, 'close', {})
- if response.get('success'):
- closed.append(name)
- # Clean up metadata file
- meta_path = get_session_metadata_path(name)
- if meta_path.exists():
- meta_path.unlink()
- except Exception:
- pass # Server may already be stopping
+ return handle_cloud_command(cloud_args)
- if args.json:
- print(json.dumps({'closed': closed, 'count': len(closed)}))
- else:
- if closed:
- print(f'Closed {len(closed)} session(s): {", ".join(closed)}')
- else:
- print('No active sessions')
- return 0
+ # Handle profile subcommand — passthrough to profile-use Go binary
+ if args.command == 'profile':
+ from browser_use.skill_cli.profile_use import run_profile_use
+
+ profile_argv = getattr(args, 'profile_args', [])
+ return run_profile_use(profile_argv)
# Handle setup command
if args.command == 'setup':
from browser_use.skill_cli.commands import setup
- loop = asyncio.get_event_loop()
- result = loop.run_until_complete(
- setup.handle(
- 'setup',
- {
- 'mode': args.mode,
- 'api_key': args.api_key,
- 'yes': args.yes,
- 'json': args.json,
- },
- )
- )
+ result = setup.handle(yes=getattr(args, 'yes', False))
if args.json:
print(json.dumps(result))
elif 'error' in result:
print(f'Error: {result["error"]}', file=sys.stderr)
return 1
- else:
- if result.get('status') == 'success':
- print('\n✓ Setup complete!')
- print(f'\nMode: {result["mode"]}')
- print('Next: browser-use open https://example.com')
return 0
# Handle doctor command
if args.command == 'doctor':
from browser_use.skill_cli.commands import doctor
- loop = asyncio.get_event_loop()
- result = loop.run_until_complete(doctor.handle())
+ result = asyncio.run(doctor.handle())
if args.json:
print(json.dumps(result))
@@ -1083,19 +1256,73 @@ def main() -> int:
print('✓ All checks passed!')
else:
print(f'⚠ {result.get("summary", "Some checks need attention")}')
+
+ # Show config state
+ from browser_use.skill_cli.config import CLI_DOCS_URL, get_config_display
+
+ entries = get_config_display()
+ print(f'\nConfig ({_get_home_dir() / "config.json"}):\n')
+ for entry in entries:
+ if entry['is_set']:
+ icon = '✓'
+ val = 'set' if entry['sensitive'] else entry['value']
+ else:
+ icon = '○'
+ val = entry['value'] if entry['value'] else 'not set'
+ print(f' {icon} {entry["key"]}: {val}')
+ print(f' Docs: {CLI_DOCS_URL}')
+
return 0
- # Handle task command - cloud task management
- if args.command == 'task':
- from browser_use.skill_cli.commands.cloud_task import handle_task_command
+ # Handle config command
+ if args.command == 'config':
+ from browser_use.skill_cli.config import (
+ CLI_DOCS_URL,
+ get_config_display,
+ get_config_value,
+ set_config_value,
+ unset_config_value,
+ )
- return handle_task_command(args)
+ config_cmd = getattr(args, 'config_command', None)
- # Handle session command - cloud session management
- if args.command == 'session':
- from browser_use.skill_cli.commands.cloud_session import handle_session_command
+ if config_cmd == 'set':
+ try:
+ set_config_value(args.key, args.value)
+ print(f'{args.key} = {args.value}')
+ except ValueError as e:
+ print(f'Error: {e}', file=sys.stderr)
+ return 1
- return handle_session_command(args)
+ elif config_cmd == 'get':
+ val = get_config_value(args.key)
+ if val is not None:
+ print(val)
+ else:
+ print(f'{args.key}: not set', file=sys.stderr)
+
+ elif config_cmd == 'unset':
+ try:
+ unset_config_value(args.key)
+ print(f'{args.key} removed')
+ except ValueError as e:
+ print(f'Error: {e}', file=sys.stderr)
+ return 1
+
+ elif config_cmd == 'list' or config_cmd is None:
+ entries = get_config_display()
+ print(f'Config ({_get_home_dir() / "config.json"}):')
+ for entry in entries:
+ if entry['is_set']:
+ icon = '✓'
+ val = 'set' if entry['sensitive'] else entry['value']
+ else:
+ icon = '○'
+ val = entry['value'] if entry['value'] else 'not set'
+ print(f' {icon} {entry["key"]}: {val}')
+ print(f' Docs: {CLI_DOCS_URL}')
+
+ return 0
# Handle tunnel command - runs independently of browser session
if args.command == 'tunnel':
@@ -1109,9 +1336,9 @@ def main() -> int:
port_arg = getattr(args, 'port_arg', None)
if getattr(args, 'all', False):
# stop --all
- result = asyncio.get_event_loop().run_until_complete(tunnel.stop_all_tunnels())
+ result = asyncio.run(tunnel.stop_all_tunnels())
elif port_arg is not None:
- result = asyncio.get_event_loop().run_until_complete(tunnel.stop_tunnel(port_arg))
+ result = asyncio.run(tunnel.stop_tunnel(port_arg))
else:
print('Usage: browser-use tunnel stop | --all', file=sys.stderr)
return 1
@@ -1121,7 +1348,7 @@ def main() -> int:
except ValueError:
print(f'Unknown tunnel subcommand: {pos}', file=sys.stderr)
return 1
- result = asyncio.get_event_loop().run_until_complete(tunnel.start_tunnel(port))
+ result = asyncio.run(tunnel.start_tunnel(port))
else:
print('Usage: browser-use tunnel | list | stop ', file=sys.stderr)
return 0
@@ -1152,80 +1379,95 @@ def main() -> int:
print(f'Stopped tunnel on port {result["stopped"]}')
return 0
- # Validate requested mode is available based on installation config
- from browser_use.skill_cli.install_config import get_mode_unavailable_error, is_mode_available
+ # Handle close — shutdown daemon
+ if args.command == 'close':
+ if getattr(args, 'all', False):
+ return _handle_close_all(args)
- if not is_mode_available(args.browser):
- print(get_mode_unavailable_error(args.browser), file=sys.stderr)
+ closed = _close_session(session)
+ if args.json:
+ print(json.dumps({'success': True, 'data': {'shutdown': True}}))
+ else:
+ print('\r' + ' ' * 20 + '\r', end='') # clear "Closing..."
+ if closed:
+ print('Browser closed')
+ elif closed is False and _probe_session(session).pid_alive:
+ print('Warning: daemon may still be shutting down', file=sys.stderr)
+ else:
+ print('No active browser session')
+ return 0
+
+ # Handle --connect deprecation
+ if args.connect:
+ print('Note: --connect has been replaced.', file=sys.stderr)
+ print(' To connect to Chrome: browser-use connect', file=sys.stderr)
+ print(' For cloud browser: browser-use cloud connect', file=sys.stderr)
+ print(' For multiple agents: use --session NAME per agent', file=sys.stderr)
return 1
- # Set API key in environment if provided
- if args.api_key:
- os.environ['BROWSER_USE_API_KEY'] = args.api_key
-
- # Validate API key for remote browser mode upfront
- if args.browser == 'remote':
- from browser_use.skill_cli.api_key import APIKeyRequired, require_api_key
+ # Handle connect command (discover local Chrome, start daemon)
+ if args.command == 'connect':
+ from browser_use.skill_cli.utils import discover_chrome_cdp_url
try:
- api_key = require_api_key('Remote browser')
- # Ensure it's in environment for the cloud client
- os.environ['BROWSER_USE_API_KEY'] = api_key
- except APIKeyRequired as e:
+ cdp_url = discover_chrome_cdp_url()
+ except RuntimeError as e:
print(f'Error: {e}', file=sys.stderr)
return 1
- # Validate --profile flag usage
- if args.profile and args.browser == 'chromium':
- print(
- 'Error: --profile is not supported in chromium mode.\n'
- 'Use -b real for local Chrome profiles or -b remote for cloud profiles.',
- file=sys.stderr,
- )
+ ensure_daemon(args.headed, None, cdp_url=cdp_url, session=session, explicit_config=True)
+ response = send_command('connect', {}, session=session)
+
+ if args.json:
+ print(json.dumps(response))
+ else:
+ if response.get('success'):
+ data = response.get('data', {})
+ print(f'status: {data.get("status", "unknown")}')
+ if 'cdp_url' in data:
+ print(f'cdp_url: {data["cdp_url"]}')
+ else:
+ print(f'Error: {response.get("error")}', file=sys.stderr)
+ return 1
+ return 0
+
+ # Mutual exclusivity
+ if args.cdp_url and args.profile:
+ print('Error: --cdp-url and --profile are mutually exclusive', file=sys.stderr)
return 1
- # Handle remote run with --wait directly (prints task_id immediately, then waits)
- if args.browser == 'remote' and args.command == 'run' and getattr(args, 'wait', False):
- return _handle_remote_run_with_wait(args)
+ # One-time legacy migration
+ _migrate_legacy_files()
- # Ensure server is running
- ensure_server(args.session, args.browser, args.headed, args.profile, args.api_key)
+ # Ensure daemon is running
+ explicit_config = any(flag in sys.argv for flag in ('--headed', '--profile', '--cdp-url'))
+ ensure_daemon(args.headed, args.profile, args.cdp_url, session=session, explicit_config=explicit_config)
# Build params from args
params = {}
- skip_keys = {'command', 'session', 'browser', 'headed', 'json', 'api_key', 'server_command'}
+ skip_keys = {'command', 'headed', 'json', 'cdp_url', 'session', 'connect'}
for key, value in vars(args).items():
if key not in skip_keys and value is not None:
params[key] = value
- # Add profile to params for commands that need it (agent tasks, etc.)
- # Note: profile is passed to ensure_server for local browser profile,
- # but also needs to be in params for cloud profile ID in remote mode
+ # Resolve file paths to absolute before sending to daemon (daemon may have different CWD)
+ if args.command == 'upload' and 'path' in params:
+ params['path'] = str(Path(params['path']).expanduser().resolve())
+
+ # Add profile to params for commands that need it
if args.profile:
params['profile'] = args.profile
- # Send command to server
- response = send_command(args.session, args.command, params)
-
- # Clean up metadata file on successful close
- if args.command == 'close' and response.get('success'):
- meta_path = get_session_metadata_path(args.session)
- if meta_path.exists():
- meta_path.unlink()
+ # Send command to daemon
+ response = send_command(args.command, params, session=session)
# Output response
if args.json:
- # Add mode to JSON output for browser-related commands
- if args.command in ('open', 'run', 'state', 'click', 'type', 'input', 'scroll', 'screenshot'):
- response['mode'] = args.browser
print(json.dumps(response))
else:
if response.get('success'):
data = response.get('data')
- # Show mode for browser-related commands (first line of output)
- if args.command in ('open', 'run'):
- print(f'mode: {args.browser}')
if data is not None:
if isinstance(data, dict):
# Special case: raw text output (e.g., state command)
diff --git a/browser_use/skill_cli/profile_use.py b/browser_use/skill_cli/profile_use.py
new file mode 100644
index 000000000..a5fd94d60
--- /dev/null
+++ b/browser_use/skill_cli/profile_use.py
@@ -0,0 +1,104 @@
+"""Profile-use Go binary management.
+
+Downloads, locates, and invokes the profile-use Go binary as a managed
+subcommand of `browser-use profile`. The binary is always managed at
+~/.browser-use/bin/profile-use — standalone installs on $PATH are independent.
+"""
+
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+
+def get_profile_use_binary() -> Path | None:
+ """Return path to managed profile-use binary, or None if not installed."""
+ from browser_use.skill_cli.utils import get_bin_dir
+
+ binary = get_bin_dir() / ('profile-use.exe' if sys.platform == 'win32' else 'profile-use')
+ if binary.is_file() and os.access(str(binary), os.X_OK):
+ return binary
+ return None
+
+
+def download_profile_use() -> Path:
+ """Download profile-use binary via the official install script.
+
+ Runs: curl -fsSL https://browser-use.com/profile/cli/install.sh | sh
+ with INSTALL_DIR set to ~/.browser-use/bin/
+
+ Raises RuntimeError if download fails.
+ """
+ from browser_use.skill_cli.utils import get_bin_dir
+
+ if not shutil.which('curl'):
+ raise RuntimeError(
+ 'curl is required to download profile-use.\n'
+ 'Install curl and try again, or install profile-use manually:\n'
+ ' curl -fsSL https://browser-use.com/profile/cli/install.sh | sh'
+ )
+
+ bin_dir = get_bin_dir()
+ env = {**os.environ, 'INSTALL_DIR': str(bin_dir)}
+
+ result = subprocess.run(
+ ['sh', '-c', 'curl -fsSL https://browser-use.com/profile/cli/install.sh | sh'],
+ env=env,
+ )
+
+ if result.returncode != 0:
+ raise RuntimeError(
+ 'Failed to download profile-use. Try installing manually:\n curl -fsSL https://browser-use.com/profile/cli/install.sh | sh'
+ )
+
+ binary = get_profile_use_binary()
+ if binary is None:
+ raise RuntimeError('Download appeared to succeed but binary not found at expected location.')
+
+ return binary
+
+
+def ensure_profile_use() -> Path:
+ """Return path to profile-use binary, downloading if not present."""
+ binary = get_profile_use_binary()
+ if binary is not None:
+ return binary
+
+ print('profile-use not found, downloading...', file=sys.stderr)
+ return download_profile_use()
+
+
+def run_profile_use(args: list[str]) -> int:
+ """Execute profile-use with the given arguments.
+
+ Handles the 'update' subcommand specially by re-running the install script.
+ Passes BROWSER_USE_CONFIG_DIR so profile-use shares config with browser-use.
+ """
+ # Handle 'update' subcommand — re-download latest binary
+ if args and args[0] == 'update':
+ try:
+ download_profile_use()
+ print('profile-use updated successfully')
+ return 0
+ except RuntimeError as e:
+ print(f'Error: {e}', file=sys.stderr)
+ return 1
+
+ try:
+ binary = ensure_profile_use()
+ except RuntimeError as e:
+ print(f'Error: {e}', file=sys.stderr)
+ return 1
+
+ from browser_use.skill_cli.utils import get_home_dir
+
+ env = {**os.environ, 'BROWSER_USE_CONFIG_DIR': str(get_home_dir())}
+ # Forward API key from config.json for profile-use binary
+ from browser_use.skill_cli.config import get_config_value
+
+ api_key = get_config_value('api_key')
+ if api_key:
+ env['BROWSER_USE_API_KEY'] = str(api_key)
+
+ return subprocess.call([str(binary)] + args, env=env)
diff --git a/browser_use/skill_cli/protocol.py b/browser_use/skill_cli/protocol.py
deleted file mode 100644
index 9be7964d0..000000000
--- a/browser_use/skill_cli/protocol.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""Wire protocol for CLI↔Server communication.
-
-Uses JSON over Unix sockets (or TCP on Windows) with newline-delimited messages.
-"""
-
-import json
-from dataclasses import asdict, dataclass, field
-from typing import Any
-
-
-@dataclass
-class Request:
- """Command request from CLI to server."""
-
- id: str
- action: str
- session: str
- params: dict[str, Any] = field(default_factory=dict)
-
- def to_json(self) -> str:
- return json.dumps(asdict(self))
-
- @classmethod
- def from_json(cls, data: str) -> 'Request':
- d = json.loads(data)
- return cls(
- id=d['id'],
- action=d['action'],
- session=d['session'],
- params=d.get('params', {}),
- )
-
-
-@dataclass
-class Response:
- """Response from server to CLI."""
-
- id: str
- success: bool
- data: Any = None
- error: str | None = None
-
- def to_json(self) -> str:
- return json.dumps(asdict(self))
-
- @classmethod
- def from_json(cls, data: str) -> 'Response':
- d = json.loads(data)
- return cls(
- id=d['id'],
- success=d['success'],
- data=d.get('data'),
- error=d.get('error'),
- )
diff --git a/browser_use/skill_cli/python_session.py b/browser_use/skill_cli/python_session.py
index 883a11f82..e96cc0b96 100644
--- a/browser_use/skill_cli/python_session.py
+++ b/browser_use/skill_cli/python_session.py
@@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, Literal
if TYPE_CHECKING:
from browser_use.browser.session import BrowserSession
+ from browser_use.skill_cli.actions import ActionHandler
@dataclass
@@ -48,7 +49,11 @@ class PythonSession:
)
def execute(
- self, code: str, browser_session: 'BrowserSession', loop: asyncio.AbstractEventLoop | None = None
+ self,
+ code: str,
+ browser_session: 'BrowserSession',
+ loop: asyncio.AbstractEventLoop | None = None,
+ actions: 'ActionHandler | None' = None,
) -> ExecutionResult:
"""Execute code in persistent namespace.
@@ -59,10 +64,11 @@ class PythonSession:
code: Python code to execute
browser_session: The browser session for browser operations
loop: The event loop for async operations (required for browser access)
+ actions: Optional ActionHandler for direct execution (no event bus)
"""
# Inject browser wrapper with the event loop for async operations
- if loop is not None:
- self.namespace['browser'] = BrowserWrapper(browser_session, loop)
+ if loop is not None and actions is not None:
+ self.namespace['browser'] = BrowserWrapper(browser_session, loop, actions)
self.execution_count += 1
stdout = io.StringIO()
@@ -115,9 +121,10 @@ class BrowserWrapper:
Runs coroutines on the server's event loop using run_coroutine_threadsafe.
"""
- def __init__(self, session: 'BrowserSession', loop: asyncio.AbstractEventLoop) -> None:
+ def __init__(self, session: 'BrowserSession', loop: asyncio.AbstractEventLoop, actions: 'ActionHandler') -> None:
self._session = session
self._loop = loop
+ self._actions = actions
def _run(self, coro: Any) -> Any:
"""Run coroutine on the server's event loop."""
@@ -147,21 +154,17 @@ class BrowserWrapper:
self._run(self._goto_async(url))
async def _goto_async(self, url: str) -> None:
- from browser_use.browser.events import NavigateToUrlEvent
-
- await self._session.event_bus.dispatch(NavigateToUrlEvent(url=url))
+ await self._actions.navigate(url)
def click(self, index: int) -> None:
"""Click element by index."""
self._run(self._click_async(index))
async def _click_async(self, index: int) -> None:
- from browser_use.browser.events import ClickElementEvent
-
node = await self._session.get_element_by_index(index)
if node is None:
raise ValueError(f'Element index {index} not found')
- await self._session.event_bus.dispatch(ClickElementEvent(node=node))
+ await self._actions.click_element(node)
def type(self, text: str) -> None:
"""Type text into focused element."""
@@ -181,22 +184,44 @@ class BrowserWrapper:
self._run(self._input_async(index, text))
async def _input_async(self, index: int, text: str) -> None:
- from browser_use.browser.events import ClickElementEvent, TypeTextEvent
+ node = await self._session.get_element_by_index(index)
+ if node is None:
+ raise ValueError(f'Element index {index} not found')
+ await self._actions.click_element(node)
+ await self._actions.type_text(node, text)
+
+ def upload(self, index: int, path: str) -> None:
+ """Upload a file to a file input element."""
+ self._run(self._upload_async(index, path))
+
+ async def _upload_async(self, index: int, path: str) -> None:
+ from pathlib import Path as P
+
+ file_path = str(P(path).expanduser().resolve())
+ p = P(file_path)
+ if not p.exists():
+ raise FileNotFoundError(f'File not found: {file_path}')
+ if not p.is_file():
+ raise ValueError(f'Not a file: {file_path}')
+ if p.stat().st_size == 0:
+ raise ValueError(f'File is empty (0 bytes): {file_path}')
node = await self._session.get_element_by_index(index)
if node is None:
raise ValueError(f'Element index {index} not found')
- await self._session.event_bus.dispatch(ClickElementEvent(node=node))
- await self._session.event_bus.dispatch(TypeTextEvent(node=node, text=text))
+
+ file_input_node = self._session.find_file_input_near_element(node)
+ if file_input_node is None:
+ raise ValueError(f'Element {index} is not a file input and no file input found nearby')
+
+ await self._actions.upload_file(file_input_node, file_path)
def scroll(self, direction: Literal['up', 'down', 'left', 'right'] = 'down', amount: int = 500) -> None:
"""Scroll the page."""
self._run(self._scroll_async(direction, amount))
async def _scroll_async(self, direction: Literal['up', 'down', 'left', 'right'], amount: int) -> None:
- from browser_use.browser.events import ScrollEvent
-
- await self._session.event_bus.dispatch(ScrollEvent(direction=direction, amount=amount))
+ await self._actions.scroll(direction, amount)
def screenshot(self, path: str | None = None) -> bytes:
"""Take screenshot, optionally save to file."""
@@ -233,18 +258,14 @@ class BrowserWrapper:
self._run(self._keys_async(keys))
async def _keys_async(self, keys: str) -> None:
- from browser_use.browser.events import SendKeysEvent
-
- await self._session.event_bus.dispatch(SendKeysEvent(keys=keys))
+ await self._actions.send_keys(keys)
def back(self) -> None:
"""Go back in history."""
self._run(self._back_async())
async def _back_async(self) -> None:
- from browser_use.browser.events import GoBackEvent
-
- await self._session.event_bus.dispatch(GoBackEvent())
+ await self._actions.go_back()
def wait(self, seconds: float) -> None:
"""Wait for specified seconds."""
diff --git a/browser_use/skill_cli/requirements-cli.txt b/browser_use/skill_cli/requirements-cli.txt
new file mode 100644
index 000000000..fe682e72b
--- /dev/null
+++ b/browser_use/skill_cli/requirements-cli.txt
@@ -0,0 +1,12 @@
+# Minimal dependencies for the browser-use CLI.
+# Used by install_lite.sh — update this file if the CLI's import chain changes.
+aiohttp==3.13.4
+bubus==1.5.6
+cdp-use==1.4.5
+httpx==0.28.1
+psutil==7.2.2
+pydantic==2.12.5
+pydantic-settings==2.12.0
+python-dotenv==1.2.1
+typing-extensions==4.15.0
+uuid7==0.1.0
diff --git a/browser_use/skill_cli/server.py b/browser_use/skill_cli/server.py
deleted file mode 100644
index 67b8dace5..000000000
--- a/browser_use/skill_cli/server.py
+++ /dev/null
@@ -1,292 +0,0 @@
-"""Session server - keeps BrowserSession instances alive.
-
-This server runs as a background process, managing browser sessions and
-handling commands from the CLI. It uses Unix sockets (or TCP on Windows)
-for IPC communication.
-"""
-
-import argparse
-import asyncio
-import json
-import logging
-import os
-import signal
-import sys
-from pathlib import Path
-from typing import IO
-
-import portalocker
-
-# Configure logging before imports
-logging.basicConfig(
- level=logging.INFO,
- format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
- handlers=[logging.StreamHandler()],
-)
-logger = logging.getLogger('browser_use.skill_cli.server')
-
-
-class SessionServer:
- """Server that manages browser sessions and handles CLI commands."""
-
- def __init__(
- self,
- session_name: str,
- browser_mode: str,
- headed: bool,
- profile: str | None,
- ) -> None:
- self.session_name = session_name
- self.browser_mode = browser_mode
- self.headed = headed
- self.profile = profile
- self.running = True
- self._server: asyncio.Server | None = None
- self._shutdown_event: asyncio.Event | None = None
- self._lock_file: IO | None = None
-
- # Lazy import to avoid loading everything at startup
- from browser_use.skill_cli.sessions import SessionRegistry
-
- self.registry = SessionRegistry()
-
- async def handle_connection(
- self,
- reader: asyncio.StreamReader,
- writer: asyncio.StreamWriter,
- ) -> None:
- """Handle a client connection."""
- addr = writer.get_extra_info('peername')
- logger.debug(f'Connection from {addr}')
-
- try:
- while self.running:
- try:
- line = await asyncio.wait_for(reader.readline(), timeout=300) # 5 min timeout
- except TimeoutError:
- logger.debug(f'Connection timeout from {addr}')
- break
-
- if not line:
- break
-
- request = {}
- try:
- request = json.loads(line.decode())
- response = await self.dispatch(request)
- except json.JSONDecodeError as e:
- response = {'id': '', 'success': False, 'error': f'Invalid JSON: {e}'}
- except Exception as e:
- logger.exception(f'Error handling request: {e}')
- response = {'id': '', 'success': False, 'error': str(e)}
-
- writer.write((json.dumps(response) + '\n').encode())
- await writer.drain()
-
- # Check for shutdown command
- if request.get('action') == 'shutdown':
- await self.shutdown()
- break
-
- except Exception as e:
- logger.exception(f'Connection error: {e}')
- finally:
- writer.close()
- try:
- await writer.wait_closed()
- except Exception:
- pass
-
- async def dispatch(self, request: dict) -> dict:
- """Dispatch command to appropriate handler."""
- action = request.get('action', '')
- params = request.get('params', {})
- req_id = request.get('id', '')
-
- logger.info(f'Dispatch: {action} (id={req_id})')
-
- try:
- # Import command handlers
- from browser_use.skill_cli.commands import agent, browser, python_exec, session
-
- # Handle shutdown
- if action == 'shutdown':
- return {'id': req_id, 'success': True, 'data': {'shutdown': True}}
-
- # Session commands don't need a browser session
- if action in session.COMMANDS:
- result = await session.handle(action, self.session_name, self.registry, params)
- # Check if command wants to shutdown server
- if result.get('_shutdown'):
- asyncio.create_task(self.shutdown())
- return {'id': req_id, 'success': True, 'data': result}
-
- # Get or create session for browser commands
- session_info = await self.registry.get_or_create(
- self.session_name,
- self.browser_mode,
- self.headed,
- self.profile,
- )
-
- # Dispatch to handler
- if action in browser.COMMANDS:
- result = await browser.handle(action, session_info, params)
- elif action == 'python':
- result = await python_exec.handle(session_info, params)
- elif action == 'run':
- result = await agent.handle(session_info, params)
- else:
- return {'id': req_id, 'success': False, 'error': f'Unknown action: {action}'}
-
- return {'id': req_id, 'success': True, 'data': result}
-
- except Exception as e:
- logger.exception(f'Error dispatching {action}: {e}')
- return {'id': req_id, 'success': False, 'error': str(e)}
-
- async def shutdown(self) -> None:
- """Graceful shutdown."""
- logger.info('Shutting down server...')
- self.running = False
-
- # Signal the shutdown event
- if self._shutdown_event:
- self._shutdown_event.set()
-
- # Close all sessions
- await self.registry.close_all()
-
- # Stop the server
- if self._server:
- self._server.close()
- await self._server.wait_closed()
-
- # Clean up files
- from browser_use.skill_cli.utils import cleanup_session_files
-
- cleanup_session_files(self.session_name)
-
- async def run(self) -> None:
- """Run the server."""
- from browser_use.skill_cli.utils import get_lock_path, get_pid_path, get_socket_path
-
- # Acquire exclusive lock BEFORE writing PID - this prevents race conditions
- lock_path = get_lock_path(self.session_name)
- lock_path.parent.mkdir(parents=True, exist_ok=True)
- lock_path.touch(exist_ok=True)
-
- self._lock_file = open(lock_path, 'r+') # noqa: ASYNC230 - blocking ok at startup
- try:
- portalocker.lock(self._lock_file, portalocker.LOCK_EX | portalocker.LOCK_NB)
- except portalocker.LockException:
- logger.error(f'Another server is already running for session: {self.session_name}')
- self._lock_file.close()
- self._lock_file = None
- sys.exit(1)
-
- logger.info(f'Acquired exclusive lock for session: {self.session_name}')
-
- # NOW safe to write PID file
- pid_path = get_pid_path(self.session_name)
- pid_path.write_text(str(os.getpid()))
- logger.info(f'PID file: {pid_path}')
-
- # Setup signal handlers
- loop = asyncio.get_running_loop()
-
- def signal_handler():
- asyncio.create_task(self.shutdown())
-
- for sig in (signal.SIGINT, signal.SIGTERM):
- try:
- loop.add_signal_handler(sig, signal_handler)
- except NotImplementedError:
- # Windows doesn't support add_signal_handler
- pass
-
- # Also handle SIGHUP on Unix
- if hasattr(signal, 'SIGHUP'):
- try:
- loop.add_signal_handler(signal.SIGHUP, signal_handler)
- except NotImplementedError:
- pass
-
- # Get socket path
- sock_path = get_socket_path(self.session_name)
- logger.info(f'Socket: {sock_path}')
-
- # Start server
- if sock_path.startswith('tcp://'):
- # Windows: TCP server
- _, hostport = sock_path.split('://', 1)
- host, port = hostport.split(':')
- self._server = await asyncio.start_server(
- self.handle_connection,
- host,
- int(port),
- reuse_address=True, # Allow rebinding ports in TIME_WAIT state
- )
- logger.info(f'Listening on TCP {host}:{port}')
- else:
- # Unix: socket server
- # Remove stale socket file
- sock_file = Path(sock_path)
- if sock_file.exists():
- sock_file.unlink()
-
- self._server = await asyncio.start_unix_server(
- self.handle_connection,
- sock_path,
- )
- logger.info(f'Listening on Unix socket {sock_path}')
-
- # Run until shutdown
- self._shutdown_event = asyncio.Event()
- try:
- async with self._server:
- await self._shutdown_event.wait()
- except asyncio.CancelledError:
- pass
- finally:
- # Release lock on shutdown
- if self._lock_file:
- try:
- portalocker.unlock(self._lock_file)
- self._lock_file.close()
- except Exception:
- pass
- self._lock_file = None
- logger.info('Server stopped')
-
-
-def main() -> None:
- """Main entry point for server process."""
- parser = argparse.ArgumentParser(description='Browser-use session server')
- parser.add_argument('--session', required=True, help='Session name')
- parser.add_argument('--browser', default='chromium', choices=['chromium', 'real', 'remote'])
- parser.add_argument('--headed', action='store_true', help='Show browser window')
- parser.add_argument('--profile', help='Chrome profile (real browser mode)')
- args = parser.parse_args()
-
- logger.info(f'Starting server for session: {args.session}')
- logger.info(f'Browser mode: {args.browser}, headed: {args.headed}')
-
- server = SessionServer(
- session_name=args.session,
- browser_mode=args.browser,
- headed=args.headed,
- profile=args.profile,
- )
-
- try:
- asyncio.run(server.run())
- except KeyboardInterrupt:
- logger.info('Interrupted')
- except Exception as e:
- logger.exception(f'Server error: {e}')
- sys.exit(1)
-
-
-if __name__ == '__main__':
- main()
diff --git a/browser_use/skill_cli/sessions.py b/browser_use/skill_cli/sessions.py
index f8baf2ff1..c41020980 100644
--- a/browser_use/skill_cli/sessions.py
+++ b/browser_use/skill_cli/sessions.py
@@ -1,12 +1,18 @@
-"""Session registry - manages BrowserSession instances."""
+"""Session data — SessionInfo dataclass and browser session factory."""
+
+from __future__ import annotations
import logging
from dataclasses import dataclass, field
-from typing import Any
+from typing import TYPE_CHECKING
-from browser_use.browser.session import BrowserSession
+from browser_use.skill_cli.browser import CLIBrowserSession
from browser_use.skill_cli.python_session import PythonSession
+if TYPE_CHECKING:
+ from browser_use.browser.session import BrowserSession
+ from browser_use.skill_cli.actions import ActionHandler
+
logger = logging.getLogger(__name__)
@@ -15,142 +21,88 @@ class SessionInfo:
"""Information about a browser session."""
name: str
- browser_mode: str
headed: bool
profile: str | None
+ cdp_url: str | None
browser_session: BrowserSession
+ actions: ActionHandler | None = None
python_session: PythonSession = field(default_factory=PythonSession)
-
-
-class SessionRegistry:
- """Registry of active browser sessions.
-
- Sessions are created on-demand when first accessed. Each named session
- is isolated with its own BrowserSession and Python namespace.
- """
-
- def __init__(self) -> None:
- self._sessions: dict[str, SessionInfo] = {}
-
- async def get_or_create(
- self,
- name: str,
- browser_mode: str,
- headed: bool,
- profile: str | None,
- ) -> SessionInfo:
- """Get existing session or create new one."""
- if name in self._sessions:
- return self._sessions[name]
-
- logger.info(f'Creating new session: {name} (mode={browser_mode}, headed={headed})')
-
- browser_session = await create_browser_session(browser_mode, headed, profile)
- await browser_session.start()
-
- session_info = SessionInfo(
- name=name,
- browser_mode=browser_mode,
- headed=headed,
- profile=profile,
- browser_session=browser_session,
- )
- self._sessions[name] = session_info
- return session_info
-
- def get(self, name: str) -> SessionInfo | None:
- """Get session by name."""
- return self._sessions.get(name)
-
- def list_sessions(self) -> list[dict[str, Any]]:
- """List all active sessions."""
- return [
- {
- 'name': s.name,
- 'browser_mode': s.browser_mode,
- 'headed': s.headed,
- 'profile': s.profile,
- }
- for s in self._sessions.values()
- ]
-
- async def close_session(self, name: str) -> bool:
- """Close and remove a session."""
- if name not in self._sessions:
- return False
-
- session = self._sessions.pop(name)
- logger.info(f'Closing session: {name}')
-
- # Note: Tunnels are managed independently via tunnel.py
- # They persist across session close/open cycles
-
- try:
- await session.browser_session.kill()
- except Exception as e:
- logger.warning(f'Error closing session {name}: {e}')
- return True
-
- async def close_all(self) -> None:
- """Close all sessions."""
- for name in list(self._sessions.keys()):
- await self.close_session(name)
+ use_cloud: bool = False
async def create_browser_session(
- mode: str,
headed: bool,
profile: str | None,
-) -> BrowserSession:
- """Create BrowserSession based on mode.
+ cdp_url: str | None = None,
+ use_cloud: bool = False,
+ cloud_profile_id: str | None = None,
+ cloud_proxy_country_code: str | None = None,
+ cloud_timeout: int | None = None,
+) -> CLIBrowserSession:
+ """Create BrowserSession based on connection mode.
- Modes:
- - chromium: Playwright-managed Chromium (default)
- - real: User's Chrome with profile
- - remote: Browser-Use Cloud (requires API key)
-
- Raises:
- RuntimeError: If the requested mode is not available based on installation config
+ - CDP URL: Connect to existing browser (cdp_url takes precedence)
+ - Cloud: Provision a cloud browser via BrowserSession(use_cloud=True)
+ - With profile: User's real Chrome with the specified profile
+ - No profile: Playwright-managed Chromium (default)
"""
- from browser_use.skill_cli.install_config import get_mode_unavailable_error, is_mode_available
+ if cdp_url is not None:
+ return CLIBrowserSession(cdp_url=cdp_url) # type: ignore[call-arg]
- # Validate mode is available based on installation config
- if not is_mode_available(mode):
- raise RuntimeError(get_mode_unavailable_error(mode))
+ if use_cloud:
+ kwargs: dict = {'use_cloud': True}
+ if cloud_profile_id is not None:
+ kwargs['cloud_profile_id'] = cloud_profile_id
+ if cloud_proxy_country_code is not None:
+ kwargs['cloud_proxy_country_code'] = cloud_proxy_country_code
+ if cloud_timeout is not None:
+ kwargs['cloud_timeout'] = cloud_timeout
+ return CLIBrowserSession(**kwargs) # type: ignore[call-arg]
- if mode == 'chromium':
- return BrowserSession(
- headless=not headed,
- )
+ if profile is None:
+ return CLIBrowserSession(headless=not headed) # type: ignore[call-arg]
- elif mode == 'real':
- from browser_use.skill_cli.utils import find_chrome_executable, get_chrome_profile_path
+ from browser_use.skill_cli.utils import find_chrome_executable, get_chrome_profile_path, list_chrome_profiles
- chrome_path = find_chrome_executable()
- if not chrome_path:
- raise RuntimeError('Could not find Chrome executable. Please install Chrome or specify --browser chromium')
+ chrome_path = find_chrome_executable()
+ if not chrome_path:
+ raise RuntimeError('Could not find Chrome executable. Please install Chrome or omit --profile to use Chromium.')
- # Always get the Chrome user data directory (not the profile subdirectory)
- user_data_dir = get_chrome_profile_path(None)
- # Profile directory defaults to 'Default', or use the specified profile name
- profile_directory = profile if profile else 'Default'
+ # Always get the Chrome user data directory (not the profile subdirectory)
+ user_data_dir = get_chrome_profile_path(None)
- return BrowserSession(
- executable_path=chrome_path,
- user_data_dir=user_data_dir,
- profile_directory=profile_directory,
- headless=not headed, # Headless by default, --headed for visible
- )
-
- elif mode == 'remote':
- from browser_use.skill_cli.api_key import require_api_key
-
- require_api_key('Remote browser')
- # Profile is used as cloud_profile_id for remote mode
- return BrowserSession(
- use_cloud=True,
- cloud_profile_id=profile,
- )
+ # Resolve profile: accept directory names ("Default", "Profile 1") and
+ # display names ("Person 1", "Work"). Directory names take precedence.
+ # If profile metadata can't be read, fall back to using the value as-is.
+ known_profiles = list_chrome_profiles()
+ directory_names = {p['directory'] for p in known_profiles}
+ if not known_profiles or profile in directory_names:
+ profile_directory = profile
else:
- raise ValueError(f'Unknown browser mode: {mode}')
+ # Try case-insensitive display name match
+ profile_directory = None
+ profile_lower = profile.lower()
+ for p in known_profiles:
+ if p['name'].lower() == profile_lower:
+ profile_directory = p['directory']
+ break
+ # Also try case-insensitive directory name match
+ if profile_directory is None:
+ for d in directory_names:
+ if d.lower() == profile_lower:
+ profile_directory = d
+ break
+
+ if profile_directory is None:
+ lines = [f'Unknown profile {profile!r}. Available profiles:']
+ for p in known_profiles:
+ lines.append(f' "{p["name"]}" ({p["directory"]})')
+ raise RuntimeError('\n'.join(lines))
+
+ return CLIBrowserSession(
+ executable_path=chrome_path, # type: ignore[call-arg]
+ user_data_dir=user_data_dir, # type: ignore[call-arg]
+ profile_directory=profile_directory, # type: ignore[call-arg]
+ headless=not headed, # type: ignore[call-arg]
+ )
diff --git a/browser_use/skill_cli/tunnel.py b/browser_use/skill_cli/tunnel.py
index 53fd55f55..0f77fe34f 100644
--- a/browser_use/skill_cli/tunnel.py
+++ b/browser_use/skill_cli/tunnel.py
@@ -7,7 +7,7 @@ Tunnels are managed independently of browser sessions - they are purely
a network utility for exposing local ports via Cloudflare quick tunnels.
Tunnels survive CLI process exit by:
-1. Spawning cloudflared as a daemon (start_new_session=True)
+1. Spawning cloudflared as a daemon (start_new_session on Unix, CREATE_NEW_PROCESS_GROUP on Windows)
2. Tracking tunnel info via PID files in ~/.browser-use/tunnels/
"""
@@ -18,6 +18,8 @@ import os
import re
import shutil
import signal
+import subprocess
+import sys
from pathlib import Path
from typing import Any
@@ -26,8 +28,12 @@ logger = logging.getLogger(__name__)
# Pattern to extract tunnel URL from cloudflared output
_URL_PATTERN = re.compile(r'(https://\S+\.trycloudflare\.com)')
-# Directory for tunnel PID files
-_TUNNELS_DIR = Path.home() / '.browser-use' / 'tunnels'
+
+def _tunnels_dir() -> Path:
+ """Get tunnel metadata directory (lazy to respect BROWSER_USE_HOME)."""
+ from browser_use.skill_cli.utils import get_tunnel_dir
+
+ return get_tunnel_dir()
class TunnelManager:
@@ -67,12 +73,6 @@ class TunnelManager:
'Then retry: browser-use tunnel '
)
- def is_available(self) -> bool:
- """Check if cloudflared is available."""
- if self._binary_path:
- return True
- return shutil.which('cloudflared') is not None
-
def get_status(self) -> dict[str, Any]:
"""Get tunnel capability status for doctor command."""
system_binary = shutil.which('cloudflared')
@@ -111,12 +111,12 @@ def get_tunnel_manager() -> TunnelManager:
def _get_tunnel_file(port: int) -> Path:
"""Get the path to a tunnel's info file."""
- return _TUNNELS_DIR / f'{port}.json'
+ return _tunnels_dir() / f'{port}.json'
def _save_tunnel_info(port: int, pid: int, url: str) -> None:
"""Save tunnel info to disk."""
- _TUNNELS_DIR.mkdir(parents=True, exist_ok=True)
+ _tunnels_dir().mkdir(parents=True, exist_ok=True)
_get_tunnel_file(port).write_text(json.dumps({'port': port, 'pid': pid, 'url': url}))
@@ -146,29 +146,43 @@ def _delete_tunnel_info(port: int) -> None:
def _is_process_alive(pid: int) -> bool:
"""Check if a process is still running."""
- try:
- os.kill(pid, 0)
- return True
- except (OSError, ProcessLookupError):
- return False
+ from browser_use.skill_cli.utils import is_process_alive
+
+ return is_process_alive(pid)
def _kill_process(pid: int) -> bool:
"""Kill a process by PID. Returns True if killed, False if already dead."""
- try:
- os.kill(pid, signal.SIGTERM)
- # Give it a moment to terminate gracefully
- for _ in range(10):
- if not _is_process_alive(pid):
- return True
- import time
+ import time
- time.sleep(0.1)
- # Force kill if still alive
- os.kill(pid, signal.SIGKILL)
- return True
- except (OSError, ProcessLookupError):
- return False
+ if sys.platform == 'win32':
+ import ctypes
+
+ PROCESS_TERMINATE = 0x0001
+ handle = ctypes.windll.kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
+ if not handle:
+ return False
+ try:
+ ctypes.windll.kernel32.TerminateProcess(handle, 1)
+ for _ in range(10):
+ if not _is_process_alive(pid):
+ return True
+ time.sleep(0.1)
+ return not _is_process_alive(pid)
+ finally:
+ ctypes.windll.kernel32.CloseHandle(handle)
+ else:
+ try:
+ os.kill(pid, signal.SIGTERM)
+ for _ in range(10):
+ if not _is_process_alive(pid):
+ return True
+ time.sleep(0.1)
+ # Force kill if still alive
+ os.kill(pid, signal.SIGKILL)
+ return True
+ except (OSError, ProcessLookupError):
+ return False
# =============================================================================
@@ -200,13 +214,19 @@ async def start_tunnel(port: int) -> dict[str, Any]:
return {'error': str(e)}
# Create log file for cloudflared stderr (avoids SIGPIPE when parent exits)
- _TUNNELS_DIR.mkdir(parents=True, exist_ok=True)
- log_file_path = _TUNNELS_DIR / f'{port}.log'
+ _tunnels_dir().mkdir(parents=True, exist_ok=True)
+ log_file_path = _tunnels_dir() / f'{port}.log'
log_file = open(log_file_path, 'w') # noqa: ASYNC230
# Spawn cloudflared as a daemon
- # - start_new_session=True: survives parent exit
+ # - start_new_session / creationflags: survives parent exit
# - stderr to file: avoids SIGPIPE when parent's pipe closes
+ spawn_kwargs: dict[str, Any] = {}
+ if sys.platform == 'win32':
+ spawn_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP | subprocess.CREATE_NO_WINDOW
+ else:
+ spawn_kwargs['start_new_session'] = True
+
process = await asyncio.create_subprocess_exec(
cloudflared_binary,
'tunnel',
@@ -214,7 +234,7 @@ async def start_tunnel(port: int) -> dict[str, Any]:
f'http://localhost:{port}',
stdout=asyncio.subprocess.DEVNULL,
stderr=log_file,
- start_new_session=True,
+ **spawn_kwargs,
)
# Poll the log file until we find the tunnel URL
@@ -268,8 +288,8 @@ def list_tunnels() -> dict[str, Any]:
Dict with 'tunnels' list and 'count'
"""
tunnels = []
- if _TUNNELS_DIR.exists():
- for tunnel_file in _TUNNELS_DIR.glob('*.json'):
+ if _tunnels_dir().exists():
+ for tunnel_file in _tunnels_dir().glob('*.json'):
try:
port = int(tunnel_file.stem)
info = _load_tunnel_info(port)
@@ -298,7 +318,7 @@ async def stop_tunnel(port: int) -> dict[str, Any]:
_kill_process(pid)
_delete_tunnel_info(port)
# Clean up log file
- log_file = _TUNNELS_DIR / f'{port}.log'
+ log_file = _tunnels_dir() / f'{port}.log'
log_file.unlink(missing_ok=True)
logger.info(f'Tunnel stopped: localhost:{port}')
@@ -312,8 +332,8 @@ async def stop_all_tunnels() -> dict[str, Any]:
Dict with 'stopped' list of ports
"""
stopped = []
- if _TUNNELS_DIR.exists():
- for tunnel_file in _TUNNELS_DIR.glob('*.json'):
+ if _tunnels_dir().exists():
+ for tunnel_file in _tunnels_dir().glob('*.json'):
try:
port = int(tunnel_file.stem)
result = await stop_tunnel(port)
diff --git a/browser_use/skill_cli/utils.py b/browser_use/skill_cli/utils.py
index c999bb89c..4b4314fa7 100644
--- a/browser_use/skill_cli/utils.py
+++ b/browser_use/skill_cli/utils.py
@@ -1,52 +1,21 @@
-"""Platform utilities for CLI and server."""
+"""Platform utilities for CLI and daemon."""
-import hashlib
+import json as _json
import os
import platform
-import signal
+import re
import subprocess
import sys
-import tempfile
+import urllib.request
+import zlib
from pathlib import Path
-from typing import IO
-
-import portalocker
-def get_socket_path(session: str) -> str:
- """Get socket path for session.
+def is_process_alive(pid: int) -> bool:
+ """Check if a process is still running.
- On Windows, returns a TCP address (tcp://127.0.0.1:PORT).
- On Unix, returns a Unix socket path.
- """
- if sys.platform == 'win32':
- # Windows: use TCP on deterministic port (49152-65535)
- # Use 127.0.0.1 explicitly (not localhost) to avoid IPv6 binding issues
- port = 49152 + (int(hashlib.md5(session.encode()).hexdigest()[:4], 16) % 16383)
- return f'tcp://127.0.0.1:{port}'
- return str(Path(tempfile.gettempdir()) / f'browser-use-{session}.sock')
-
-
-def get_pid_path(session: str) -> Path:
- """Get PID file path for session."""
- return Path(tempfile.gettempdir()) / f'browser-use-{session}.pid'
-
-
-def get_log_path(session: str) -> Path:
- """Get log file path for session."""
- return Path(tempfile.gettempdir()) / f'browser-use-{session}.log'
-
-
-def get_lock_path(session: str) -> Path:
- """Get lock file path for session."""
- return Path(tempfile.gettempdir()) / f'browser-use-{session}.lock'
-
-
-def _pid_exists(pid: int) -> bool:
- """Check if a process with given PID exists.
-
- On Windows, uses ctypes to call OpenProcess (os.kill doesn't work reliably).
- On Unix, uses os.kill(pid, 0) which is the standard approach.
+ On Windows, os.kill(pid, 0) calls TerminateProcess — so we use
+ OpenProcess via ctypes instead.
"""
if sys.platform == 'win32':
import ctypes
@@ -61,145 +30,54 @@ def _pid_exists(pid: int) -> bool:
try:
os.kill(pid, 0)
return True
- except OSError:
+ except (OSError, ProcessLookupError):
return False
-def is_server_running(session: str) -> bool:
- """Check if server is running for session."""
- pid_path = get_pid_path(session)
- if not pid_path.exists():
- return False
- try:
- pid = int(pid_path.read_text().strip())
- return _pid_exists(pid)
- except (OSError, ValueError):
- # Can't read PID file or invalid PID
- return False
+def validate_session_name(session: str) -> None:
+ """Validate session name — reject path traversal and special characters.
-
-def try_acquire_server_lock(session: str) -> IO | None:
- """Try to acquire the server lock non-blocking.
-
- Returns:
- Lock file handle if acquired (caller must keep in scope to maintain lock),
- None if lock is already held by another process.
+ Raises ValueError on invalid name.
"""
- lock_path = get_lock_path(session)
- lock_path.parent.mkdir(parents=True, exist_ok=True)
- lock_path.touch(exist_ok=True)
-
- lock_file = open(lock_path, 'r+')
- try:
- portalocker.lock(lock_file, portalocker.LOCK_EX | portalocker.LOCK_NB)
- return lock_file
- except portalocker.LockException:
- lock_file.close()
- return None
+ if not re.match(r'^[a-zA-Z0-9_-]+$', session):
+ raise ValueError(f'Invalid session name {session!r}: only letters, digits, hyphens, and underscores allowed')
-def is_session_locked(session: str) -> bool:
- """Check if session has an active lock (server is holding it)."""
- lock_path = get_lock_path(session)
- if not lock_path.exists():
- return False
+def get_home_dir() -> Path:
+ """Get the browser-use home directory (~/.browser-use/).
- try:
- with open(lock_path, 'r+') as f:
- portalocker.lock(f, portalocker.LOCK_EX | portalocker.LOCK_NB)
- portalocker.unlock(f)
- return False # Lock acquired = no one holding it
- except portalocker.LockException:
- return True # Lock failed = someone holding it
- except OSError:
- return False # File access error
-
-
-def kill_orphaned_server(session: str) -> bool:
- """Kill an orphaned server (has PID file but no lock).
-
- An orphaned server is one where the process is running but it doesn't
- hold the session lock (e.g., because a newer server took over the lock
- file but didn't kill the old process).
-
- Returns:
- True if an orphan was found and killed.
+ All CLI-managed files live here: config, sockets, PIDs, binaries, tunnels.
+ Override with BROWSER_USE_HOME env var.
"""
- pid_path = get_pid_path(session)
- if not pid_path.exists():
- return False
-
- # Check if session is locked (server alive and holding lock)
- if is_session_locked(session):
- return False # Not an orphan - server is healthy
-
- # PID exists but no lock - orphan situation
- try:
- pid = int(pid_path.read_text().strip())
- if _pid_exists(pid):
- # Kill the orphaned process
- if sys.platform == 'win32':
- import ctypes
-
- PROCESS_TERMINATE = 1
- handle = ctypes.windll.kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
- if handle:
- ctypes.windll.kernel32.TerminateProcess(handle, 1)
- ctypes.windll.kernel32.CloseHandle(handle)
- else:
- os.kill(pid, signal.SIGKILL)
- return True
- except (OSError, ValueError):
- pass
-
- # Clean up stale files even if we couldn't kill (process may be gone)
- cleanup_session_files(session)
- return False
+ env = os.environ.get('BROWSER_USE_HOME')
+ if env:
+ d = Path(env).expanduser()
+ else:
+ d = Path.home() / '.browser-use'
+ d.mkdir(parents=True, exist_ok=True)
+ return d
-def find_all_sessions() -> list[str]:
- """Find all running browser-use sessions by scanning PID files."""
- sessions = []
- tmpdir = Path(tempfile.gettempdir())
- for pid_file in tmpdir.glob('browser-use-*.pid'):
- # Extract session name from filename: browser-use-{session}.pid
- name = pid_file.stem.replace('browser-use-', '', 1)
- if is_server_running(name):
- sessions.append(name)
- return sessions
+def get_socket_path(session: str = 'default') -> str:
+ """Get daemon socket path for a session.
+
+ On Windows, returns a TCP address (tcp://127.0.0.1:PORT).
+ On Unix, returns a Unix socket path.
+ """
+ if sys.platform == 'win32':
+ port = 49152 + zlib.adler32(session.encode()) % 16383
+ return f'tcp://127.0.0.1:{port}'
+ return str(get_home_dir() / f'{session}.sock')
-def cleanup_session_files(session: str) -> None:
- """Remove session socket, PID, lock, and metadata files."""
- sock_path = get_socket_path(session)
- pid_path = get_pid_path(session)
- lock_path = get_lock_path(session)
- meta_path = Path(tempfile.gettempdir()) / f'browser-use-{session}.meta'
+def get_pid_path(session: str = 'default') -> Path:
+ """Get PID file path for a session."""
+ return get_home_dir() / f'{session}.pid'
- # Remove socket file (Unix only)
- if not sock_path.startswith('tcp://'):
- try:
- os.unlink(sock_path)
- except OSError:
- pass
- # Remove PID file
- try:
- pid_path.unlink()
- except OSError:
- pass
-
- # Remove lock file
- try:
- lock_path.unlink()
- except OSError:
- pass
-
- # Remove metadata file
- try:
- meta_path.unlink()
- except OSError:
- pass
+def get_auth_token_path(session: str = 'default') -> Path:
+ """Get auth token file path for a session."""
+ return get_home_dir() / f'{session}.token'
def find_chrome_executable() -> str | None:
@@ -252,7 +130,11 @@ def get_chrome_profile_path(profile: str | None) -> str | None:
if system == 'Darwin':
return str(Path.home() / 'Library' / 'Application Support' / 'Google' / 'Chrome')
elif system == 'Linux':
- return str(Path.home() / '.config' / 'google-chrome')
+ base = Path.home() / '.config'
+ for name in ('google-chrome', 'chromium'):
+ if (base / name).is_dir():
+ return str(base / name)
+ return str(base / 'google-chrome')
elif system == 'Windows':
return os.path.expandvars(r'%LocalAppData%\Google\Chrome\User Data')
else:
@@ -263,15 +145,181 @@ def get_chrome_profile_path(profile: str | None) -> str | None:
return None
-def get_config_dir() -> Path:
- """Get browser-use config directory."""
- if sys.platform == 'win32':
- base = Path(os.environ.get('APPDATA', Path.home()))
- else:
- base = Path(os.environ.get('XDG_CONFIG_HOME', Path.home() / '.config'))
- return base / 'browser-use'
+def get_chrome_user_data_dirs() -> list[Path]:
+ """Return candidate Chrome/Chromium user-data directories for the current OS.
+
+ Covers Google Chrome, Chrome Canary, Chromium, and Brave on macOS/Linux/Windows.
+ """
+ system = platform.system()
+ home = Path.home()
+ candidates: list[Path] = []
+
+ if system == 'Darwin':
+ base = home / 'Library' / 'Application Support'
+ for name in ('Google/Chrome', 'Google/Chrome Canary', 'Chromium', 'BraveSoftware/Brave-Browser'):
+ candidates.append(base / name)
+ elif system == 'Linux':
+ base = home / '.config'
+ for name in ('google-chrome', 'google-chrome-unstable', 'chromium', 'BraveSoftware/Brave-Browser'):
+ candidates.append(base / name)
+ elif system == 'Windows':
+ local_app_data = os.environ.get('LOCALAPPDATA', str(home / 'AppData' / 'Local'))
+ base = Path(local_app_data)
+ for name in (
+ 'Google\\Chrome\\User Data',
+ 'Google\\Chrome SxS\\User Data',
+ 'Chromium\\User Data',
+ 'BraveSoftware\\Brave-Browser\\User Data',
+ ):
+ candidates.append(base / name)
+
+ return [d for d in candidates if d.is_dir()]
+
+
+def discover_chrome_cdp_url() -> str:
+ """Auto-discover a running Chrome instance's CDP WebSocket URL.
+
+ Strategy:
+ 1. Read ``DevToolsActivePort`` from known Chrome data dirs.
+ 2. Probe ``/json/version`` via HTTP to get ``webSocketDebuggerUrl``.
+ 3. If HTTP fails, construct ``ws://`` URL directly from the port file.
+ 4. Fallback: probe well-known port 9222.
+
+ Raises ``RuntimeError`` if no running Chrome with remote debugging is found.
+ """
+
+ def _probe_http(port: int) -> str | None:
+ """Try GET http://127.0.0.1:{port}/json/version and return webSocketDebuggerUrl."""
+ try:
+ req = urllib.request.Request(f'http://127.0.0.1:{port}/json/version')
+ with urllib.request.urlopen(req, timeout=2) as resp:
+ data = _json.loads(resp.read())
+ url = data.get('webSocketDebuggerUrl')
+ if url and isinstance(url, str):
+ return url
+ except Exception:
+ pass
+ return None
+
+ def _port_is_open(port: int) -> bool:
+ """Check if something is listening on 127.0.0.1:{port}."""
+ import socket
+
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ try:
+ s.settimeout(1)
+ s.connect(('127.0.0.1', port))
+ return True
+ except OSError:
+ return False
+ finally:
+ s.close()
+
+ # --- Phase 1: DevToolsActivePort files ---
+ for data_dir in get_chrome_user_data_dirs():
+ port_file = data_dir / 'DevToolsActivePort'
+ if not port_file.is_file():
+ continue
+ try:
+ lines = port_file.read_text().strip().splitlines()
+ if not lines:
+ continue
+ port = int(lines[0].strip())
+ ws_path = lines[1].strip() if len(lines) > 1 else '/devtools/browser'
+ except (ValueError, OSError):
+ continue
+
+ # Try HTTP probe first (gives us the full canonical URL)
+ ws_url = _probe_http(port)
+ if ws_url:
+ return ws_url
+
+ # HTTP may not respond (Chrome M144+), but if the port is open, trust the file
+ if _port_is_open(port):
+ return f'ws://127.0.0.1:{port}{ws_path}'
+
+ # --- Phase 2: well-known fallback ports ---
+ for port in (9222,):
+ ws_url = _probe_http(port)
+ if ws_url:
+ return ws_url
+
+ raise RuntimeError(
+ 'Could not discover a running Chrome instance with remote debugging enabled.\n'
+ 'Enable remote debugging in Chrome (chrome://inspect/#remote-debugging, or launch with --remote-debugging-port=9222) and try again.'
+ )
+
+
+def list_chrome_profiles() -> list[dict[str, str]]:
+ """List available Chrome profiles with their names.
+
+ Returns:
+	        List of dicts with 'directory' and 'name' keys, e.g.:
+ [{'directory': 'Default', 'name': 'Person 1'}, {'directory': 'Profile 1', 'name': 'Work'}]
+ """
+ import json
+
+ user_data_dir = get_chrome_profile_path(None)
+ if user_data_dir is None:
+ return []
+
+ local_state_path = Path(user_data_dir) / 'Local State'
+ if not local_state_path.exists():
+ return []
+
+ try:
+ with open(local_state_path, encoding='utf-8') as f:
+ local_state = json.load(f)
+
+ info_cache = local_state.get('profile', {}).get('info_cache', {})
+ profiles = []
+ for directory, info in info_cache.items():
+ profiles.append(
+ {
+ 'directory': directory,
+ 'name': info.get('name', directory),
+ }
+ )
+ return sorted(profiles, key=lambda p: p['directory'])
+ except (json.JSONDecodeError, KeyError, OSError):
+ return []
def get_config_path() -> Path:
"""Get browser-use config file path."""
- return get_config_dir() / 'config.json'
+ return get_home_dir() / 'config.json'
+
+
+def get_bin_dir() -> Path:
+ """Get directory for CLI-managed binaries."""
+ d = get_home_dir() / 'bin'
+ d.mkdir(parents=True, exist_ok=True)
+ return d
+
+
+def get_tunnel_dir() -> Path:
+ """Get directory for tunnel metadata and logs."""
+ return get_home_dir() / 'tunnels'
+
+
+def migrate_legacy_paths() -> None:
+	"""One-time migration of config from the legacy location (XDG on Unix, APPDATA on Windows) to ~/.browser-use/.
+
+ Copies (not moves) config.json if old location exists and new location does not.
+ """
+ new_config = get_home_dir() / 'config.json'
+ if new_config.exists():
+ return
+
+ # Check old XDG location
+ if sys.platform == 'win32':
+ old_base = Path(os.environ.get('APPDATA', Path.home()))
+ else:
+ old_base = Path(os.environ.get('XDG_CONFIG_HOME', Path.home() / '.config'))
+ old_config = old_base / 'browser-use' / 'config.json'
+
+ if old_config.exists():
+ import shutil
+
+ shutil.copy2(str(old_config), str(new_config))
+ print(f'Migrated config from {old_config} to {new_config}', file=sys.stderr)
diff --git a/browser_use/skills/service.py b/browser_use/skills/service.py
index a9da16ffe..256ff0572 100644
--- a/browser_use/skills/service.py
+++ b/browser_use/skills/service.py
@@ -4,9 +4,7 @@ import logging
import os
from typing import Any, Literal
-from browser_use_sdk import AsyncBrowserUse
-from browser_use_sdk.types.execute_skill_response import ExecuteSkillResponse
-from browser_use_sdk.types.skill_list_response import SkillListResponse
+from browser_use_sdk import AsyncBrowserUse, ExecuteSkillResponse, SkillListResponse
from cdp_use.cdp.network import Cookie
from pydantic import BaseModel, ValidationError
@@ -89,7 +87,7 @@ class SkillService:
all_items.extend(skills_response.items)
# Check if we've found all requested skills
- found_ids = {s.id for s in all_items if s.id in requested_ids}
+ found_ids = {str(s.id) for s in all_items if str(s.id) in requested_ids}
if found_ids == requested_ids:
break
@@ -114,10 +112,10 @@ class SkillService:
skills_to_load = all_available_skills
else:
# Load only the requested skill IDs
- skills_to_load = [skill for skill in all_available_skills if skill.id in requested_ids]
+ skills_to_load = [skill for skill in all_available_skills if str(skill.id) in requested_ids]
# Warn about any requested skills that weren't found
- found_ids = {skill.id for skill in skills_to_load}
+ found_ids = {str(skill.id) for skill in skills_to_load}
missing_ids = requested_ids - found_ids
if missing_ids:
logger.warning(f'Requested skills not found or not available: {missing_ids}')
@@ -272,7 +270,10 @@ class SkillService:
# Return error response
return ExecuteSkillResponse(
success=False,
+ result=None,
error=f'Failed to execute skill: {type(e).__name__}: {str(e)}',
+ stderr=None,
+ latencyMs=None,
)
async def close(self) -> None:
diff --git a/browser_use/skills/views.py b/browser_use/skills/views.py
index 9c44376bf..2421c3942 100644
--- a/browser_use/skills/views.py
+++ b/browser_use/skills/views.py
@@ -2,8 +2,7 @@
from typing import Any
-from browser_use_sdk.types.parameter_schema import ParameterSchema
-from browser_use_sdk.types.skill_response import SkillResponse
+from browser_use_sdk import ParameterSchema, SkillResponse
from pydantic import BaseModel, ConfigDict, Field
@@ -40,7 +39,7 @@ class Skill(BaseModel):
def from_skill_response(response: SkillResponse) -> 'Skill':
"""Create a Skill from SDK SkillResponse"""
return Skill(
- id=response.id,
+ id=str(response.id),
title=response.title,
description=response.description,
parameters=response.parameters,
diff --git a/browser_use/telemetry/service.py b/browser_use/telemetry/service.py
index 2cc5bcf5d..c3edfdc9d 100644
--- a/browser_use/telemetry/service.py
+++ b/browser_use/telemetry/service.py
@@ -42,7 +42,7 @@ class ProductTelemetry:
if telemetry_disabled:
self._posthog_client = None
else:
- logger.info('Using anonymized telemetry, see https://docs.browser-use.com/development/telemetry.')
+ logger.info('Using anonymized telemetry, see https://docs.browser-use.com/development/monitoring/telemetry.')
self._posthog_client = Posthog(
project_api_key=self.PROJECT_API_KEY,
host=self.HOST,
diff --git a/browser_use/telemetry/views.py b/browser_use/telemetry/views.py
index 00283b764..252b9c879 100644
--- a/browser_use/telemetry/views.py
+++ b/browser_use/telemetry/views.py
@@ -33,7 +33,7 @@ class AgentTelemetryEvent(BaseTelemetryEvent):
version: str
source: str
cdp_url: str | None
- agent_type: str | None # 'code' for CodeAgent, None for regular Agent
+ agent_type: str | None
# step details
action_errors: Sequence[str | None]
action_history: Sequence[list[dict] | None]
diff --git a/browser_use/tokens/service.py b/browser_use/tokens/service.py
index ad5508f2c..8c3a25fae 100644
--- a/browser_use/tokens/service.py
+++ b/browser_use/tokens/service.py
@@ -50,10 +50,11 @@ class TokenCost:
CACHE_DIR_NAME = 'browser_use/token_cost'
CACHE_DURATION = timedelta(days=1)
- PRICING_URL = 'https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json'
+ DEFAULT_PRICING_URL = 'https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json'
- def __init__(self, include_cost: bool = False):
+ def __init__(self, include_cost: bool = False, pricing_url: str | None = None):
self.include_cost = include_cost or os.getenv('BROWSER_USE_CALCULATE_COST', 'false').lower() == 'true'
+ self.pricing_url = pricing_url or CONFIG.BROWSER_USE_MODEL_PRICING_URL or self.DEFAULT_PRICING_URL
self.usage_history: list[TokenUsageEntry] = []
self.registered_llms: dict[str, BaseChatModel] = {}
@@ -95,9 +96,10 @@ class TokenCost:
# Check each file until we find a valid one
for cache_file in cache_files:
- if await self._is_cache_valid(cache_file):
+ is_valid, should_delete = await self._get_cache_status(cache_file)
+ if is_valid:
return cache_file
- else:
+ if should_delete:
# Clean up old cache files
try:
os.remove(cache_file)
@@ -108,19 +110,30 @@ class TokenCost:
except Exception:
return None
- async def _is_cache_valid(self, cache_file: Path) -> bool:
- """Check if a specific cache file is valid and not expired"""
+ async def _get_cache_status(self, cache_file: Path) -> tuple[bool, bool]:
+ """Return whether a cache file is usable and whether it should be deleted."""
try:
if not cache_file.exists():
- return False
+ return False, False
# Read the cached data
cached = CachedPricingData.model_validate_json(await anyio.Path(cache_file).read_text())
# Check if cache is still valid
- return datetime.now() - cached.timestamp < self.CACHE_DURATION
+ if datetime.now() - cached.timestamp >= self.CACHE_DURATION:
+ return False, True
+
+ # Keep caches from other sources so different pricing URLs don't delete each other.
+ return self._cache_source_matches(cached), False
except Exception:
- return False
+ return False, True
+
+ def _cache_source_matches(self, cached: CachedPricingData) -> bool:
+ """Only use cached pricing files from the same source URL."""
+ if cached.source_url is None:
+ return self.pricing_url == self.DEFAULT_PRICING_URL
+
+ return cached.source_url == self.pricing_url
async def _load_from_cache(self, cache_file: Path) -> None:
"""Load pricing data from a specific cache file"""
@@ -137,13 +150,13 @@ class TokenCost:
"""Fetch pricing data from LiteLLM GitHub and cache it with timestamp"""
try:
async with httpx.AsyncClient() as client:
- response = await client.get(self.PRICING_URL, timeout=30)
+ response = await client.get(self.pricing_url, timeout=30)
response.raise_for_status()
self._pricing_data = response.json()
# Create cache object with timestamp
- cached = CachedPricingData(timestamp=datetime.now(), data=self._pricing_data or {})
+ cached = CachedPricingData(timestamp=datetime.now(), source_url=self.pricing_url, data=self._pricing_data or {})
# Ensure cache directory exists
self._cache_dir.mkdir(parents=True, exist_ok=True)
@@ -250,9 +263,7 @@ class TokenCost:
# ANSI color codes
C_CYAN = '\033[96m'
- C_YELLOW = '\033[93m'
C_GREEN = '\033[92m'
- C_BLUE = '\033[94m'
C_RESET = '\033[0m'
# Always get cost breakdown for token details (even if not showing costs)
@@ -356,9 +367,9 @@ class TokenCost:
return result
- # Replace the method with our tracked version
- # Using setattr to avoid type checking issues with overloaded methods
- setattr(llm, 'ainvoke', tracked_ainvoke)
+ # Replace the method with our tracked version.
+		# Use object.__setattr__ so Pydantic-backed models don't reject the runtime patch
+ object.__setattr__(llm, 'ainvoke', tracked_ainvoke)
return llm
@@ -402,7 +413,6 @@ class TokenCost:
total_completion = sum(u.usage.completion_tokens for u in filtered_usage)
total_tokens = total_prompt + total_completion
total_prompt_cached = sum(u.usage.prompt_cached_tokens or 0 for u in filtered_usage)
- models = list({u.model for u in filtered_usage})
# Calculate per-model stats with record-by-record cost calculation
model_stats: dict[str, ModelUsageStats] = {}
@@ -555,19 +565,32 @@ class TokenCost:
await self._fetch_and_cache_pricing_data()
async def clean_old_caches(self, keep_count: int = 3) -> None:
- """Clean up old cache files, keeping only the most recent ones"""
+ """Clean up old cache files, keeping only the most recent ones from this source URL"""
try:
# List all JSON files in the cache directory
cache_files = list(self._cache_dir.glob('*.json'))
- if len(cache_files) <= keep_count:
+ if not cache_files:
+ return
+
+ # Only consider cache files from the same source URL
+ own_files: list[Path] = []
+ for cache_file in cache_files:
+ try:
+ cached = CachedPricingData.model_validate_json(cache_file.read_text())
+ if self._cache_source_matches(cached):
+ own_files.append(cache_file)
+ except Exception:
+ pass
+
+ if len(own_files) <= keep_count:
return
# Sort by modification time (oldest first)
- cache_files.sort(key=lambda f: f.stat().st_mtime)
+ own_files.sort(key=lambda f: f.stat().st_mtime)
# Remove all but the most recent files
- for cache_file in cache_files[:-keep_count]:
+ for cache_file in own_files[:-keep_count]:
try:
os.remove(cache_file)
except Exception:
diff --git a/browser_use/tokens/views.py b/browser_use/tokens/views.py
index dfa55179a..5a3deb14f 100644
--- a/browser_use/tokens/views.py
+++ b/browser_use/tokens/views.py
@@ -65,6 +65,7 @@ class CachedPricingData(BaseModel):
"""Cached pricing data with timestamp"""
timestamp: datetime
+ source_url: str | None = None
data: dict[str, Any]
diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py
index 3e9a48b05..64db52033 100644
--- a/browser_use/tools/service.py
+++ b/browser_use/tools/service.py
@@ -47,7 +47,7 @@ from browser_use.tools.views import (
InputTextAction,
NavigateAction,
NoParamsAction,
- ReadContentAction,
+ SaveAsPdfAction,
ScreenshotAction,
ScrollAction,
SearchAction,
@@ -207,9 +207,16 @@ try {
if (ATTRIBUTES && ATTRIBUTES.length > 0) {
item.attrs = {};
for (var j = 0; j < ATTRIBUTES.length; j++) {
- var val = el.getAttribute(ATTRIBUTES[j]);
+ var attrName = ATTRIBUTES[j];
+ var val;
+ // Use resolved DOM property for src/href to get absolute URLs
+ if ((attrName === 'src' || attrName === 'href') && typeof el[attrName] === 'string' && el[attrName] !== '') {
+ val = el[attrName];
+ } else {
+ val = el.getAttribute(attrName);
+ }
if (val !== null) {
- item.attrs[ATTRIBUTES[j]] = val.length > 500 ? val.slice(0, 500) + '...' : val;
+ item.attrs[attrName] = val.length > 500 ? val.slice(0, 500) + '...' : val;
}
}
}
@@ -415,6 +422,38 @@ class Tools(Generic[Context]):
await event
await event.event_result(raise_if_any=True, raise_if_none=False)
+ # Health check: detect empty DOM for http/https pages and retry once.
+ # Uses _root is None (truly blank) OR empty llm_representation() (no actionable
+ # content for the LLM, e.g. SPA not yet rendered, empty body).
+ # NOTE: llm_representation() returns a non-empty placeholder when _root is None,
+ # so we must check _root is None separately — not rely on the repr string alone.
+ def _page_appears_empty(s) -> bool:
+ return s.dom_state._root is None or not s.dom_state.llm_representation().strip()
+
+ if not params.new_tab:
+ state = await browser_session.get_browser_state_summary(include_screenshot=False)
+ url_is_http = state.url.lower().startswith(('http://', 'https://'))
+ if url_is_http and _page_appears_empty(state):
+ browser_session.logger.warning(
+ f'⚠️ Empty DOM detected after navigation to {params.url}, waiting 3s and rechecking...'
+ )
+ await asyncio.sleep(3.0)
+ state = await browser_session.get_browser_state_summary(include_screenshot=False)
+ if state.url.lower().startswith(('http://', 'https://')) and _page_appears_empty(state):
+ # Second attempt: reload the page and wait longer
+ browser_session.logger.warning(f'⚠️ Still empty after 3s, attempting page reload for {params.url}...')
+ reload_event = browser_session.event_bus.dispatch(NavigateToUrlEvent(url=params.url, new_tab=False))
+ await reload_event
+ await reload_event.event_result(raise_if_any=False, raise_if_none=False)
+ await asyncio.sleep(5.0)
+ state = await browser_session.get_browser_state_summary(include_screenshot=False)
+ if state.url.lower().startswith(('http://', 'https://')) and state.dom_state._root is None:
+ return ActionResult(
+ error=f'Page loaded but returned empty content for {params.url}. '
+ f'The page may require JavaScript that failed to render, use anti-bot measures, '
+ f'or have a connection issue (e.g. tunnel/proxy error). Try a different URL or approach.'
+ )
+
if params.new_tab:
memory = f'Opened new tab with URL {params.url}'
msg = f'🔗 Opened new tab with url {params.url}'
@@ -441,6 +480,7 @@ class Tools(Generic[Context]):
'ERR_INTERNET_DISCONNECTED',
'ERR_CONNECTION_REFUSED',
'ERR_TIMED_OUT',
+ 'ERR_TUNNEL_CONNECTION_FAILED',
'net::',
]
):
@@ -501,9 +541,7 @@ class Tools(Generic[Context]):
browser_session: BrowserSession,
tabs_before: set[str],
) -> str:
- """Detect if a click opened a new tab, and return a note for the agent.
- Waits briefly for CDP events to propagate, then checks if any new tabs appeared.
- """
+ """Detect if a click opened a new tab and automatically switch to it."""
try:
# Brief delay to allow CDP Target.attachedToTarget events to propagate
# and be processed by SessionManager._handle_target_attached
@@ -512,8 +550,16 @@ class Tools(Generic[Context]):
tabs_after = await browser_session.get_tabs()
new_tabs = [t for t in tabs_after if t.target_id not in tabs_before]
if new_tabs:
- new_tab_id = new_tabs[0].target_id[-4:]
- return f'. Note: This opened a new tab (tab_id: {new_tab_id}) - switch to it if you need to interact with the new page.'
+ new_tab = new_tabs[0]
+ new_tab_id = new_tab.target_id[-4:]
+ # Auto-switch to the new tab so the agent can immediately interact with it
+ try:
+ switch_event = browser_session.event_bus.dispatch(SwitchTabEvent(target_id=new_tab.target_id))
+ await switch_event
+ await switch_event.event_result(raise_if_any=False, raise_if_none=False)
+ return f'. Automatically switched to new tab (tab_id: {new_tab_id}).'
+ except Exception:
+ return f'. Note: This opened a new tab (tab_id: {new_tab_id}) - switch to it if you need to interact with the new page.'
except Exception:
pass
return ''
@@ -633,7 +679,7 @@ class Tools(Generic[Context]):
self._register_click_action()
@self.registry.action(
- 'Input text into element by index.',
+ 'Input text into element by index. Clears existing text by default; pass text="" to clear only, or clear=False to append.',
param_model=InputTextAction,
)
async def input(
@@ -774,49 +820,8 @@ class Tools(Generic[Context]):
node = selector_map[params.index]
- # Helper function to find file input near the selected element
- def find_file_input_near_element(
- node: EnhancedDOMTreeNode, max_height: int = 3, max_descendant_depth: int = 3
- ) -> EnhancedDOMTreeNode | None:
- """Find the closest file input to the selected element."""
-
- def find_file_input_in_descendants(n: EnhancedDOMTreeNode, depth: int) -> EnhancedDOMTreeNode | None:
- if depth < 0:
- return None
- if browser_session.is_file_input(n):
- return n
- for child in n.children_nodes or []:
- result = find_file_input_in_descendants(child, depth - 1)
- if result:
- return result
- return None
-
- current = node
- for _ in range(max_height + 1):
- # Check the current node itself
- if browser_session.is_file_input(current):
- return current
- # Check all descendants of the current node
- result = find_file_input_in_descendants(current, max_descendant_depth)
- if result:
- return result
- # Check all siblings and their descendants
- if current.parent_node:
- for sibling in current.parent_node.children_nodes or []:
- if sibling is current:
- continue
- if browser_session.is_file_input(sibling):
- return sibling
- result = find_file_input_in_descendants(sibling, max_descendant_depth)
- if result:
- return result
- current = current.parent_node
- if not current:
- break
- return None
-
# Try to find a file input element near the selected element
- file_input_node = find_file_input_near_element(node)
+ file_input_node = browser_session.find_file_input_near_element(node)
# Highlight the file input element if found (truly non-blocking)
if file_input_node:
@@ -945,7 +950,7 @@ class Tools(Generic[Context]):
)
@self.registry.action(
- """LLM extracts structured data from page markdown. Use when: on right page, know what to extract, haven't called before on same page+query. Can't get interactive elements. Set extract_links=True for URLs. Use start_from_char if previous extraction was truncated to extract data further down the page.""",
+ """LLM extracts structured data from page markdown. Use when: on right page, know what to extract, haven't called before on same page+query. Can't get interactive elements. Set extract_links=True for URLs. Set extract_images=True for image src URLs. Use start_from_char if previous extraction was truncated to extract data further down the page. When paginating across pages, pass already_collected with item identifiers (names/URLs) from prior pages to avoid duplicates.""",
param_model=ExtractAction,
)
async def extract(
@@ -959,8 +964,17 @@ class Tools(Generic[Context]):
MAX_CHAR_LIMIT = 100000
query = params['query'] if isinstance(params, dict) else params.query
extract_links = params['extract_links'] if isinstance(params, dict) else params.extract_links
+ extract_images = params.get('extract_images', False) if isinstance(params, dict) else params.extract_images
start_from_char = params['start_from_char'] if isinstance(params, dict) else params.start_from_char
output_schema: dict | None = params.get('output_schema') if isinstance(params, dict) else params.output_schema
+ already_collected: list[str] = (
+ params.get('already_collected', []) if isinstance(params, dict) else params.already_collected
+ )
+
+ # Auto-enable extract_images if query contains image-related keywords
+ _IMAGE_KEYWORDS = ['image', 'photo', 'picture', 'thumbnail', 'img url', 'image url', 'photo url', 'product image']
+ if not extract_images and any(kw in query.lower() for kw in _IMAGE_KEYWORDS):
+ extract_images = True
# If the LLM didn't provide an output_schema, use the agent-injected extraction_schema
if output_schema is None and extraction_schema is not None:
@@ -982,7 +996,7 @@ class Tools(Generic[Context]):
from browser_use.dom.markdown_extractor import extract_clean_markdown
content, content_stats = await extract_clean_markdown(
- browser_session=browser_session, extract_links=extract_links
+ browser_session=browser_session, extract_links=extract_links, extract_images=extract_images
)
except Exception as e:
raise RuntimeError(f'Could not extract clean markdown: {type(e).__name__}')
@@ -1047,15 +1061,20 @@ You will be given a query, a JSON Schema, and the markdown of a webpage that has
- Your response MUST conform to the provided JSON Schema exactly.
- If a required field's value cannot be found on the page, use null (if the schema allows it) or an empty string / empty array as appropriate.
- If the content was truncated, extract what is available from the visible portion.
+- If items are provided, skip any items whose name/title/URL matches those listed — do not include duplicates.
""".strip()
schema_json = json.dumps(output_schema, indent=2)
+ already_collected_section = ''
+ if already_collected:
+ items_str = '\n'.join(f'- {item}' for item in already_collected[:100])
+ already_collected_section = f'\n\n\nSkip items whose name/title/URL matches any of these already-collected identifiers:\n{items_str}\n '
prompt = (
f'\n{query}\n \n\n'
f'\n{schema_json}\n \n\n'
f'\n{stats_summary}\n \n\n'
- f'\n{content}\n '
+ f'\n{content}\n ' + already_collected_section
)
try:
@@ -1119,6 +1138,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
- If the information relevant to the query is not available in the page, your response should mention that.
- If the query asks for all items, products, etc., make sure to directly list all of them.
- If the content was truncated and you need more information, note that the user can use start_from_char parameter to continue from where truncation occurred.
+- If items are provided, exclude any results whose name/title/URL matches those already collected — do not include duplicates.
@@ -1127,7 +1147,14 @@ You will be given a query and the markdown of a webpage that has been filtered t
""".strip()
- prompt = f'\n{query}\n \n\n\n{stats_summary}\n \n\n\n{content}\n '
+ already_collected_section = ''
+ if already_collected:
+ items_str = '\n'.join(f'- {item}' for item in already_collected[:100])
+ already_collected_section = f'\n\n\nSkip items whose name/title/URL matches any of these already-collected identifiers:\n{items_str}\n '
+ prompt = (
+ f'\n{query}\n \n\n\n{stats_summary}\n \n\n\n{content}\n '
+ + already_collected_section
+ )
try:
response = await asyncio.wait_for(
@@ -1417,6 +1444,96 @@ You will be given a query and the markdown of a webpage that has been filtered t
metadata={'include_screenshot': True},
)
+ # PDF Actions
+
+ @self.registry.action(
+ 'Save the current page as a PDF file. Returns the file path of the saved PDF. '
+ 'Use this to capture the full page content (including content below the fold) as a printable document.',
+ param_model=SaveAsPdfAction,
+ )
+ async def save_as_pdf(
+ params: SaveAsPdfAction,
+ browser_session: BrowserSession,
+ file_system: FileSystem,
+ ):
+ """Save the current page as a PDF using CDP Page.printToPDF."""
+ import base64
+ import re
+
+ # Paper format dimensions in inches (width, height)
+ paper_sizes: dict[str, tuple[float, float]] = {
+ 'letter': (8.5, 11),
+ 'legal': (8.5, 14),
+ 'a4': (8.27, 11.69),
+ 'a3': (11.69, 16.54),
+ 'tabloid': (11, 17),
+ }
+
+ paper_key = params.paper_format.lower()
+ if paper_key not in paper_sizes:
+ paper_key = 'letter'
+ paper_width, paper_height = paper_sizes[paper_key]
+
+ cdp_session = await browser_session.get_or_create_cdp_session(focus=True)
+
+ result = await asyncio.wait_for(
+ cdp_session.cdp_client.send.Page.printToPDF(
+ params={
+ 'printBackground': params.print_background,
+ 'landscape': params.landscape,
+ 'scale': params.scale,
+ 'paperWidth': paper_width,
+ 'paperHeight': paper_height,
+ 'preferCSSPageSize': True,
+ },
+ session_id=cdp_session.session_id,
+ ),
+ timeout=30.0,
+ )
+
+ pdf_data = result.get('data')
+ assert pdf_data, 'CDP Page.printToPDF returned no data'
+
+ pdf_bytes = base64.b64decode(pdf_data)
+
+ # Determine filename
+ if params.file_name:
+ file_name = params.file_name
+ else:
+ try:
+ page_title = await asyncio.wait_for(browser_session.get_current_page_title(), timeout=2.0)
+ safe_title = re.sub(r'[^\w\s-]', '', page_title).strip()[:50]
+ file_name = safe_title if safe_title else 'page'
+ except Exception:
+ file_name = 'page'
+
+ if not file_name.lower().endswith('.pdf'):
+ file_name = f'{file_name}.pdf'
+ file_name = FileSystem.sanitize_filename(file_name)
+
+ file_path = file_system.get_dir() / file_name
+ # Handle duplicate filenames
+ if file_path.exists():
+ base, ext = os.path.splitext(file_name)
+ counter = 1
+ while (file_system.get_dir() / f'{base} ({counter}){ext}').exists():
+ counter += 1
+ file_name = f'{base} ({counter}){ext}'
+ file_path = file_system.get_dir() / file_name
+
+ async with await anyio.open_file(file_path, 'wb') as f:
+ await f.write(pdf_bytes)
+
+ file_size = file_path.stat().st_size
+ msg = f'Saved page as PDF: {file_name} ({file_size:,} bytes)'
+ logger.info(f'📄 {msg}. Full path: {file_path}')
+
+ return ActionResult(
+ extracted_content=msg,
+ long_term_memory=f'{msg}. Full path: {file_path}',
+ attachments=[str(file_path)],
+ )
+
# Dropdown Actions
@self.registry.action(
@@ -1571,277 +1688,9 @@ You will be given a query and the markdown of a webpage that has been filtered t
include_extracted_content_only_once=True,
)
- # Intelligent content reading
-
@self.registry.action(
- 'Intelligently read long content to find specific information. Works on current page (source="page") or files. For large content, uses search to identify relevant sections. Best for long articles, documents, or any content where you know what you are looking for.',
- param_model=ReadContentAction,
- )
- async def read_long_content(
- params: ReadContentAction,
- browser_session: BrowserSession,
- page_extraction_llm: BaseChatModel,
- available_file_paths: list[str],
- ):
- import re
-
- from browser_use.llm.messages import UserMessage
-
- goal = params.goal
- context = params.context
- source = params.source
- max_chars = 50000
-
- async def extract_search_terms(goal: str, context: str) -> list[str]:
- """Use LLM to extract search terms from goal."""
- prompt = f"""Extract 3-5 key search terms from this goal that would help find relevant sections.
-Return only the terms, one per line, no numbering or bullets.
-
-Goal: {goal}
-
-Context: {context}"""
- response = await page_extraction_llm.ainvoke([UserMessage(content=prompt)])
- return [term.strip() for term in response.completion.strip().split('\n') if term.strip()][:5]
-
- def search_text(content: str, pattern: str, context_chars: int = 100) -> list[dict]:
- """Search content for pattern, return matches with positions."""
- try:
- regex = re.compile(pattern, re.IGNORECASE)
- except re.error:
- regex = re.compile(re.escape(pattern), re.IGNORECASE)
-
- matches = []
- for match in regex.finditer(content):
- start = max(0, match.start() - context_chars)
- end = min(len(content), match.end() + context_chars)
- matches.append(
- {
- 'position': match.start(),
- 'snippet': content[start:end],
- }
- )
- return matches
-
- def chunk_content(content: str, chunk_size: int = 2000) -> list[dict]:
- """Split content into chunks with positions."""
- chunks = []
- for i in range(0, len(content), chunk_size):
- chunks.append(
- {
- 'start': i,
- 'end': min(i + chunk_size, len(content)),
- 'text': content[i : i + chunk_size],
- }
- )
- return chunks
-
- try:
- if source.lower() == 'page':
- # Read from current webpage
- from browser_use.dom.markdown_extractor import extract_clean_markdown
-
- # Clear DOM cache and wait for page to settle before extracting
- if browser_session._dom_watchdog:
- browser_session._dom_watchdog.clear_cache()
-
- wait_time = browser_session.browser_profile.wait_for_network_idle_page_load_time
- await asyncio.sleep(wait_time)
-
- content, _ = await extract_clean_markdown(browser_session=browser_session, extract_links=False)
- source_name = 'current page'
-
- if not content:
- return ActionResult(
- extracted_content='Error: No page content available',
- long_term_memory='Failed to read page: no content',
- )
-
- else:
- # Read from file
- file_path = source
-
- # Validate file path against whitelist (available_file_paths + downloaded files)
- allowed_paths = set(available_file_paths or [])
- allowed_paths.update(browser_session.downloaded_files)
- if file_path not in allowed_paths:
- return ActionResult(
- extracted_content=f'Error: File path not in available_file_paths: {file_path}. '
- f'The user must add this path to available_file_paths when creating the Agent.',
- long_term_memory=f'Failed to read: file path not allowed: {file_path}',
- )
-
- if not os.path.exists(file_path):
- return ActionResult(
- extracted_content=f'Error: File not found: {file_path}',
- long_term_memory='Failed to read: file not found',
- )
-
- ext = os.path.splitext(file_path)[1].lower()
- source_name = os.path.basename(file_path)
-
- if ext == '.pdf':
- # Read PDF directly using pypdf
- import pypdf
-
- reader = pypdf.PdfReader(file_path)
- num_pages = len(reader.pages)
-
- # Extract all page text
- page_texts: list[str] = []
- total_chars = 0
- for page in reader.pages:
- text = page.extract_text() or ''
- page_texts.append(text)
- total_chars += len(text)
-
- # If PDF is small enough, return it all
- if total_chars <= max_chars:
- content_parts = []
- for i, text in enumerate(page_texts, 1):
- if text.strip():
- content_parts.append(f'--- Page {i} ---\n{text}')
- content = '\n\n'.join(content_parts)
-
- memory = f'Read {source_name} ({num_pages} pages, {total_chars:,} chars) for goal: {goal[:50]}'
- logger.info(f'📄 {memory}')
- return ActionResult(
- extracted_content=f'PDF: {source_name} ({num_pages} pages)\n\n{content}',
- long_term_memory=memory,
- include_extracted_content_only_once=True,
- )
-
- # PDF too large - use intelligent extraction
- logger.info(f'PDF has {total_chars:,} chars across {num_pages} pages, using intelligent extraction')
-
- # Extract search terms from goal
- search_terms = await extract_search_terms(goal, context)
-
- # Search and score pages by relevance
- page_scores: dict[int, int] = {} # 1-indexed page -> score
- for term in search_terms:
- try:
- term_pattern = re.compile(re.escape(term), re.IGNORECASE)
- except re.error:
- continue
- for i, text in enumerate(page_texts, 1):
- if term_pattern.search(text):
- page_scores[i] = page_scores.get(i, 0) + 1
-
- # Select pages: always include page 1, then most relevant
- pages_to_read = [1]
- sorted_pages = sorted(page_scores.items(), key=lambda x: -x[1])
- for page_num, _ in sorted_pages:
- if page_num not in pages_to_read:
- pages_to_read.append(page_num)
-
- # Build result respecting char limit, truncating pages if needed
- content_parts = []
- chars_used = 0
- pages_included = []
- for page_num in sorted(set(pages_to_read)):
- text = page_texts[page_num - 1]
- page_header = f'--- Page {page_num} ---\n'
- remaining = max_chars - chars_used
- if remaining < len(page_header) + 50:
- break # no room for meaningful content
- page_content = page_header + text
- if len(page_content) > remaining:
- page_content = page_content[: remaining - len('\n[...truncated]')] + '\n[...truncated]'
- content_parts.append(page_content)
- chars_used += len(page_content)
- pages_included.append(page_num)
-
- content = '\n\n'.join(content_parts)
- memory = f'Read {source_name} ({len(pages_included)} relevant pages of {num_pages}) for goal: {goal[:50]}'
- logger.info(f'📄 {memory}')
- return ActionResult(
- extracted_content=f'PDF: {source_name} ({num_pages} pages, showing {len(pages_included)} relevant)\n\n{content}',
- long_term_memory=memory,
- include_extracted_content_only_once=True,
- )
-
- else:
- # Text file
- async with await anyio.open_file(file_path, 'r', encoding='utf-8', errors='ignore') as f:
- content = await f.read()
-
- # Check if content fits in budget
- if len(content) <= max_chars:
- memory = f'Read {source_name} ({len(content):,} chars) for goal: {goal[:50]}'
- logger.info(f'📄 {memory}')
- return ActionResult(
- extracted_content=f'Content from {source_name} ({len(content):,} chars):\n\n{content}',
- long_term_memory=memory,
- include_extracted_content_only_once=True,
- )
-
- # Content too large - use intelligent extraction
- logger.info(f'Content has {len(content):,} chars, using intelligent extraction')
-
- # Extract search terms from goal
- search_terms = await extract_search_terms(goal, context)
-
- # Search for each term and score chunks
- chunks = chunk_content(content, chunk_size=2000)
- chunk_scores: dict[int, int] = {} # chunk index -> relevance score
-
- for term in search_terms:
- matches = search_text(content, term)
- for match in matches:
- # Find which chunk this match belongs to
- for i, chunk in enumerate(chunks):
- if chunk['start'] <= match['position'] < chunk['end']:
- chunk_scores[i] = chunk_scores.get(i, 0) + 1
- break
-
- if not chunk_scores:
- # No matches - return first max_chars
- truncated = content[:max_chars]
- memory = f'Read {source_name} (truncated to {max_chars:,} chars, no matches for search terms)'
- logger.info(f'📄 {memory}')
- return ActionResult(
- extracted_content=f'Content from {source_name} (first {max_chars:,} of {len(content):,} chars):\n\n{truncated}',
- long_term_memory=memory,
- include_extracted_content_only_once=True,
- )
-
- # Sort chunks by relevance and collect most relevant ones
- sorted_chunks = sorted(chunk_scores.items(), key=lambda x: -x[1])
-
- # Always include first chunk for context
- selected_indices = {0} # Start with first chunk
- for chunk_idx, _ in sorted_chunks:
- selected_indices.add(chunk_idx)
-
- # Build result from selected chunks in order
- result_parts = []
- total_chars = 0
- for i in sorted(selected_indices):
- chunk = chunks[i]
- if total_chars + len(chunk['text']) > max_chars:
- break
- if i > 0 and (i - 1) not in selected_indices:
- result_parts.append('\n[...]\n') # Indicate gap
- result_parts.append(chunk['text'])
- total_chars += len(chunk['text'])
-
- result_content = ''.join(result_parts)
- memory = f'Read {source_name} ({len(selected_indices)} relevant sections of {len(chunks)}) for goal: {goal[:50]}'
- logger.info(f'📄 {memory}')
-
- return ActionResult(
- extracted_content=f'Content from {source_name} (relevant sections, {total_chars:,} of {len(content):,} chars):\n\n{result_content}',
- long_term_memory=memory,
- include_extracted_content_only_once=True,
- )
-
- except Exception as e:
- error_msg = f'Error reading content: {str(e)}'
- logger.error(error_msg)
- return ActionResult(extracted_content=error_msg, long_term_memory=error_msg)
-
- @self.registry.action(
- """Execute browser JavaScript. Best practice: wrap in IIFE (function(){...})() with try-catch for safety. Use ONLY browser APIs (document, window, DOM). NO Node.js APIs (fs, require, process). Example: (function(){try{const el=document.querySelector('#id');return el?el.value:'not found'}catch(e){return 'Error: '+e.message}})() Avoid comments. Use for hover, drag, zoom, custom selectors, extract/filter links, shadow DOM, or analysing page structure. Limit output size.""",
+ """Execute browser JavaScript. Best practice: wrap in IIFE (function(){...})() with try-catch for safety. Use ONLY browser APIs (document, window, DOM). NO Node.js APIs (fs, require, process). Example: (function(){try{const el=document.querySelector('#id');return el?el.value:'not found'}catch(e){return 'Error: '+e.message}})() Avoid comments. Use for hover, drag, zoom, custom selectors, extract/filter links, or analysing page structure. IMPORTANT: Shadow DOM elements with [index] markers can be clicked directly with click(index) — do NOT use evaluate() to click them. Only use evaluate for shadow DOM elements that are NOT indexed. Limit output size.""",
+ terminates_sequence=True,
)
async def evaluate(code: str, browser_session: BrowserSession):
# Execute JavaScript with proper error handling and promise support
@@ -2022,22 +1871,41 @@ Validated Code (after quote fixing):
'Complete task with structured output.',
param_model=StructuredOutputAction[output_model],
)
- async def done(params: StructuredOutputAction):
+ async def done(params: StructuredOutputAction, file_system: FileSystem, browser_session: BrowserSession):
# Exclude success from the output JSON
# Use mode='json' to properly serialize enums at all nesting levels
output_dict = params.data.model_dump(mode='json')
+ attachments: list[str] = []
+
+ # 1. Resolve any explicitly requested files via files_to_display
+ if params.files_to_display:
+ for file_name in params.files_to_display:
+ file_content = file_system.display_file(file_name)
+ if file_content:
+ attachments.append(str(file_system.get_dir() / file_name))
+
+ # 2. Auto-attach actual session downloads (CDP-tracked browser downloads)
+ # but NOT user-supplied whitelist paths from available_file_paths
+ session_downloads = browser_session.downloaded_files
+ if session_downloads:
+ existing = set(attachments)
+ for file_path in session_downloads:
+ if file_path not in existing:
+ attachments.append(file_path)
+
return ActionResult(
is_done=True,
success=params.success,
extracted_content=json.dumps(output_dict, ensure_ascii=False),
long_term_memory=f'Task completed. Success Status: {params.success}',
+ attachments=attachments,
)
else:
@self.registry.action(
- 'Complete task.',
+ 'Complete task. Only report actions you performed and data you extracted in this session.',
param_model=DoneAction,
)
async def done(params: DoneAction, file_system: FileSystem):
@@ -2290,306 +2158,3 @@ Validated Code (after quote fixing):
# Alias for backwards compatibility
Controller = Tools
-
-
-class CodeAgentTools(Tools[Context]):
- """Specialized Tools for CodeAgent agent optimized for Python-based browser automation.
-
- Includes:
- - All browser interaction tools (click, input, scroll, navigate, etc.)
- - JavaScript evaluation
- - Tab management (switch, close)
- - Navigation actions (go_back)
- - Upload file support
- - Dropdown interactions
-
- Excludes (optimized for code-use mode):
- - extract: Use Python + evaluate() instead
- - find_text: Use Python string operations
- - screenshot: Not needed in code-use mode
- - search: Use navigate() directly
- - File system actions (write_file, read_file, replace_file): Use Python file operations instead
- """
-
- def __init__(
- self,
- exclude_actions: list[str] | None = None,
- output_model: type[T] | None = None,
- display_files_in_done_text: bool = True,
- ):
- # Default exclusions for CodeAgent agent
- if exclude_actions is None:
- exclude_actions = [
- # 'scroll', # Keep for code-use
- 'extract', # Exclude - use Python + evaluate()
- 'find_text', # Exclude - use Python string ops
- # 'select_dropdown', # Keep for code-use
- # 'dropdown_options', # Keep for code-use
- 'screenshot', # Exclude - not needed
- 'search', # Exclude - use navigate() directly
- # 'click', # Keep for code-use
- # 'input', # Keep for code-use
- # 'switch', # Keep for code-use
- # 'send_keys', # Keep for code-use
- # 'close', # Keep for code-use
- # 'go_back', # Keep for code-use
- # 'upload_file', # Keep for code-use
- # Exclude file system actions - CodeAgent should use Python file operations
- 'write_file',
- 'read_file',
- 'replace_file',
- ]
-
- super().__init__(
- exclude_actions=exclude_actions,
- output_model=output_model,
- display_files_in_done_text=display_files_in_done_text,
- )
-
- # Override done action for CodeAgent with enhanced file handling
- self._register_code_use_done_action(output_model, display_files_in_done_text)
-
- def _register_code_use_done_action(self, output_model: type[T] | None, display_files_in_done_text: bool = True):
- """Register enhanced done action for CodeAgent that can read files from disk."""
- if output_model is not None:
- # Structured output done - use parent's implementation
- return
-
- # Override the done action with enhanced version
- @self.registry.action(
- 'Complete task.',
- param_model=DoneAction,
- )
- async def done(params: DoneAction, file_system: FileSystem):
- user_message = params.text
-
- len_text = len(params.text)
- len_max_memory = 100
- memory = f'Task completed: {params.success} - {params.text[:len_max_memory]}'
- if len_text > len_max_memory:
- memory += f' - {len_text - len_max_memory} more characters'
-
- attachments = []
- if params.files_to_display:
- if self.display_files_in_done_text:
- file_msg = ''
- for file_name in params.files_to_display:
- file_content = file_system.display_file(file_name)
- if file_content:
- file_msg += f'\n\n{file_name}:\n{file_content}'
- attachments.append(file_name)
- elif os.path.exists(file_name):
- # File exists on disk but not in FileSystem - just add to attachments
- attachments.append(file_name)
- if file_msg:
- user_message += '\n\nAttachments:'
- user_message += file_msg
- else:
- logger.warning('Agent wanted to display files but none were found')
- else:
- for file_name in params.files_to_display:
- file_content = file_system.display_file(file_name)
- if file_content:
- attachments.append(file_name)
- elif os.path.exists(file_name):
- attachments.append(file_name)
-
- # Convert relative paths to absolute paths - handle both FileSystem-managed and regular files
- resolved_attachments = []
- for file_name in attachments:
- if os.path.isabs(file_name):
- # Already absolute
- resolved_attachments.append(file_name)
- elif file_system.get_file(file_name):
- # Managed by FileSystem
- resolved_attachments.append(str(file_system.get_dir() / file_name))
- elif os.path.exists(file_name):
- # Regular file in current directory
- resolved_attachments.append(os.path.abspath(file_name))
- else:
- # File doesn't exist, but include the path anyway for error visibility
- resolved_attachments.append(str(file_system.get_dir() / file_name))
- attachments = resolved_attachments
-
- return ActionResult(
- is_done=True,
- success=params.success,
- extracted_content=user_message,
- long_term_memory=memory,
- attachments=attachments,
- )
-
- # Override upload_file for code agent with relaxed path validation
- @self.registry.action(
- 'Upload a file to a file input element. For code-use mode, any file accessible from the current directory can be uploaded.',
- param_model=UploadFileAction,
- )
- async def upload_file(
- params: UploadFileAction,
- browser_session: BrowserSession,
- available_file_paths: list[str],
- file_system: FileSystem,
- ):
- # Path validation logic for code-use mode:
- # 1. If available_file_paths provided (security mode), enforce it as a whitelist
- # 2. If no whitelist, for local browsers just check file exists
- # 3. For remote browsers, allow any path (assume it exists remotely)
-
- # If whitelist provided, validate path is in it
- if available_file_paths:
- if params.path not in available_file_paths:
- # Also check if it's a recently downloaded file
- downloaded_files = browser_session.downloaded_files
- if params.path not in downloaded_files:
- # Finally, check if it's a file in the FileSystem service (if provided)
- if file_system is not None and file_system.get_dir():
- # Check if the file is actually managed by the FileSystem service
- # The path should be just the filename for FileSystem files
- file_obj = file_system.get_file(params.path)
- if file_obj:
- # File is managed by FileSystem, construct the full path
- file_system_path = str(file_system.get_dir() / params.path)
- params = UploadFileAction(index=params.index, path=file_system_path)
- else:
- # If browser is remote, allow passing a remote-accessible absolute path
- if not browser_session.is_local:
- pass
- else:
- msg = f'File path {params.path} is not available. To fix: add this file path to the available_file_paths parameter when creating the Agent. Example: Agent(task="...", llm=llm, browser=browser, available_file_paths=["{params.path}"])'
- logger.error(f'❌ {msg}')
- return ActionResult(error=msg)
- else:
- # If browser is remote, allow passing a remote-accessible absolute path
- if not browser_session.is_local:
- pass
- else:
- msg = f'File path {params.path} is not available. To fix: add this file path to the available_file_paths parameter when creating the Agent. Example: Agent(task="...", llm=llm, browser=browser, available_file_paths=["{params.path}"])'
- logger.error(f'❌ {msg}')
- return ActionResult(error=msg)
-
- # For local browsers, ensure the file exists on the local filesystem
- if browser_session.is_local:
- if not os.path.exists(params.path):
- msg = f'File {params.path} does not exist'
- return ActionResult(error=msg)
-
- # Get the selector map to find the node
- selector_map = await browser_session.get_selector_map()
- if params.index not in selector_map:
- msg = f'Element with index {params.index} does not exist.'
- return ActionResult(error=msg)
-
- node = selector_map[params.index]
-
- # Helper function to find file input near the selected element
- def find_file_input_near_element(
- node: EnhancedDOMTreeNode, max_height: int = 3, max_descendant_depth: int = 3
- ) -> EnhancedDOMTreeNode | None:
- """Find the closest file input to the selected element."""
-
- def find_file_input_in_descendants(n: EnhancedDOMTreeNode, depth: int) -> EnhancedDOMTreeNode | None:
- if depth < 0:
- return None
- if browser_session.is_file_input(n):
- return n
- for child in n.children_nodes or []:
- result = find_file_input_in_descendants(child, depth - 1)
- if result:
- return result
- return None
-
- current = node
- for _ in range(max_height + 1):
- # Check the current node itself
- if browser_session.is_file_input(current):
- return current
- # Check all descendants of the current node
- result = find_file_input_in_descendants(current, max_descendant_depth)
- if result:
- return result
- # Check all siblings and their descendants
- if current.parent_node:
- for sibling in current.parent_node.children_nodes or []:
- if sibling is current:
- continue
- if browser_session.is_file_input(sibling):
- return sibling
- result = find_file_input_in_descendants(sibling, max_descendant_depth)
- if result:
- return result
- current = current.parent_node
- if not current:
- break
- return None
-
- # Try to find a file input element near the selected element
- file_input_node = find_file_input_near_element(node)
-
- # Highlight the file input element if found (truly non-blocking)
- if file_input_node:
- create_task_with_error_handling(
- browser_session.highlight_interaction_element(file_input_node),
- name='highlight_file_input',
- suppress_exceptions=True,
- )
-
- # If not found near the selected element, fallback to finding the closest file input to current scroll position
- if file_input_node is None:
- logger.info(
- f'No file upload element found near index {params.index}, searching for closest file input to scroll position'
- )
-
- # Get current scroll position
- cdp_session = await browser_session.get_or_create_cdp_session()
- try:
- scroll_info = await cdp_session.cdp_client.send.Runtime.evaluate(
- params={'expression': 'window.scrollY || window.pageYOffset || 0'}, session_id=cdp_session.session_id
- )
- current_scroll_y = scroll_info.get('result', {}).get('value', 0)
- except Exception:
- current_scroll_y = 0
-
- # Find all file inputs in the selector map and pick the closest one to scroll position
- closest_file_input = None
- min_distance = float('inf')
-
- for idx, element in selector_map.items():
- if browser_session.is_file_input(element):
- # Get element's Y position
- if element.absolute_position:
- element_y = element.absolute_position.y
- distance = abs(element_y - current_scroll_y)
- if distance < min_distance:
- min_distance = distance
- closest_file_input = element
-
- if closest_file_input:
- file_input_node = closest_file_input
- logger.info(f'Found file input closest to scroll position (distance: {min_distance}px)')
-
- # Highlight the fallback file input element (truly non-blocking)
- create_task_with_error_handling(
- browser_session.highlight_interaction_element(file_input_node),
- name='highlight_file_input_fallback',
- suppress_exceptions=True,
- )
- else:
- msg = 'No file upload element found on the page'
- logger.error(msg)
- raise BrowserError(msg)
- # TODO: figure out why this fails sometimes + add fallback hail mary, just look for any file input on page
-
- # Dispatch upload file event with the file input node
- try:
- event = browser_session.event_bus.dispatch(UploadFileEvent(node=file_input_node, file_path=params.path))
- await event
- await event.event_result(raise_if_any=True, raise_if_none=False)
- msg = f'Successfully uploaded file to index {params.index}'
- logger.info(f'📁 {msg}')
- return ActionResult(
- extracted_content=msg,
- long_term_memory=f'Uploaded file {params.path} to element {params.index}',
- )
- except Exception as e:
- logger.error(f'Failed to upload file: {e}')
- raise BrowserError(f'Failed to upload file: {e}')
diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py
index 27c4ada6e..02b274ed9 100644
--- a/browser_use/tools/views.py
+++ b/browser_use/tools/views.py
@@ -10,6 +10,10 @@ class ExtractAction(BaseModel):
extract_links: bool = Field(
default=False, description='Set True to true if the query requires links, else false to safe tokens'
)
+ extract_images: bool = Field(
+ default=False,
+ description='Set True to include image src URLs in extracted markdown. Auto-enabled when query contains image-related keywords.',
+ )
start_from_char: int = Field(
default=0, description='Use this for long markdowns to start from a specific character (not index in browser_state)'
)
@@ -17,6 +21,10 @@ class ExtractAction(BaseModel):
default=None,
description='Optional JSON Schema dict. When provided, extraction returns validated JSON matching this schema instead of free-text.',
)
+ already_collected: list[str] = Field(
+ default_factory=list,
+ description='Item identifiers (name, URL, or ID) already collected in prior extract calls on other pages. The extractor will skip items matching these to prevent duplicates. Use when paginating across multiple pages.',
+ )
class SearchPageAction(BaseModel):
@@ -74,12 +82,21 @@ class ClickElementActionIndexOnly(BaseModel):
class InputTextAction(BaseModel):
index: int = Field(ge=0, description='from browser_state')
- text: str
- clear: bool = Field(default=True, description='1=clear, 0=append')
+ text: str = Field(description='Text to enter. With clear=True, text="" clears the field without typing.')
+ clear: bool = Field(default=True, description='Clear existing text before typing. Set to False to append instead.')
class DoneAction(BaseModel):
- text: str = Field(description='Final user message in the format the user requested')
+ text: str = Field(
+ description=(
+ 'Final message to the user. '
+ 'ONLY report data you directly observed in browser_state, tool outputs, or screenshots during this session. '
+ 'Do NOT use training knowledge to fill gaps — if information was not found on the page, say so explicitly. '
+ 'Do NOT claim completion of steps from compacted_memory or prior session summaries '
+ 'unless you explicitly verified them yourself. '
+ 'If uncertain whether a prior step completed, say so explicitly.'
+ )
+ )
success: bool = Field(default=True, description='True if user_request completed successfully')
files_to_display: list[str] | None = Field(default=[])
@@ -87,16 +104,19 @@ class DoneAction(BaseModel):
T = TypeVar('T', bound=BaseModel)
-def _hide_success_from_schema(schema: dict) -> None:
- """Remove 'success' from the JSON schema to avoid field name collisions with user models."""
- schema.get('properties', {}).pop('success', None)
+def _hide_internal_fields_from_schema(schema: dict) -> None:
+ """Remove internal fields from the JSON schema to avoid collisions with user models."""
+ props = schema.get('properties', {})
+ props.pop('success', None)
+ props.pop('files_to_display', None)
class StructuredOutputAction(BaseModel, Generic[T]):
- model_config = ConfigDict(json_schema_extra=_hide_success_from_schema)
+ model_config = ConfigDict(json_schema_extra=_hide_internal_fields_from_schema)
success: bool = Field(default=True, description='True if user_request completed successfully')
data: T = Field(description='The actual output data matching the requested schema')
+ files_to_display: list[str] | None = Field(default=[])
class SwitchTabAction(BaseModel):
@@ -138,15 +158,18 @@ class ScreenshotAction(BaseModel):
)
-class ReadContentAction(BaseModel):
- """Action for intelligent reading of long content."""
-
- goal: str = Field(description='What to look for or extract from the content')
- source: str = Field(
- default='page',
- description='What to read: "page" for current webpage, or a file path',
+class SaveAsPdfAction(BaseModel):
+ file_name: str | None = Field(
+ default=None,
+ description='Output PDF filename (without path). Defaults to page title. Extension .pdf is added automatically if missing.',
+ )
+ print_background: bool = Field(default=True, description='Include background graphics and colors')
+ landscape: bool = Field(default=False, description='Use landscape orientation')
+ scale: float = Field(default=1.0, ge=0.1, le=2.0, description='Scale of the webpage rendering (0.1 to 2.0)')
+ paper_format: str = Field(
+ default='Letter',
+ description='Paper size: Letter, Legal, A4, A3, or Tabloid',
)
- context: str = Field(default='', description='Additional context about the task')
class GetDropdownOptionsAction(BaseModel):
diff --git a/browser_use/utils.py b/browser_use/utils.py
index 1baac45d5..a949aa77d 100644
--- a/browser_use/utils.py
+++ b/browser_use/utils.py
@@ -31,6 +31,30 @@ _openai_bad_request_error: type | None = None
_groq_bad_request_error: type | None = None
+def collect_sensitive_data_values(sensitive_data: dict[str, str | dict[str, str]] | None) -> dict[str, str]:
+ """Flatten legacy and domain-scoped sensitive data into placeholder -> value mappings."""
+ if not sensitive_data:
+ return {}
+
+ sensitive_values: dict[str, str] = {}
+ for key_or_domain, content in sensitive_data.items():
+ if isinstance(content, dict):
+ for key, val in content.items():
+ if val:
+ sensitive_values[key] = val
+ elif content:
+ sensitive_values[key_or_domain] = content
+
+ return sensitive_values
+
+
+def redact_sensitive_string(value: str, sensitive_values: dict[str, str]) -> str:
+ """Replace sensitive values with placeholders, longest matches first to avoid partial leaks."""
+ for key, secret in sorted(sensitive_values.items(), key=lambda item: len(item[1]), reverse=True):
+ value = value.replace(secret, f'{key} ')
+ return value
+
+
def _get_openai_bad_request_error() -> type | None:
"""Lazy loader for OpenAI BadRequestError."""
global _openai_bad_request_error
@@ -77,6 +101,7 @@ class SignalHandler:
- Management of event loop state across signals
- Standardized handling of first and second Ctrl+C presses
- Cross-platform compatibility (with simplified behavior on Windows)
+ - Option to disable signal handling for embedding in applications that manage their own signals
"""
def __init__(
@@ -87,6 +112,7 @@ class SignalHandler:
custom_exit_callback: Callable[[], None] | None = None,
exit_on_second_int: bool = True,
interruptible_task_patterns: list[str] | None = None,
+ disabled: bool = False,
):
"""
Initialize the signal handler.
@@ -99,6 +125,8 @@ class SignalHandler:
exit_on_second_int: Whether to exit on second SIGINT (Ctrl+C)
interruptible_task_patterns: List of patterns to match task names that should be
canceled on first Ctrl+C (default: ['step', 'multi_act', 'get_next_action'])
+ disabled: If True, signal handling is disabled and register() is a no-op.
+ Useful when embedding browser-use in applications that manage their own signals.
"""
self.loop = loop or asyncio.get_event_loop()
self.pause_callback = pause_callback
@@ -107,6 +135,7 @@ class SignalHandler:
self.exit_on_second_int = exit_on_second_int
self.interruptible_task_patterns = interruptible_task_patterns or ['step', 'multi_act', 'get_next_action']
self.is_windows = platform.system() == 'Windows'
+ self.disabled = disabled
# Initialize loop state attributes
self._initialize_loop_state()
@@ -121,7 +150,13 @@ class SignalHandler:
setattr(self.loop, 'waiting_for_input', False)
def register(self) -> None:
- """Register signal handlers for SIGINT and SIGTERM."""
+ """Register signal handlers for SIGINT and SIGTERM.
+
+ If disabled=True was passed to __init__, this method does nothing.
+ """
+ if self.disabled:
+ return
+
try:
if self.is_windows:
# On Windows, use simple signal handling with immediate exit on Ctrl+C
@@ -146,7 +181,13 @@ class SignalHandler:
pass
def unregister(self) -> None:
- """Unregister signal handlers and restore original handlers if possible."""
+ """Unregister signal handlers and restore original handlers if possible.
+
+ If disabled=True was passed to __init__, this method does nothing.
+ """
+ if self.disabled:
+ return
+
try:
if self.is_windows:
# On Windows, just restore the original SIGINT handler
diff --git a/docs/README.md b/docs/README.md
deleted file mode 100644
index 10f09abff..000000000
--- a/docs/README.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# Docs
-
-The official documentation for Browser Use. The docs are published to [Browser Use Docs](https://docs.browser-use.com).
-
-### Development
-
-Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. To install, use the following command
-
-```
-npm i -g mintlify
-```
-
-Run the following command at the root of your documentation (where mint.json is)
-
-```
-mintlify dev
-```
diff --git a/docs/customize/actor/all-parameters.mdx b/docs/customize/actor/all-parameters.mdx
deleted file mode 100644
index 3cfbf59fa..000000000
--- a/docs/customize/actor/all-parameters.mdx
+++ /dev/null
@@ -1,87 +0,0 @@
----
-title: "All Parameters"
-description: "Complete API reference for Browser Actor classes, methods, and parameters including BrowserSession, Page, Element, and Mouse"
-icon: "list"
-mode: "wide"
----
-
-## Browser (BrowserSession)
-
-Main browser session manager.
-
-### Key Methods
-
-```python
-from browser_use import Browser
-
-browser = Browser()
-await browser.start()
-
-# Page management
-page = await browser.new_page("https://example.com")
-pages = await browser.get_pages()
-current = await browser.get_current_page()
-await browser.close_page(page)
-
-# To stop the browser session
-await browser.stop()
-```
-
-### Constructor Parameters
-
-See [Browser Parameters](../browser/all-parameters) for complete configuration options.
-
-## Page
-
-Browser tab/iframe for page-level operations.
-
-### Navigation
-- `goto(url: str)` - Navigate to URL
-- `go_back()`, `go_forward()`, `reload()` - History navigation
-
-### Element Finding
-- `get_elements_by_css_selector(selector: str) -> list[Element]` - CSS selector
-- `get_element(backend_node_id: int) -> Element` - By CDP node ID
-- `get_element_by_prompt(prompt: str, llm) -> Element | None` - AI-powered
-- `must_get_element_by_prompt(prompt: str, llm) -> Element` - AI (raises if not found)
-
-### JavaScript & Controls
-- `evaluate(page_function: str, *args) -> str` - Execute JS (arrow function format)
-- `press(key: str)` - Send keyboard input ("Enter", "Control+A")
-- `set_viewport_size(width: int, height: int)` - Set viewport
-- `screenshot(format='jpeg', quality=None) -> str` - Take screenshot
-
-### Information
-- `get_url() -> str`, `get_title() -> str` - Page info
-- `mouse -> Mouse` - Get mouse interface
-
-### AI Features
-- `extract_content(prompt: str, structured_output: type[T], llm) -> T` - Extract data
-
-## Element
-
-Individual DOM element interactions.
-
-### Interactions
-- `click(button='left', click_count=1, modifiers=None)` - Click element
-- `fill(text: str, clear=True)` - Fill input
-- `hover()`, `focus()` - Mouse/focus actions
-- `check()` - Toggle checkbox/radio
-- `select_option(values: str | list[str])` - Select dropdown options
-- `drag_to(target: Element | Position)` - Drag and drop
-
-### Properties
-- `get_attribute(name: str) -> str | None` - Get attribute
-- `get_bounding_box() -> BoundingBox | None` - Position/size
-- `get_basic_info() -> ElementInfo` - Complete element info
-- `screenshot(format='jpeg') -> str` - Element screenshot
-
-## Mouse
-
-Coordinate-based mouse operations.
-
-### Operations
-- `click(x: int, y: int, button='left', click_count=1)` - Click at coordinates
-- `move(x: int, y: int, steps=1)` - Move mouse
-- `down(button='left')`, `up(button='left')` - Press/release buttons
-- `scroll(x=0, y=0, delta_x=None, delta_y=None)` - Scroll at coordinates
diff --git a/docs/customize/actor/basics.mdx b/docs/customize/actor/basics.mdx
deleted file mode 100644
index 065cd1d86..000000000
--- a/docs/customize/actor/basics.mdx
+++ /dev/null
@@ -1,56 +0,0 @@
----
-title: "Basics"
-description: "Low-level Playwright-like browser automation with direct and full CDP control and precise element interactions"
-icon: "code"
-mode: "wide"
----
-
-## Core Architecture
-
-```mermaid
-graph TD
- A[Browser] --> B[Page]
- B --> C[Element]
- B --> D[Mouse]
- B --> E[AI Features]
- C --> F[DOM Interactions]
- D --> G[Coordinate Operations]
- E --> H[LLM Integration]
-```
-
-### Core Classes
-
-- **Browser** (alias: **BrowserSession**): Main session manager
-- **Page**: Represents a browser tab/iframe
-- **Element**: Individual DOM element operations
-- **Mouse**: Coordinate-based mouse operations
-
-## Basic Usage
-
-```python
-from browser_use import Browser, Agent
-from browser_use.llm.openai.chat import ChatOpenAI
-
-async def main():
- llm = ChatOpenAI(api_key="your-api-key")
- browser = Browser()
- await browser.start()
-
- # 1. Actor: Precise navigation and element interactions
- page = await browser.new_page("https://github.com/login")
- email_input = await page.must_get_element_by_prompt("username field", llm=llm)
- await email_input.fill("your-username")
-
- # 2. Agent: AI-driven complex tasks
- agent = Agent(browser=browser, llm=llm)
- await agent.run("Complete login and navigate to my repositories")
-
- await browser.stop()
-```
-
-## Important Notes
-
-- **Not Playwright**: Actor is built on CDP, not Playwright. The API resembles Playwright as much as possible for easy migration, but is sorta subset.
-- **Immediate Returns**: `get_elements_by_css_selector()` doesn't wait for visibility
-- **Manual Timing**: You handle navigation timing and waiting
-- **JavaScript Format**: `evaluate()` requires arrow function format: `() => {}`
diff --git a/docs/customize/actor/examples.mdx b/docs/customize/actor/examples.mdx
deleted file mode 100644
index df5f2b40b..000000000
--- a/docs/customize/actor/examples.mdx
+++ /dev/null
@@ -1,111 +0,0 @@
----
-title: "Examples"
-description: "Comprehensive examples for Browser Actor automation tasks including forms, JavaScript, mouse operations, and AI features"
-icon: "code-simple"
-mode: "wide"
----
-
-## Page Management
-
-```python
-from browser_use import Browser
-
-browser = Browser()
-await browser.start()
-
-# Create pages
-page = await browser.new_page() # Blank tab
-page = await browser.new_page("https://example.com") # With URL
-
-# Get all pages
-pages = await browser.get_pages()
-current = await browser.get_current_page()
-
-# Close page
-await browser.close_page(page)
-await browser.stop()
-```
-
-## Element Finding & Interactions
-
-```python
-page = await browser.new_page('https://github.com')
-
-# CSS selectors (immediate return)
-elements = await page.get_elements_by_css_selector("input[type='text']")
-buttons = await page.get_elements_by_css_selector("button.submit")
-
-# Element actions
-await elements[0].click()
-await elements[0].fill("Hello World")
-await elements[0].hover()
-
-# Page actions
-await page.press("Enter")
-screenshot = await page.screenshot()
-```
-
-## LLM-Powered Features
-
-```python
-from browser_use.llm.openai.chat import ChatOpenAI
-from pydantic import BaseModel
-
-llm = ChatOpenAI(api_key="your-api-key")
-
-# Find elements using natural language
-button = await page.get_element_by_prompt("login button", llm=llm)
-await button.click()
-
-# Extract structured data
-class ProductInfo(BaseModel):
- name: str
- price: float
-
-product = await page.extract_content(
- "Extract product name and price",
- ProductInfo,
- llm=llm
-)
-```
-
-## JavaScript Execution
-
-```python
-# Simple JavaScript evaluation
-title = await page.evaluate('() => document.title')
-
-# JavaScript with arguments
-result = await page.evaluate('(x, y) => x + y', 10, 20)
-
-# Complex operations
-stats = await page.evaluate('''() => ({
- url: location.href,
- links: document.querySelectorAll('a').length
-})''')
-```
-
-## Mouse Operations
-
-```python
-mouse = await page.mouse
-
-# Click at coordinates
-await mouse.click(x=100, y=200)
-
-# Drag and drop
-await mouse.down()
-await mouse.move(x=500, y=600)
-await mouse.up()
-
-# Scroll
-await mouse.scroll(x=0, y=100, delta_y=-500)
-```
-
-## Best Practices
-
-- Use `asyncio.sleep()` after actions that trigger navigation
-- Check URL/title changes to verify state transitions
-- Always check if elements exist before interaction
-- Implement retry logic for flaky elements
-- Call `browser.stop()` to clean up resources
diff --git a/docs/customize/agent/all-parameters.mdx b/docs/customize/agent/all-parameters.mdx
deleted file mode 100644
index 1cf6ee48e..000000000
--- a/docs/customize/agent/all-parameters.mdx
+++ /dev/null
@@ -1,143 +0,0 @@
----
-title: "All Parameters"
-description: "Complete reference for all agent configuration options"
-icon: "sliders"
-mode: "wide"
----
-
-## Available Parameters
-
-### Core Settings
-- `tools`: Registry of tools the agent can call. Example
-- `skills` (or `skill_ids`): List of skill IDs to load (e.g., `['skill-uuid']` or `['*']` for all). Requires `BROWSER_USE_API_KEY`. Docs
-- `browser`: Browser object where you can specify the browser settings.
-- `output_model_schema`: Pydantic model class for structured output validation. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py)
-
-### Vision & Processing
-- `use_vision` (default: `"auto"`): Vision mode - `"auto"` includes screenshot tool but only uses vision when requested, `True` always includes screenshots, `False` never includes screenshots and excludes screenshot tool
-- `vision_detail_level` (default: `'auto'`): Screenshot detail level - `'low'`, `'high'`, or `'auto'`
-- `page_extraction_llm`: Separate LLM model for page content extraction. You can choose a small & fast model because it only needs to extract text from the page (default: same as `llm`)
-
-### Fallback & Resilience
-- `fallback_llm`: Backup LLM to use when the primary LLM fails. The primary LLM will first exhaust its own retry logic (typically 5 attempts with exponential backoff), and only then switch to the fallback. Triggers on rate limits (429), authentication errors (401), payment/credit errors (402), or server errors (500, 502, 503, 504). Once switched, the fallback is used for the rest of the run. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/fallback_model.py)
-
-### Actions & Behavior
-- `initial_actions`: List of actions to run before the main task without LLM. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/initial_actions.py)
-- `max_actions_per_step` (default: `4`): Maximum actions per step, e.g. for form filling the agent can output 4 fields at once. We execute the actions until the page changes.
-- `max_failures` (default: `3`): Maximum retries for steps with errors
-- `final_response_after_failure` (default: `True`): If True, attempt to force one final model call with intermediate output after max_failures is reached
-- `use_thinking` (default: `True`): Controls whether the agent uses its internal "thinking" field for explicit reasoning steps.
-- `flash_mode` (default: `False`): Fast mode that skips evaluation, next goal and thinking and only uses memory. If `flash_mode` is enabled, it overrides `use_thinking` and disables the thinking process entirely. [Example](https://github.com/browser-use/browser-use/blob/main/examples/getting_started/05_fast_agent.py)
-
-### System Messages
-- `override_system_message`: Completely replace the default system prompt.
-- `extend_system_message`: Add additional instructions to the default system prompt. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_system_prompt.py)
-
-### File & Data Management
-- `save_conversation_path`: Path to save complete conversation history
-- `save_conversation_path_encoding` (default: `'utf-8'`): Encoding for saved conversations
-- `available_file_paths`: List of file paths the agent can access
-- `sensitive_data`: Dictionary of sensitive data to handle carefully. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/sensitive_data.py)
-
-### Visual Output
-- `generate_gif` (default: `False`): Generate GIF of agent actions. Set to `True` or string path
-- `include_attributes`: List of HTML attributes to include in page analysis
-
-### Performance & Limits
-- `max_history_items`: Maximum number of last steps to keep in the LLM memory. If `None`, we keep all steps.
-- `llm_timeout` (default: `90`): Timeout in seconds for LLM calls
-- `step_timeout` (default: `120`): Timeout in seconds for each step
-- `directly_open_url` (default: `True`): If we detect a url in the task, we directly open it.
-
-### Advanced Options
-- `calculate_cost` (default: `False`): Calculate and track API costs
-- `display_files_in_done_text` (default: `True`): Show file information in completion messages
-
-### Backwards Compatibility
-- `controller`: Alias for `tools` for backwards compatibility.
-- `browser_session`: Alias for `browser` for backwards compatibility.
-
----
-
-## Environment Variables
-
-These environment variables can be used to tune agent and browser behavior without code changes. They are particularly useful for debugging, slow networks, or deployment-level tuning.
-
-### Agent Timeouts
-
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `TIMEOUT_AgentEventBusStop` | `3.0` | Timeout in seconds for the agent's event bus to finish processing pending events during shutdown. |
-
-### Browser Action Timeouts
-
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `TIMEOUT_NavigateToUrlEvent` | `15.0` | Timeout for page navigation |
-| `TIMEOUT_ClickElementEvent` | `15.0` | Timeout for clicking elements |
-| `TIMEOUT_ClickCoordinateEvent` | `15.0` | Timeout for clicking at coordinates |
-| `TIMEOUT_TypeTextEvent` | `60.0` | Timeout for typing text (longer for large inputs) |
-| `TIMEOUT_ScrollEvent` | `8.0` | Timeout for scrolling |
-| `TIMEOUT_ScrollToTextEvent` | `15.0` | Timeout for scrolling to find text |
-| `TIMEOUT_SendKeysEvent` | `60.0` | Timeout for sending keyboard shortcuts |
-| `TIMEOUT_UploadFileEvent` | `30.0` | Timeout for file uploads |
-| `TIMEOUT_GetDropdownOptionsEvent` | `15.0` | Timeout for fetching dropdown options |
-| `TIMEOUT_SelectDropdownOptionEvent` | `8.0` | Timeout for selecting dropdown option |
-| `TIMEOUT_GoBackEvent` | `15.0` | Timeout for browser back navigation |
-| `TIMEOUT_GoForwardEvent` | `15.0` | Timeout for browser forward navigation |
-| `TIMEOUT_RefreshEvent` | `15.0` | Timeout for page refresh |
-| `TIMEOUT_WaitEvent` | `60.0` | Timeout for explicit wait actions |
-| `TIMEOUT_ScreenshotEvent` | `15.0` | Timeout for taking screenshots |
-| `TIMEOUT_BrowserStateRequestEvent` | `30.0` | Timeout for fetching browser state/DOM |
-
-### Browser Lifecycle Timeouts
-
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `TIMEOUT_BrowserStartEvent` | `30.0` | Timeout for starting browser session |
-| `TIMEOUT_BrowserStopEvent` | `45.0` | Timeout for stopping browser session |
-| `TIMEOUT_BrowserLaunchEvent` | `30.0` | Timeout for launching browser process |
-| `TIMEOUT_BrowserKillEvent` | `30.0` | Timeout for killing browser process |
-| `TIMEOUT_BrowserConnectedEvent` | `30.0` | Timeout for CDP connection |
-| `TIMEOUT_BrowserStoppedEvent` | `30.0` | Timeout for browser stopped confirmation |
-| `TIMEOUT_BrowserErrorEvent` | `30.0` | Timeout for browser error events |
-
-### Tab Management Timeouts
-
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `TIMEOUT_SwitchTabEvent` | `10.0` | Timeout for switching tabs |
-| `TIMEOUT_CloseTabEvent` | `10.0` | Timeout for closing tabs |
-| `TIMEOUT_TabCreatedEvent` | `30.0` | Timeout for tab creation events |
-| `TIMEOUT_TabClosedEvent` | `10.0` | Timeout for tab closed events |
-| `TIMEOUT_AgentFocusChangedEvent` | `10.0` | Timeout for focus change events |
-| `TIMEOUT_TargetCrashedEvent` | `10.0` | Timeout for crash events |
-
-### Navigation Event Timeouts
-
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `TIMEOUT_NavigationStartedEvent` | `30.0` | Timeout for navigation started events |
-| `TIMEOUT_NavigationCompleteEvent` | `30.0` | Timeout for navigation complete events |
-
-### Storage & Download Timeouts
-
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `TIMEOUT_SaveStorageStateEvent` | `45.0` | Timeout for saving cookies/localStorage |
-| `TIMEOUT_StorageStateSavedEvent` | `30.0` | Timeout for storage save confirmation |
-| `TIMEOUT_LoadStorageStateEvent` | `45.0` | Timeout for loading storage state |
-| `TIMEOUT_StorageStateLoadedEvent` | `30.0` | Timeout for storage load confirmation |
-| `TIMEOUT_FileDownloadedEvent` | `30.0` | Timeout for file download events |
-
-### Example Usage
-
-```bash
-# Increase timeouts for slow network or complex pages
-export TIMEOUT_NavigateToUrlEvent=30.0
-export TIMEOUT_TypeTextEvent=120.0
-export TIMEOUT_BrowserStateRequestEvent=60.0
-
-# Increase agent shutdown timeout
-export TIMEOUT_AgentEventBusStop=10.0
-```
diff --git a/docs/customize/agent/basics.mdx b/docs/customize/agent/basics.mdx
deleted file mode 100644
index 226d99ffb..000000000
--- a/docs/customize/agent/basics.mdx
+++ /dev/null
@@ -1,29 +0,0 @@
----
-title: "Basics"
-description: ""
-icon: "play"
-mode: "wide"
----
-
-
-```python
-from browser_use import Agent, ChatBrowserUse
-
-agent = Agent(
- task="Search for latest news about AI",
- llm=ChatBrowserUse(),
-)
-
-async def main():
- history = await agent.run(max_steps=100)
-```
-
-- `task`: The task you want to automate.
-- `llm`: Your favorite LLM. See Supported Models .
-
-
-The agent is executed using the async `run()` method:
-
-- `max_steps` (default: `100`): Maximum number of steps an agent can take.
-
-Check out all customizable parameters here .
diff --git a/docs/customize/agent/output-format.mdx b/docs/customize/agent/output-format.mdx
deleted file mode 100644
index 391487d5d..000000000
--- a/docs/customize/agent/output-format.mdx
+++ /dev/null
@@ -1,45 +0,0 @@
----
-title: "Output Format"
-description: ""
-icon: "arrow-right-to-bracket"
-mode: "wide"
----
-
-## Agent History
-
-The `run()` method returns an `AgentHistoryList` object with the complete execution history:
-
-```python
-history = await agent.run()
-
-# Access useful information
-history.urls() # List of visited URLs
-history.screenshot_paths() # List of screenshot paths
-history.screenshots() # List of screenshots as base64 strings
-history.action_names() # Names of executed actions
-history.extracted_content() # List of extracted content from all actions
-history.errors() # List of errors (with None for steps without errors)
-history.model_actions() # All actions with their parameters
-history.model_outputs() # All model outputs from history
-history.last_action() # Last action in history
-
-# Analysis methods
-history.final_result() # Get the final extracted content (last step)
-history.is_done() # Check if agent completed successfully
-history.is_successful() # Check if agent completed successfully (returns None if not done)
-history.has_errors() # Check if any errors occurred
-history.model_thoughts() # Get the agent's reasoning process (AgentBrain objects)
-history.action_results() # Get all ActionResult objects from history
-history.action_history() # Get truncated action history with essential fields
-history.number_of_steps() # Get the number of steps in the history
-history.total_duration_seconds() # Get total duration of all steps in seconds
-
-# Structured output (when using output_model_schema)
-history.structured_output # Property that returns parsed structured output
-```
-
-See all helper methods in the [AgentHistoryList source code](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/views.py#L301).
-
-## Structured Output
-
-For structured output, use the `output_model_schema` parameter with a Pydantic model. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py).
diff --git a/docs/customize/agent/prompting-guide.mdx b/docs/customize/agent/prompting-guide.mdx
deleted file mode 100644
index d3d101b38..000000000
--- a/docs/customize/agent/prompting-guide.mdx
+++ /dev/null
@@ -1,92 +0,0 @@
----
-title: "Prompting Guide"
-description: "Tips and tricks "
-icon: "lightbulb"
----
-
-Prompting can drastically improve performance and solve existing limitations of the library.
-
-### 1. Be Specific vs Open-Ended
-
-**✅ Specific (Recommended)**
-```python
-task = """
-1. Go to https://quotes.toscrape.com/
-2. Use extract action with the query "first 3 quotes with their authors"
-3. Save results to quotes.csv using write_file action
-4. Do a google search for the first quote and find when it was written
-"""
-```
-
-**❌ Open-Ended**
-```python
-task = "Go to web and make money"
-```
-
-### 2. Name Actions Directly
-
-When you know exactly what the agent should do, reference actions by name:
-
-```python
-task = """
-1. Use search action to find "Python tutorials"
-2. Use click to open first result in a new tab
-3. Use scroll action to scroll down 2 pages
-4. Use extract to extract the names of the first 5 items
-5. Wait for 2 seconds if the page is not loaded, refresh it and wait 10 sec
-6. Use send_keys action with "Tab Tab ArrowDown Enter"
-"""
-```
-
-See [Available Tools](/customize/tools/available) for the complete list of actions.
-
-
-### 3. Handle interaction problems via keyboard navigation
-
-Sometimes buttons can't be clicked (you found a bug in the library - open an issue).
-Good news - often you can work around it with keyboard navigation!
-
-```python
-task = """
-If the submit button cannot be clicked:
-1. Use send_keys action with "Tab Tab Enter" to navigate and activate
-2. Or use send_keys with "ArrowDown ArrowDown Enter" for form submission
-"""
-```
-
-
-
-
-### 4. Custom Actions Integration
-
-```python
-# When you have custom actions
-@controller.action("Get 2FA code from authenticator app")
-async def get_2fa_code():
- # Your implementation
- pass
-
-task = """
-Login with 2FA:
-1. Enter username/password
-2. When prompted for 2FA, use get_2fa_code action
-3. NEVER try to extract 2FA codes from the page manually
-4. ALWAYS use the get_2fa_code action for authentication codes
-"""
-```
-
-### 5. Error Recovery
-
-```python
-task = """
-Robust data extraction:
-1. Go to openai.com to find their CEO
-2. If navigation fails due to anti-bot protection:
- - Use google search to find the CEO
-3. If page times out, use go_back and try alternative approach
-"""
-```
-
-
-
-The key to effective prompting is being specific about actions.
diff --git a/docs/customize/browser/all-parameters.mdx b/docs/customize/browser/all-parameters.mdx
deleted file mode 100644
index 27fd82986..000000000
--- a/docs/customize/browser/all-parameters.mdx
+++ /dev/null
@@ -1,135 +0,0 @@
----
-title: "All Parameters"
-description: "Complete reference for all browser configuration options"
-icon: "sliders"
-mode: "wide"
----
-
-
-The `Browser` instance also provides all [Actor](/customize/actor/all-parameters) methods for direct browser control (page management, element interactions, etc.).
-
-
-## Core Settings
-
-- `cdp_url`: CDP URL for connecting to existing browser instance (e.g., `"http://localhost:9222"`)
-
-## Display & Appearance
-
-- `headless` (default: `None`): Run browser without UI. Auto-detects based on display availability (`True`/`False`/`None`)
-- `window_size`: Browser window size for headful mode. Use dict `{'width': 1920, 'height': 1080}` or `ViewportSize` object
-- `window_position` (default: `{'width': 0, 'height': 0}`): Window position from top-left corner in pixels
-- `viewport`: Content area size, same format as `window_size`. Use `{'width': 1280, 'height': 720}` or `ViewportSize` object
-- `no_viewport` (default: `None`): Disable viewport emulation, content fits to window size
-- `device_scale_factor`: Device scale factor (DPI). Set to `2.0` or `3.0` for high-resolution screenshots
-
-## Browser Behavior
-
-- `keep_alive` (default: `None`): Keep browser running after agent completes
-- `allowed_domains`: Restrict navigation to specific domains. Domain pattern formats:
- - `'example.com'` - Matches only `https://example.com/*`
- - `'*.example.com'` - Matches `https://example.com/*` and any subdomain `https://*.example.com/*`
- - `'http*://example.com'` - Matches both `http://` and `https://` protocols
- - `'chrome-extension://*'` - Matches any Chrome extension URL
- - **Security**: Wildcards in TLD (e.g., `example.*`) are **not allowed** for security
- - Use list like `['*.google.com', 'https://example.com', 'chrome-extension://*']`
- - **Performance**: Lists with 100+ domains are automatically optimized to sets for O(1) lookup. Pattern matching is disabled for optimized lists. Both `www.example.com` and `example.com` variants are checked automatically.
-- `prohibited_domains`: Block navigation to specific domains. Uses same pattern formats as `allowed_domains`. When both `allowed_domains` and `prohibited_domains` are set, `allowed_domains` takes precedence. Examples:
- - `['pornhub.com', '*.gambling-site.net']` - Block specific sites and all subdomains
- - `['https://explicit-content.org']` - Block specific protocol/domain combination
- - **Performance**: Lists with 100+ domains are automatically optimized to sets for O(1) lookup (same as `allowed_domains`)
-- `enable_default_extensions` (default: `True`): Load automation extensions (uBlock Origin, cookie handlers, ClearURLs)
-- `cross_origin_iframes` (default: `False`): Enable cross-origin iframe support (may cause complexity)
-- `is_local` (default: `True`): Whether this is a local browser instance. Set to `False` for remote browsers. If we have a `executable_path` set, it will be automatically set to `True`. This can effect your download behavior.
-
-## User Data & Profiles
-
-- `user_data_dir` (default: auto-generated temp): Directory for browser profile data. Use `None` for incognito mode
-- `profile_directory` (default: `'Default'`): Chrome profile subdirectory name (`'Profile 1'`, `'Work Profile'`, etc.)
-- `storage_state`: Browser storage state (cookies, localStorage). Can be file path string or dict object
-
-## Network & Security
-
-- `proxy`: Proxy configuration using `ProxySettings(server='http://host:8080', bypass='localhost,127.0.0.1', username='user', password='pass')`
-- `permissions` (default: `['clipboardReadWrite', 'notifications']`): Browser permissions to grant. Use list like `['camera', 'microphone', 'geolocation']`
-
-- `headers`: Additional HTTP headers for connect requests (remote browsers only)
-
-## Browser Launch
-
-- `executable_path`: Path to browser executable for custom installations. Platform examples:
- - macOS: `'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'`
- - Windows: `'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'`
- - Linux: `'/usr/bin/google-chrome'`
-- `channel`: Browser channel (`'chromium'`, `'chrome'`, `'chrome-beta'`, `'msedge'`, etc.)
-- `args`: Additional command-line arguments for the browser. Use list format: `['--disable-gpu', '--custom-flag=value', '--another-flag']`
-- `env`: Environment variables for browser process. Use dict like `{'DISPLAY': ':0', 'LANG': 'en_US.UTF-8', 'CUSTOM_VAR': 'test'}`
-- `chromium_sandbox` (default: `True` except in Docker): Enable Chromium sandboxing for security
-- `devtools` (default: `False`): Open DevTools panel automatically (requires `headless=False`)
-- `ignore_default_args`: List of default args to disable, or `True` to disable all. Use list like `['--enable-automation', '--disable-extensions']`
-
-## Timing & Performance
-
-- `minimum_wait_page_load_time` (default: `0.25`): Minimum time to wait before capturing page state in seconds
-- `wait_for_network_idle_page_load_time` (default: `0.5`): Time to wait for network activity to cease in seconds
-- `wait_between_actions` (default: `0.5`): Time to wait between agent actions in seconds
-
-## AI Integration
-
-- `highlight_elements` (default: `True`): Highlight interactive elements for AI vision
-- `paint_order_filtering` (default: `True`): Enable paint order filtering to optimize DOM tree by removing elements hidden behind others. Slightly experimental
-
-## Downloads & Files
-
-- `accept_downloads` (default: `True`): Automatically accept all downloads
-- `downloads_path`: Directory for downloaded files. Use string like `'./downloads'` or `Path` object
-- `auto_download_pdfs` (default: `True`): Automatically download PDFs instead of viewing in browser
-
-## Device Emulation
-
-- `user_agent`: Custom user agent string. Example: `'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)'`
-- `screen`: Screen size information, same format as `window_size`
-
-## Recording & Debugging
-
-
-Video recording requires additional optional dependencies. If these are not installed, no video will be saved and no error will be raised.
-
-Install with:
-```bash
-pip install "browser-use[video]"
-```
-or:
-```bash
-pip install imageio[ffmpeg] numpy
-```
-
-
-- `record_video_dir`: Directory to save video recordings as `.mp4` files
-- `record_video_size` (default: `ViewportSize`): The frame size (width, height) of the video recording.
-- `record_video_framerate` (default: `30`): The framerate to use for the video recording.
-- `record_har_path`: Path to save network trace files as `.har` format
-- `traces_dir`: Directory to save complete trace files for debugging
-- `record_har_content` (default: `'embed'`): HAR content mode (`'omit'`, `'embed'`, `'attach'`)
-- `record_har_mode` (default: `'full'`): HAR recording mode (`'full'`, `'minimal'`)
-
-## Advanced Options
-
-- `disable_security` (default: `False`): ⚠️ **NOT RECOMMENDED** - Disables all browser security features
-- `deterministic_rendering` (default: `False`): ⚠️ **NOT RECOMMENDED** - Forces consistent rendering but reduces performance
-
----
-
-## Outdated BrowserProfile
-
-For backward compatibility, you can pass all the parameters from above to the `BrowserProfile` and then to the `Browser`.
-
-```python
-from browser_use import BrowserProfile
-profile = BrowserProfile(headless=False)
-browser = Browser(browser_profile=profile)
-```
-
-## Browser vs BrowserSession
-
-`Browser` is an alias for `BrowserSession` - they are exactly the same class:
-Use `Browser` for cleaner, more intuitive code.
diff --git a/docs/customize/browser/basics.mdx b/docs/customize/browser/basics.mdx
deleted file mode 100644
index 366295ee8..000000000
--- a/docs/customize/browser/basics.mdx
+++ /dev/null
@@ -1,27 +0,0 @@
----
-title: "Basics"
-description: ""
-icon: "play"
----
-
-
----
-
-```python
-from browser_use import Agent, Browser, ChatBrowserUse
-
-browser = Browser(
- headless=False, # Show browser window
- window_size={'width': 1000, 'height': 700}, # Set window size
-)
-
-agent = Agent(
- task='Search for Browser Use',
- browser=browser,
- llm=ChatBrowserUse(),
-)
-
-
-async def main():
- await agent.run()
-```
diff --git a/docs/customize/browser/real-browser.mdx b/docs/customize/browser/real-browser.mdx
deleted file mode 100644
index 4529c8e7f..000000000
--- a/docs/customize/browser/real-browser.mdx
+++ /dev/null
@@ -1,60 +0,0 @@
----
-title: "Real Browser"
-description: ""
-icon: "arrow-right-to-bracket"
----
-
-Connect your existing Chrome browser to preserve authentication.
-
-## Basic Example
-
-```python
-import asyncio
-from browser_use import Agent, Browser, ChatOpenAI
-
-# Connect to your existing Chrome browser
-browser = Browser(
- executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
- user_data_dir='~/Library/Application Support/Google/Chrome',
- profile_directory='Default',
-)
-
-agent = Agent(
- task='Visit https://duckduckgo.com and search for "browser-use founders"',
- browser=browser,
- llm=ChatOpenAI(model='gpt-4.1-mini'),
-)
-async def main():
- await agent.run()
-
-if __name__ == "__main__":
- asyncio.run(main())
-```
-
-> **Note:** You need to fully close chrome before running this example. Also, Google blocks this approach currently so we use DuckDuckGo instead.
-
-
-
-
-## How it Works
-
-1. **`executable_path`** - Path to your Chrome installation
-2. **`user_data_dir`** - Your Chrome profile folder (keeps cookies, extensions, bookmarks)
-3. **`profile_directory`** - Specific profile name (Default, Profile 1, etc.)
-
-
-## Platform Paths
-
-```python
-# macOS
-executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
-user_data_dir='~/Library/Application Support/Google/Chrome'
-
-# Windows
-executable_path='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
-user_data_dir='%LOCALAPPDATA%\\Google\\Chrome\\User Data'
-
-# Linux
-executable_path='/usr/bin/google-chrome'
-user_data_dir='~/.config/google-chrome'
-```
diff --git a/docs/customize/browser/remote.mdx b/docs/customize/browser/remote.mdx
deleted file mode 100644
index 8f71b972b..000000000
--- a/docs/customize/browser/remote.mdx
+++ /dev/null
@@ -1,85 +0,0 @@
----
-title: "Remote Browser"
-description: ""
-icon: "cloud"
-mode: "wide"
----
-
-
-### Browser-Use Cloud Browser or CDP URL
-
-The easiest way to use a cloud browser is with the built-in Browser-Use cloud service:
-
-```python
-from browser_use import Agent, Browser, ChatBrowserUse
-
-# Simple: Use Browser-Use cloud browser service
-browser = Browser(
- use_cloud=True, # Automatically provisions a cloud browser
-)
-
-# Advanced: Configure cloud browser parameters
-# Using this settings can bypass any captcha protection on any website
-browser = Browser(
- cloud_profile_id='your-profile-id', # Optional: specific browser profile
- cloud_proxy_country_code='us', # Optional: proxy location (us, uk, fr, it, jp, au, de, fi, ca, in)
- cloud_timeout=30, # Optional: session timeout in minutes (MAX free: 15min, paid: 240min)
-)
-
-# Or use a CDP URL from any cloud browser provider
-browser = Browser(
- cdp_url="http://remote-server:9222" # Get a CDP URL from any provider
-)
-
-agent = Agent(
- task="Your task here",
- llm=ChatBrowserUse(),
- browser=browser,
-)
-```
-
-**Prerequisites:**
-1. Get an API key from [cloud.browser-use.com](https://cloud.browser-use.com/new-api-key)
-2. Set BROWSER_USE_API_KEY environment variable
-
-**Cloud Browser Parameters:**
-- `cloud_profile_id`: UUID of a browser profile (optional, uses default if not specified)
-- `cloud_proxy_country_code`: Country code for proxy location - supports: us, uk, fr, it, jp, au, de, fi, ca, in
-- `cloud_timeout`: Session timeout in minutes (free users: max 15 min, paid users: max 240 min)
-
-**Benefits:**
-- ✅ No local browser setup required
-- ✅ Scalable and fast cloud infrastructure
-- ✅ Automatic provisioning and teardown
-- ✅ Built-in authentication handling
-- ✅ Optimized for browser automation
-- ✅ Global proxy support for geo-restricted content
-
-### Third-Party Cloud Browsers
-You can pass in a CDP URL from any remote browser
-
-
-### Proxy Connection
-
-```python
-
-from browser_use import Agent, Browser, ChatBrowserUse
-from browser_use.browser import ProxySettings
-
-browser = Browser(
- headless=False,
- proxy=ProxySettings(
- server="http://proxy-server:8080",
- username="proxy-user",
- password="proxy-pass"
- ),
- cdp_url="http://remote-server:9222"
-)
-
-
-agent = Agent(
- task="Your task here",
- llm=ChatBrowserUse(),
- browser=browser,
-)
-```
diff --git a/docs/customize/code-agent/all-parameters.mdx b/docs/customize/code-agent/all-parameters.mdx
deleted file mode 100644
index 242a6ae90..000000000
--- a/docs/customize/code-agent/all-parameters.mdx
+++ /dev/null
@@ -1,73 +0,0 @@
----
-title: "All Parameters"
-description: "Complete reference for all CodeAgent configuration options"
-icon: "sliders"
-mode: "wide"
----
-
-## CodeAgent Parameters
-
-### Core Settings
-- `task`: Task description string that defines what the agent should accomplish (required)
-- `llm`: LLM instance for code generation (required: ChatBrowserUse). If not provided, defaults to ChatBrowserUse()
-- `browser`: Browser session object for automation (optional, will be created if not provided)
-- `tools`: Registry of tools the agent can call (optional, creates default if not provided)
-- `max_steps` (default: `100`): Maximum number of execution steps before termination
-- `max_failures` (default: `8`): Maximum consecutive errors before termination
-- `max_validations` (default: `0`): Maximum number of times to run the validator agent
-
-### Vision & Processing
-- `use_vision` (default: `True`): Whether to include screenshots in LLM messages. `True` always includes screenshots, `False` never includes screenshots
-- `page_extraction_llm`: Separate LLM model for page content extraction. You can choose a small & fast model because it only needs to extract text from the page (default: same as `llm`)
-
-### File & Data Management
-- `file_system`: File system instance for file operations (optional, creates default if not provided)
-- `available_file_paths`: List of file paths the agent can access
-- `sensitive_data`: Dictionary of sensitive data to handle carefully
-
-### Advanced Options
-- `calculate_cost` (default: `False`): Calculate and track API costs
-
-### Backwards Compatibility
-- `controller`: Alias for `tools` for backwards compatibility
-- `browser_session`: Alias for `browser` for backwards compatibility (deprecated, use `browser`)
-
-## Return Value
-
-The `run()` method returns a `NotebookSession` object that contains:
-
-- `cells`: List of `CodeCell` objects representing each executed code cell
-- `id`: Unique session identifier
-- `current_execution_count`: Current execution count number
-- `namespace`: Dictionary containing the current namespace state with all variables
-
-### CodeCell Properties
-
-Each cell in `session.cells` has:
-
-- `id`: Unique cell identifier
-- `cell_type`: Type of cell ('code' or 'markdown')
-- `source`: The code that was executed
-- `output`: The output from code execution (if any)
-- `execution_count`: Execution order number
-- `status`: Execution status ('pending', 'running', 'success', or 'error')
-- `error`: Error message if execution failed
-- `browser_state`: Browser state after execution
-
-### Example
-
-```python
-session = await agent.run()
-
-# Access executed cells
-for cell in session.cells:
- print(f"Cell {cell.execution_count}: {cell.source}")
- if cell.error:
- print(f"Error: {cell.error}")
- elif cell.output:
- print(f"Output: {cell.output}")
-
-# Access variables from the namespace
-variables = session.namespace
-print(f"Variables: {list(variables.keys())}")
-```
diff --git a/docs/customize/code-agent/basics.mdx b/docs/customize/code-agent/basics.mdx
deleted file mode 100644
index 781028a38..000000000
--- a/docs/customize/code-agent/basics.mdx
+++ /dev/null
@@ -1,129 +0,0 @@
----
-title: "Basics"
-description: "Write Python code locally with browser automation"
-icon: "code"
----
-
-CodeAgent writes and executes Python code locally with browser automation capabilities. It's designed for repetitive data extraction tasks where the agent can write reusable functions.
-
-
-CodeAgent executes Python code on your local machine like Claude Code.
-
-
-## Quick Start
-
-
-```python
-import asyncio
-from browser_use import CodeAgent
-from dotenv import load_dotenv
-
-load_dotenv()
-
-async def main():
- task = "Extract all products from example.com and save to products.csv"
-
- agent = CodeAgent(task=task)
- await agent.run()
-
-asyncio.run(main())
-```
-
-```bash .env
-BROWSER_USE_API_KEY=your-api-key
-```
-
-
-CodeAgent currently only works with [ChatBrowserUse](/supported-models) which is optimized for this use case.
-Don't have one? We give you $10 to try it out [here](https://cloud.browser-use.com/new-api-key).
-
-
-## When to Use
-
-**Best for:**
-- Data extraction at scale (100s-1000s of items)
-- Repetitive interactions where functions can be reused
-- Tasks requiring data processing and file operations
-- Deterministic workflows you want to rerun
-
-**Performance:**
-- Best performance for data collection tasks
-- Slightly slower for one-off interactions vs standard Agent
-
-**Output:**
-- Generates Python code that can be rerun deterministically
-- Perfect for refining extraction logic
-
-
-The agent will write code blocks in different languages. This combines the power of js for browser interaction and python for data processing:
-```js extract_products
-(function(){
- return Array.from(document.querySelectorAll('.product')).map(p => ({
- name: p.querySelector('.name').textContent,
- price: p.querySelector('.price').textContent
- }))
-})()
-```
-```python
-import pandas as pd
-
-products = await evaluate(extract_products) # reuse other code blocks
-df = pd.DataFrame(products)
-df.to_csv('products.csv', index=False)
-```
-
-## Available Libraries
-
-The agent can use common Python libraries:
-
-- **Data processing:** `pandas`, `numpy`
-- **Web:** `requests`, `BeautifulSoup`
-- **File formats:** `csv`, `json`, `openpyxl` (Excel)
-- **Visualization:** `matplotlib`
-- **Utilities:** `tabulate`, `datetime`, `re`
-- and all which you install ...
-
-
-## Available Tools
-
-The agent has access to browser control functions:
-
-- `navigate(url)` - Navigate to a URL
-- `click(index)` - Click an element by index
-- `input(index, text)` - Type text into an input
-- `scroll(down, pages)` - Scroll the page
-- `upload_file(path)` - Upload a file
-- `evaluate(code, variables={})` - Execute JavaScript and return results
-- `done(text, success, files_to_display=[])` - Mark task complete
-
-## Exporting Sessions
-
-CodeAgent automatically saves all executed code and JavaScript blocks during your session. You can export your complete automation workflow for sharing, version control, or re-running later.
-
-### Quick Export
-
-```python
-from browser_use.code_use.notebook_export import export_to_ipynb, session_to_python_script
-
-# After running your agent
-await agent.run()
-
-# Export to Jupyter notebook
-notebook_path = export_to_ipynb(agent, "my_automation.ipynb")
-
-# Export to Python script
-script = session_to_python_script(agent)
-with open("my_automation.py", "w") as f:
- f.write(script)
-```
-
-### Export Formats
-
-- **Jupyter Notebook (.ipynb)**: Interactive development, sharing, documentation
-- **Python Script (.py)**: Production deployment, version control, automation
-
-Both formats include:
-- Setup code with browser initialization
-- JavaScript code blocks as Python variables
-- All executed Python cells with outputs
-- Ready-to-run automation workflows
diff --git a/docs/customize/code-agent/example-products.mdx b/docs/customize/code-agent/example-products.mdx
deleted file mode 100644
index afd602f66..000000000
--- a/docs/customize/code-agent/example-products.mdx
+++ /dev/null
@@ -1,59 +0,0 @@
----
-title: "Example: Extract Products"
-description: "Collect thousands of products and save to CSV"
-icon: "database"
----
-
-This example shows how to extract large amounts of product data from an e-commerce site and save it to files.
-
-## Use Case
-
-Extract 1000s of products from multiple categories with:
-- Product URLs
-- Names and descriptions
-- Original and sale prices
-- Discount percentages
-
-Save everything to a CSV file for further analysis.
-
-## Code
-
-```python
-import asyncio
-from browser_use.code_use import CodeAgent
-
-async def main():
- task = """
- Go to https://www.flipkart.com.
- Collect approximately 50 products from:
-
- 1. Books & Media - 15 products
- 2. Sports & Fitness - 15 products
- 3. Beauty & Personal Care - 10 products
-
- Save to products.csv
- """
-
- agent = CodeAgent(task=task)
- await agent.run()
-
-asyncio.run(main())
-```
-
-## How It Works
-
-1. **Agent navigates** to the e-commerce site
-2. **Writes JavaScript** to extract product data from each page
-3. **Loops through categories** collecting products
-4. **Stores in variables** that persist across steps
-5. **Saves to CSV** using pandas or csv module
-6. **Returns deterministic code** you can modify and rerun
-
-## Key Benefits
-
-- **Function reuse:** Extraction code is written once, used many times
-- **Scale:** Easily collect 100s or 1000s of items
-- **Deterministic:** The generated Python code can be saved and rerun
-- **Data processing:** Built-in pandas support for cleaning and transforming data
-
-[View full example on GitHub →](https://github.com/browser-use/browser-use/blob/main/examples/code_agent/extract_products.py)
diff --git a/docs/customize/code-agent/exporting.mdx b/docs/customize/code-agent/exporting.mdx
deleted file mode 100644
index 1441dde45..000000000
--- a/docs/customize/code-agent/exporting.mdx
+++ /dev/null
@@ -1,129 +0,0 @@
----
-title: "Exporting Sessions"
-description: "Save and share your CodeAgent sessions as Jupyter notebooks or Python scripts"
-icon: "download"
----
-
-CodeAgent automatically saves all executed code and JavaScript blocks during your session. You can export your complete automation workflow in multiple formats for sharing, version control, or re-running later.
-
-## Quick Start
-
-```python
-import asyncio
-from browser_use import CodeAgent, ChatBrowserUse
-from browser_use.code_use.notebook_export import export_to_ipynb, session_to_python_script
-
-async def main():
- agent = CodeAgent(
- task="Extract product data from https://example.com",
- llm=ChatBrowserUse(),
- max_steps=10
- )
-
- # Run your automation
- await agent.run()
-
- # Export to Jupyter notebook
- notebook_path = export_to_ipynb(agent, "product_scraping.ipynb")
-
- # Export to Python script
- python_script = session_to_python_script(agent)
- with open("product_scraping.py", "w") as f:
- f.write(python_script)
-
-if __name__ == '__main__':
- asyncio.run(main())
-```
-
-## Export Formats
-
-### Jupyter Notebook (.ipynb)
-
-**Contains:**
-- Setup cell with browser initialization and imports
-- JavaScript code blocks as Python string variables
-- All executed Python cells with outputs and errors
-- Browser state snapshots
-
-**Structure:**
-```python
-# Cell 1: Setup
-import asyncio
-import json
-from browser_use import BrowserSession
-from browser_use.code_use import create_namespace
-
-browser = BrowserSession()
-await browser.start()
-namespace = create_namespace(browser)
-globals().update(namespace)
-
-# Cell 2: JavaScript variables
-extract_products = """(function(){
- return Array.from(document.querySelectorAll('.product')).map(product => ({
- name: product.querySelector('.name')?.textContent,
- price: product.querySelector('.price')?.textContent
- }));
-})()"""
-
-# Remaining cells: Python execution
-await navigate('https://example.com')
-
-...
-
-products = await evaluate(extract_products)
-print(f"Found {len(products)} products")
-```
-
-### Python Script (.py)
-
-**Best for:** Production deployment, version control, automation
-
-**Contains:**
-- Complete runnable script with all imports
-- JavaScript code blocks as Python string variables
-- All executed code with proper indentation
-- Ready to run with `python script.py`
-
-**Structure:**
-```python
-# Generated from browser-use code-use session
-import asyncio
-import json
-from browser_use import BrowserSession
-from browser_use.code_use import create_namespace
-
-async def main():
- # Initialize browser and namespace
- browser = BrowserSession()
- await browser.start()
-
- # Create namespace with all browser control functions
- namespace = create_namespace(browser)
-
- # Extract functions from namespace for direct access
- navigate = namespace["navigate"]
- click = namespace["click"]
- evaluate = namespace["evaluate"]
- # ... other functions
-
- # JavaScript Code Block: extract_products
- extract_products = """(function(){
- return Array.from(document.querySelectorAll('.product')).map(product => ({
- name: product.querySelector('.name')?.textContent,
- price: product.querySelector('.price')?.textContent
- }));
- })()"""
-
- # Cell 1
- await navigate('https://example.com')
-
- # Cell 2
- products = await evaluate(extract_products)
- print(f"Found {len(products)} products")
-
- await browser.stop()
-
-if __name__ == '__main__':
- asyncio.run(main())
-```
diff --git a/docs/customize/code-agent/output-format.mdx b/docs/customize/code-agent/output-format.mdx
deleted file mode 100644
index 9dae63d78..000000000
--- a/docs/customize/code-agent/output-format.mdx
+++ /dev/null
@@ -1,103 +0,0 @@
----
-title: "Output Format"
-description: "Understanding CodeAgent return values and how to access execution history"
-icon: "arrow-right-to-bracket"
-mode: "wide"
----
-
-## NotebookSession
-
-The `run()` method returns a `NotebookSession` object containing all executed code cells and their results:
-
-```python
-session = await agent.run()
-
-# Access basic properties
-session.id # Unique session identifier
-session.cells # List of CodeCell objects
-session.current_execution_count # Total number of executed cells
-session.namespace # Dictionary with all variables from execution
-
-# Helper methods
-session.get_cell(cell_id) # Get a specific cell by ID
-session.get_latest_cell() # Get the most recently executed cell
-```
-
-## CodeCell Properties
-
-Each cell in `session.cells` represents one executed code block:
-
-```python
-for cell in session.cells:
- cell.id # Unique cell identifier
- cell.cell_type # 'code' or 'markdown'
- cell.source # The code that was executed
- cell.output # Output from code execution (if any)
- cell.execution_count # Execution order number
- cell.status # 'pending', 'running', 'success', or 'error'
- cell.error # Error message if execution failed
- cell.browser_state # Browser state after execution
-```
-
-## Accessing Results
-
-### Basic Usage
-
-```python
-session = await agent.run()
-
-# Iterate through all executed cells
-for cell in session.cells:
- print(f"Cell {cell.execution_count}:")
- print(f" Code: {cell.source}")
- if cell.error:
- print(f" Error: {cell.error}")
- elif cell.output:
- print(f" Output: {cell.output}")
- print(f" Status: {cell.status}")
-
-# Get the last cell
-last_cell = session.get_latest_cell()
-if last_cell:
- print(f"Last output: {last_cell.output}")
-
-# Access variables from the execution namespace
-products = session.namespace.get('products', [])
-print(f"Extracted {len(products)} products")
-```
-
-### Checking Task Completion
-
-When the agent calls `done()`, the result is stored in the namespace:
-
-```python
-session = await agent.run()
-
-# Check if task was completed
-task_done = session.namespace.get('_task_done', False)
-task_result = session.namespace.get('_task_result')
-task_success = session.namespace.get('_task_success')
-
-if task_done:
- print(f"Task completed: {task_success}")
- print(f"Result: {task_result}")
-```
-
-### Getting All Outputs
-
-```python
-session = await agent.run()
-
-# Get all outputs (excluding errors)
-outputs = [cell.output for cell in session.cells if cell.output]
-
-# Get all errors
-errors = [cell.error for cell in session.cells if cell.error]
-
-# Get successful cells only
-successful_cells = [cell for cell in session.cells if cell.status == 'success']
-```
-
-## Data Models
-
-See the complete data model definitions in the [CodeAgent views source code](https://github.com/browser-use/browser-use/blob/main/browser_use/code_use/views.py).
diff --git a/docs/customize/hooks.mdx b/docs/customize/hooks.mdx
deleted file mode 100644
index de7e994b1..000000000
--- a/docs/customize/hooks.mdx
+++ /dev/null
@@ -1,118 +0,0 @@
----
-title: "Lifecycle Hooks"
-description: "Customize agent behavior with lifecycle hooks"
-icon: "Wrench"
-mode: "wide"
----
-
-Browser-Use provides lifecycle hooks that allow you to execute custom code at specific points during the agent's execution.
-Hook functions can be used to read and modify agent state while running, implement custom logic, change configuration, integrate the Agent with external applications.
-
-## Available Hooks
-
-Currently, Browser-Use provides the following hooks:
-
-| Hook | Description | When it's called |
-| --------------- | -------------------------------------------- | ------------------------------------------------------------------------------------------------- |
-| `on_step_start` | Executed at the beginning of each agent step | Before the agent processes the current state and decides on the next action |
-| `on_step_end` | Executed at the end of each agent step | After the agent has executed all the actions for the current step, before it starts the next step |
-
-```python
-await agent.run(on_step_start=..., on_step_end=...)
-```
-
-Each hook should be an `async` callable function that accepts the `agent` instance as its only parameter.
-
-### Basic Example
-
-```python
-import asyncio
-from pathlib import Path
-
-from browser_use import Agent, ChatOpenAI
-from browser_use.browser.events import ScreenshotEvent
-
-
-async def my_step_hook(agent: Agent):
- # inside a hook you can access all the state and methods under the Agent object:
- # agent.settings, agent.state, agent.task
- # agent.tools, agent.llm, agent.browser_session
- # agent.pause(), agent.resume(), agent.add_new_task(...), etc.
-
- # You also have direct access to the browser state
- state = await agent.browser_session.get_browser_state_summary()
-
- current_url = state.url
- visit_log = agent.history.urls()
- previous_url = visit_log[-2] if len(visit_log) >= 2 else None
- print(f'Agent was last on URL: {previous_url} and is now on {current_url}')
- cdp_session = await agent.browser_session.get_or_create_cdp_session()
-
- # Example: Get page HTML content
- doc = await cdp_session.cdp_client.send.DOM.getDocument(session_id=cdp_session.session_id)
- html_result = await cdp_session.cdp_client.send.DOM.getOuterHTML(
- params={'nodeId': doc['root']['nodeId']}, session_id=cdp_session.session_id
- )
- page_html = html_result['outerHTML']
-
- # Example: Take a screenshot using the event system
- screenshot_event = agent.browser_session.event_bus.dispatch(ScreenshotEvent(full_page=False))
- await screenshot_event
- result = await screenshot_event.event_result(raise_if_any=True, raise_if_none=True)
-
- # Example: pause agent execution and resume it based on some custom code
- if '/finished' in current_url:
- agent.pause()
- Path('result.txt').write_text(page_html)
- input('Saved "finished" page content to result.txt, press [Enter] to resume...')
- agent.resume()
-
-
-async def main():
- agent = Agent(
- task='Search for the latest news about AI',
- llm=ChatOpenAI(model='gpt-5-mini'),
- )
-
- await agent.run(
- on_step_start=my_step_hook,
- # on_step_end=...
- max_steps=10,
- )
-
-
-if __name__ == '__main__':
- asyncio.run(main())
-```
-
-## Data Available in Hooks
-
-When working with agent hooks, you have access to the entire `Agent` instance. Here are some useful data points you can access:
-
-- `agent.task` lets you see what the main task is, `agent.add_new_task(...)` lets you queue up a new one
-- `agent.tools` give access to the `Tools()` object and `Registry()` containing the available actions
- - `agent.tools.registry.execute_action('click', {'index': 123}, browser_session=agent.browser_session)`
-- `agent.sensitive_data` contains the sensitive data dict, which can be updated in-place to add/remove/modify items
-- `agent.settings` contains all the configuration options passed to the `Agent(...)` at init time
-- `agent.llm` gives direct access to the main LLM object (e.g. `ChatOpenAI`)
-- `agent.state` gives access to lots of internal state, including agent thoughts, outputs, actions, etc.
-- `agent.history` gives access to historical data from the agent's execution:
- - `agent.history.model_thoughts()`: Reasoning from Browser Use's model.
- - `agent.history.model_outputs()`: Raw outputs from the Browser Use's model.
- - `agent.history.model_actions()`: Actions taken by the agent
- - `agent.history.extracted_content()`: Content extracted from web pages
- - `agent.history.urls()`: URLs visited by the agent
-- `agent.browser_session` gives direct access to the `BrowserSession` and CDP interface
- - `agent.browser_session.agent_focus_target_id`: Get the current target ID the agent is focused on
- - `agent.browser_session.get_or_create_cdp_session()`: Get the current CDP session for browser interaction
- - `agent.browser_session.get_tabs()`: Get all tabs currently open
- - `agent.browser_session.get_current_page_url()`: Get the URL of the current active tab
- - `agent.browser_session.get_current_page_title()`: Get the title of the current active tab
-
-## Tips for Using Hooks
-
-- **Avoid blocking operations**: Since hooks run in the same execution thread as the agent, keep them efficient and avoid blocking operations.
-- **Use custom tools instead**: hooks are fairly advanced, most things can be implemented with [custom tools](/customize/tools/basics) instead
-- **Increase step_timeout**: If your hook is doing something that takes a long time, you can increase the `step_timeout` parameter in the `Agent(...)` constructor.
-
----
diff --git a/docs/customize/integrations/docs-mcp.mdx b/docs/customize/integrations/docs-mcp.mdx
deleted file mode 100644
index eec5b9406..000000000
--- a/docs/customize/integrations/docs-mcp.mdx
+++ /dev/null
@@ -1,93 +0,0 @@
----
-title: "Documentation MCP"
-description: "Add browser-use documentation context to Claude Code and other MCP clients"
-icon: "book"
-mode: "wide"
----
-
-## Overview
-
-The browser-use documentation MCP server provides read-only access to browser-use documentation for Claude Code and other MCP-compatible clients. This gives AI assistants deep context about the browser-use library when helping you write code.
-
- Looking to give an assistant browser-use capabilities? Check out our Browser Automation MCP.
-
-## Quick Start
-
-Add the documentation server to your coding agent:
-
-
-
-```bash
-claude mcp add --transport http browser-use https://docs.browser-use.com/mcp
-```
-
-
-Add to `~/.cursor/mcp.json`:
-
-```json
-{
- "mcpServers": {
- "browser-use-docs": {
- "url": "https://docs.browser-use.com/mcp"
- }
- }
-}
-```
-
-
-Add to `~/.codex/config.toml`:
-
-```toml
-[mcp_servers.browser-use-docs]
-url = "https://docs.browser-use.com/mcp"
-```
-
-
-Add to `~/.codeium/windsurf/mcp_config.json`:
-
-```json
-{
- "mcpServers": {
- "browser-use-docs": {
- "serverUrl": "https://docs.browser-use.com/mcp"
- }
- }
-}
-```
-
-
-
-This enables your AI coding assistant to access browser-use documentation when answering questions or helping with implementation.
-
-## What This Provides
-
-The documentation MCP server gives AI assistants access to:
-
-- API reference and usage patterns
-- Configuration options and parameters
-- Best practices and examples
-- Troubleshooting guides
-- Architecture explanations
-
-**Example interactions:**
-```
-"How do I configure custom tools in browser-use?"
-
-"What are the available agent parameters?"
-
-"Show me how to use cloud browsers."
-```
-
-Claude Code can now answer these questions using up-to-date documentation context.
-
-## How It Works
-
-The MCP server provides a read-only documentation interface:
-- Serves browser-use documentation over HTTP
-- No browser automation capabilities (see [MCP Server](/customize/integrations/mcp-server) for that)
-- Lightweight and always available
-- No API keys or configuration needed
-
-## Next Steps
-
-- Start coding with [Agent Basics](/customize/agent/basics)
diff --git a/docs/customize/integrations/mcp-server.mdx b/docs/customize/integrations/mcp-server.mdx
deleted file mode 100644
index 43ab71c8c..000000000
--- a/docs/customize/integrations/mcp-server.mdx
+++ /dev/null
@@ -1,375 +0,0 @@
----
-title: MCP Server
-description: Connect AI models to Browser Use through the Model Context Protocol
----
-
-Browser Use provides a hosted **Model Context Protocol (MCP)** server that enables AI assistants to control browser automation. Works with any HTTP-based MCP client, including Claude Code.
-
-**MCP Server URL:** `https://api.browser-use.com/mcp`
-
-This is an **HTTP-based MCP server** designed for cloud integrations and remote access. If you need a local stdio-based MCP server for Claude Desktop, use the free open-source version: `uvx browser-use --mcp`
-
-## Quick Setup
-
-### 1. Get API Key
-Get your API key from the [Browser Use Dashboard](https://cloud.browser-use.com)
-
-### 2. Connect Your AI
-
-
-
-```bash
-claude mcp add --transport http browser-use https://api.browser-use.com/mcp
-```
-
-
-Add to your Claude Desktop config file:
-
-**macOS:** `~/Library/Application Support/Claude/claude_desktop_config.json`
-**Windows:** `%APPDATA%\Claude\claude_desktop_config.json`
-
-```json
-{
- "mcpServers": {
- "browser-use": {
- "command": "npx",
- "args": [
- "mcp-remote",
- "https://api.browser-use.com/mcp",
- "--header",
- "X-Browser-Use-API-Key: your-api-key"
- ]
- }
- }
-}
-```
-
-Restart Claude Desktop after saving.
-
-
-Add to `~/.cursor/mcp.json`:
-
-```json
-{
- "mcpServers": {
- "browser-use": {
- "command": "npx",
- "args": [
- "mcp-remote",
- "https://api.browser-use.com/mcp",
- "--header",
- "X-Browser-Use-API-Key: your-api-key"
- ]
- }
- }
-}
-```
-
-
-Add to `~/.codeium/windsurf/mcp_config.json`:
-
-```json
-{
- "mcpServers": {
- "browser-use": {
- "serverUrl": "https://api.browser-use.com/mcp",
- "headers": {
- "X-Browser-Use-API-Key": "your-api-key"
- }
- }
- }
-}
-```
-
-
-**Step 1: Register an OAuth client**
-
-Call the dynamic client registration endpoint with ChatGPT's redirect URI:
-
-```bash
-curl -X POST https://api.browser-use.com/oauth/register \
- -H "Content-Type: application/json" \
- -d '{
- "client_name": "ChatGPT Integration",
- "redirect_uris": ["https://chatgpt.com/connector_platform_oauth_redirect"]
- }'
-```
-
-Save the `client_id` from the response (43-character random string).
-
-**Step 2: Configure ChatGPT**
-
-In ChatGPT, add a custom MCP connector:
-- **MCP Server URL**: `https://api.browser-use.com/mcp/chatgpt`
-- **Client ID**: Paste the `client_id` from step 1
-
-**Step 3: Authorize**
-
-ChatGPT will redirect you to Browser Use's authorization page. Sign in and grant permission.
-
-**Note:** ChatGPT uses OAuth 2.1 authentication instead of API keys. You only need to register your client once.
-
-
-
-## Available Tools
-
-The MCP server provides three tools:
-
-### `browser_task`
-Creates and runs a browser automation task.
-- **task** (required): What you want the browser to do
-- **max_steps** (optional): Max actions to take (1-10, default: 8)
-- **profile_id** (optional): UUID of the cloud profile to use for persistent authentication
-
-### `list_browser_profiles`
-Lists all available cloud browser profiles for the authenticated project. Profiles store persistent authentication (cookies, sessions) for websites requiring login.
-
-### `monitor_task`
-Checks the current status and progress of a browser automation task. Returns immediately with a snapshot of the task state.
-- **task_id** (required): UUID of the task to monitor (returned by browser_task)
-
-## Example Usage
-
-Once connected, ask your AI to perform web tasks:
-
-> "Search Google for the latest iPhone reviews and summarize the top 3 results"
-
-> "Go to Hacker News and get me the titles of the top 5 posts"
-
-> "Fill out the contact form on example.com with my information"
-
-The AI will use the browser tools automatically to complete these tasks.
-
-## Smart Features
-
-### Cloud Profiles for Authentication
-Use cloud browser profiles to maintain persistent login sessions across tasks. Profiles store cookies and authentication state for:
-- Social media (X/Twitter, LinkedIn, Facebook)
-- Email (Gmail, Outlook)
-- Online banking and shopping sites
-- Any website requiring login
-
-List available profiles with `list_browser_profiles`, then pass the `profile_id` to `browser_task`.
-
-### Real-time Task Monitoring
-Use `monitor_task` to check task progress while it's running. The tool returns immediately with the current status, latest step details, and agent reasoning. Call it repeatedly to track progress live.
-
-### Conversational Progress Summaries
-When you monitor tasks, the AI automatically interprets step data into natural language updates, explaining what the browser has completed and what it's currently working on.
-
-## Troubleshooting
-
-**Connection issues?**
-- Verify your API key is correct
-- Check you're using the right headers
-
-**Task taking too long?**
-- Check the live_url to see progress
-- Increase max_steps for complex tasks (max: 10)
-- Use clearer, more specific instructions
-
-**Need help?**
-Check our [Cloud Documentation](https://docs.cloud.browser-use.com) for detailed specifications.
-
----
-
-## Local Self-Hosted Alternative
-
-For users who want a free, self-hosted option, browser-use can run as a local MCP server on your machine. This requires your own OpenAI or Anthropic API keys but provides direct, low-level control over browser automation.
-
-### Quick Start
-
-The local MCP server runs as a stdio-based process on your machine. This is the **free, open-source option** but requires your own LLM API keys.
-
-#### Start MCP Server Manually
-
-```bash
-uvx --from 'browser-use[cli]' browser-use --mcp
-```
-
-The server will start in stdio mode, ready to accept MCP connections.
-
-#### Claude Desktop Integration
-
-The most common use case is integrating with Claude Desktop. Add this configuration to your Claude Desktop config file:
-
-**macOS:** `~/Library/Application Support/Claude/claude_desktop_config.json`
-
-```json
-{
- "mcpServers": {
- "browser-use": {
- "command": "/Users/your-username/.local/bin/uvx",
- "args": ["--from", "browser-use[cli]", "browser-use", "--mcp"],
- "env": {
- "OPENAI_API_KEY": "your-openai-api-key-here"
- }
- }
- }
-}
-```
-
-**Windows:** `%APPDATA%\Claude\claude_desktop_config.json`
-
-```json
-{
- "mcpServers": {
- "browser-use": {
- "command": "uvx",
- "args": ["--from", "browser-use[cli]", "browser-use", "--mcp"],
- "env": {
- "OPENAI_API_KEY": "your-openai-api-key-here"
- }
- }
- }
-}
-```
-
-
-**macOS/Linux PATH Issue:** Claude Desktop may not find `uvx` in your PATH. Use the full path to `uvx` instead:
-- Run `which uvx` in your terminal to find the location (usually `/Users/username/.local/bin/uvx` or `~/.local/bin/uvx`)
-- Replace `"command": "uvx"` with the full path, e.g., `"command": "/Users/your-username/.local/bin/uvx"`
-- Replace `your-username` with your actual username
-
-**CLI Extras Required:** The `--from browser-use[cli]` flag installs the CLI extras needed for MCP server support.
-
-
-#### Environment Variables
-
-You can configure browser-use through environment variables:
-
-- `OPENAI_API_KEY` - Your OpenAI API key (required)
-- `ANTHROPIC_API_KEY` - Your Anthropic API key (alternative to OpenAI)
-- `BROWSER_USE_HEADLESS` - Set to `false` to show browser window
-- `BROWSER_USE_DISABLE_SECURITY` - Set to `true` to disable browser security features
-
-### Available Tools
-
-The local MCP server exposes these low-level browser automation tools for direct control:
-
-#### Autonomous Agent Tools
-- **`retry_with_browser_use_agent`** - Run a complete browser automation task with an AI agent (use as last resort when direct control fails)
-
-#### Direct Browser Control
-- **`browser_navigate`** - Navigate to a URL
-- **`browser_click`** - Click on an element by index
-- **`browser_type`** - Type text into an element
-- **`browser_get_state`** - Get current page state and interactive elements
-- **`browser_scroll`** - Scroll the page
-- **`browser_go_back`** - Go back in browser history
-
-#### Tab Management
-- **`browser_list_tabs`** - List all open browser tabs
-- **`browser_switch_tab`** - Switch to a specific tab
-- **`browser_close_tab`** - Close a tab
-
-#### Content Extraction
-- **`browser_extract_content`** - Extract structured content from the current page
-
-#### Session Management
-- **`browser_list_sessions`** - List all active browser sessions with details
-- **`browser_close_session`** - Close a specific browser session by ID
-- **`browser_close_all`** - Close all active browser sessions
-
-### Example Usage
-
-Once configured with Claude Desktop, you can ask Claude to perform browser automation tasks:
-
-```
-"Please navigate to example.com and take a screenshot"
-
-"Search for 'browser automation' on Google and summarize the first 3 results"
-
-"Go to GitHub, find the browser-use repository, and tell me about the latest release"
-```
-
-Claude will use the MCP server to execute these tasks through browser-use.
-
-### Programmatic Usage
-
-You can also connect to the MCP server programmatically:
-
-```python
-import asyncio
-from mcp import ClientSession, StdioServerParameters
-from mcp.client.stdio import stdio_client
-
-async def use_browser_mcp():
- # Connect to browser-use MCP server
- server_params = StdioServerParameters(
- command="uvx",
- args=["--from", "browser-use[cli]", "browser-use", "--mcp"]
- )
-
- async with stdio_client(server_params) as (read, write):
- async with ClientSession(read, write) as session:
- await session.initialize()
-
- # Navigate to a website
- result = await session.call_tool(
- "browser_navigate",
- arguments={"url": "https://example.com"}
- )
- print(result.content[0].text)
-
- # Get page state
- result = await session.call_tool(
- "browser_get_state",
- arguments={"include_screenshot": True}
- )
- print("Page state retrieved!")
-
-asyncio.run(use_browser_mcp())
-```
-
-### Troubleshooting
-
-#### Common Issues
-
-**"CLI addon is not installed" Error**
-Make sure you're using `--from 'browser-use[cli]'` in your uvx command:
-```bash
-uvx --from 'browser-use[cli]' browser-use --mcp
-```
-
-**"spawn uvx ENOENT" Error (macOS/Linux)**
-Claude Desktop can't find `uvx` in its PATH. Use the full path in your config:
-- Run `which uvx` in terminal to find the location
-- Update your config to use the full path (e.g., `/Users/your-username/.local/bin/uvx`)
-
-**Browser doesn't start**
-- Check that you have Chrome/Chromium installed
-- Try setting `BROWSER_USE_HEADLESS=false` to see browser window
-- Ensure no other browser instances are using the same profile
-
-**API Key Issues**
-- Verify your `OPENAI_API_KEY` is set correctly
-- Check API key permissions and billing status
-- Try using `ANTHROPIC_API_KEY` as an alternative
-
-**Connection Issues in Claude Desktop**
-- Restart Claude Desktop after config changes
-- Check the config file syntax is valid JSON
-- Verify the file path is correct for your OS
-- Check logs at `~/Library/Logs/Claude/` (macOS) or `%APPDATA%\Claude\Logs\` (Windows)
-
-#### Debug Mode
-
-Enable debug logging by setting:
-```bash
-export BROWSER_USE_LOGGING_LEVEL=DEBUG
-uvx --from 'browser-use[cli]' browser-use --mcp
-```
-
-### Security Considerations
-
-- The MCP server has access to your browser and file system
-- Only connect trusted MCP clients
-- Be cautious with sensitive websites and data
-- Consider running in a sandboxed environment for untrusted automation
-
-### Next Steps
-
-- Explore the [examples directory](https://github.com/browser-use/browser-use/tree/main/examples/mcp) for more usage patterns
-- Check out [MCP documentation](https://modelcontextprotocol.io/) to learn more about the protocol
-- Join our [Discord](https://link.browser-use.com/discord) for support and discussions
diff --git a/docs/customize/sandbox/all-parameters.mdx b/docs/customize/sandbox/all-parameters.mdx
deleted file mode 100644
index 4c7c618c0..000000000
--- a/docs/customize/sandbox/all-parameters.mdx
+++ /dev/null
@@ -1,32 +0,0 @@
----
-title: "All Parameters"
-description: "Sandbox configuration reference"
-icon: "sliders"
----
-
-## Reference
-
-| Parameter | Type | Description | Default |
-|-----------|------|-------------|---------|
-| `BROWSER_USE_API_KEY` | `str` | API key (or env var) | Required |
-| `cloud_profile_id` | `str` | Browser profile UUID | `None` |
-| `cloud_proxy_country_code` | `str` | us, uk, fr, it, jp, au, de, fi, ca, in | `None` |
-| `cloud_timeout` | `int` | Minutes (max: 15 free, 240 paid) | `None` |
-| `on_browser_created` | `Callable` | Live URL callback | `None` |
-| `on_log` | `Callable` | Log event callback | `None` |
-| `on_result` | `Callable` | Success callback | `None` |
-| `on_error` | `Callable` | Error callback | `None` |
-
-## Example
-
-```python
-@sandbox(
- cloud_profile_id='550e8400-e29b-41d4-a716-446655440000',
- cloud_proxy_country_code='us',
- cloud_timeout=60,
- on_browser_created=lambda data: print(f'Live: {data.live_url}'),
-)
-async def task(browser: Browser):
- agent = Agent(task="your task", browser=browser, llm=ChatBrowserUse())
- await agent.run()
-```
diff --git a/docs/customize/sandbox/events.mdx b/docs/customize/sandbox/events.mdx
deleted file mode 100644
index d5bba57a6..000000000
--- a/docs/customize/sandbox/events.mdx
+++ /dev/null
@@ -1,31 +0,0 @@
----
-title: "Events"
-description: "Monitor execution with callbacks"
-icon: "bell"
----
-
-## Live Browser View
-
-```python
-@sandbox(on_browser_created=lambda data: print(f'👁️ {data.live_url}'))
-async def task(browser: Browser):
- agent = Agent(task="your task", browser=browser, llm=ChatBrowserUse())
- await agent.run()
-```
-
-## All Events
-
-```python
-from browser_use.sandbox import BrowserCreatedData, LogData, ResultData, ErrorData
-
-@sandbox(
- on_browser_created=lambda data: print(f'Live: {data.live_url}'),
- on_log=lambda log: print(f'{log.level}: {log.message}'),
- on_result=lambda result: print('Done!'),
- on_error=lambda error: print(f'Error: {error.error}'),
-)
-async def task(browser: Browser):
- # Your code
-```
-
-All callbacks can be sync or async.
diff --git a/docs/customize/sandbox/quickstart.mdx b/docs/customize/sandbox/quickstart.mdx
deleted file mode 100644
index 63121aea3..000000000
--- a/docs/customize/sandbox/quickstart.mdx
+++ /dev/null
@@ -1,51 +0,0 @@
----
-title: "Quickstart"
-description: "Run browser automation in the cloud"
-icon: "rocket"
----
-
-Sandboxes are the **easiest way to run Browser-Use in production**. We handle agents, browsers, persistence, auth, cookies, and LLMs. It's also the **fastest way to deploy** - the agent runs right next to the browser, so latency is minimal.
-
-
-Get your API key at [cloud.browser-use.com/new-api-key](https://cloud.browser-use.com/new-api-key) - new signups get $10 free.
-
-
-## Basic Example
-
-Just wrap your function with `@sandbox()`:
-
-```python
-from browser_use import Browser, sandbox, ChatBrowserUse
-from browser_use.agent.service import Agent
-
-@sandbox()
-async def my_task(browser: Browser):
- agent = Agent(task="Find the top HN post", browser=browser, llm=ChatBrowserUse())
- await agent.run()
-
-await my_task()
-```
-
-## With Cloud Parameters
-
-```python
-@sandbox(
- cloud_profile_id='your-profile-id', # Use saved cookies/auth
- cloud_proxy_country_code='us', # Bypass captchas, cloudflare, geo-restrictions
- cloud_timeout=60, # Max session time (minutes)
-)
-async def task(browser: Browser, url: str):
- agent = Agent(task=f"Visit {url}", browser=browser, llm=ChatBrowserUse())
- await agent.run()
-
-await task(url="https://example.com")
-```
-
-**What each does:**
-- `cloud_profile_id` - Use saved cookies/authentication from your cloud profile
-- `cloud_proxy_country_code` - Route through country-specific proxy for stealth (bypass captchas, Cloudflare, geo-blocks)
-- `cloud_timeout` - Maximum time browser stays open in minutes
-
----
-
-For more parameters and events, see the other tabs in this section.
diff --git a/docs/customize/skills/basics.mdx b/docs/customize/skills/basics.mdx
deleted file mode 100644
index 65a5f7bc9..000000000
--- a/docs/customize/skills/basics.mdx
+++ /dev/null
@@ -1,88 +0,0 @@
----
-title: "Basics"
-description: "Skills are your API for anything. Describe what you need in plain text, and get a production-ready API endpoint you can call repeatedly."
-icon: "sparkles"
----
-
-To learn more visit [Skills - Concepts](https://docs.cloud.browser-use.com/concepts/skills).
-
-
-## Quick Example
-
-
-Load `['*']` for all skills or specific skill IDs from [cloud.browser-use.com/skills](https://cloud.browser-use.com/skills).
-
-```python
-from browser_use import Agent, ChatBrowserUse
-
-agent = Agent(
- task='Your task',
- skills=['skill-uuid-1', 'skill-uuid-2'], # Specific skills (recommended)
- # or
- # skills=['*'], # All skills
- llm=ChatBrowserUse()
-)
-
-await agent.run()
-```
-
-
-Be careful using `*`. Each skill will contribute around 200 tokens to the prompt.
-
-
-and don't forget to add your API key to `.env`:
-
-```bash .env
-BROWSER_USE_API_KEY=your-api-key
-```
-
-
-Get your API key on [cloud](https://cloud.browser-use.com/new-api-key) - new signups get \$10 free.
-
-
-## Cookie Handling
-
-Cookies are automatically injected from your browser:
-
-```python
-agent = Agent(
- task='Post a tweet saying "Hello World"',
- skills=['tweet-poster-skill-id'],
- llm=ChatBrowserUse()
-)
-
-# Agent navigates to twitter.com, logs in if needed,
-# extracts cookies, and passes them to the skill automatically
-await agent.run()
-```
-
-If cookies are missing, the LLM sees which cookies are needed and navigates to obtain them.
-
----
-
-## Full Example
-
-```python
-from browser_use import Agent, ChatBrowserUse
-from dotenv import load_dotenv
-import asyncio
-
-load_dotenv()
-
-async def main():
- agent = Agent(
- task='Analyze TikTak and Instegram profiles',
- skills=[
- 'a582eb44-e4e2-4c55-acc2-2f5a875e35e9', # TikTak Profile Scraper
- 'f8d91c2a-3b4e-4f7d-9a1e-6c8e2d3f4a5b', # Instegram Profile Scraper
- ],
- llm=ChatBrowserUse()
- )
-
- await agent.run()
- await agent.close()
-
-asyncio.run(main())
-```
-
-Browse and create skills at [cloud.browser-use.com/skills](https://cloud.browser-use.com/skills).
diff --git a/docs/customize/tools/add.mdx b/docs/customize/tools/add.mdx
deleted file mode 100644
index 4e78f8a56..000000000
--- a/docs/customize/tools/add.mdx
+++ /dev/null
@@ -1,174 +0,0 @@
----
-title: "Add Tools"
-description: ""
-icon: "plus"
-mode: "wide"
----
-
-
-Examples:
-- deterministic clicks
-- file handling
-- calling APIs
-- human-in-the-loop
-- browser interactions
-- calling LLMs
-- get 2fa codes
-- send emails
-- Playwright integration (see [GitHub example](https://github.com/browser-use/browser-use/blob/main/examples/browser/playwright_integration.py))
-- ...
-
-Simply add `@tools.action(...)` to your function.
-
-```python
-from browser_use import Tools, Agent, ActionResult
-
-tools = Tools()
-
-@tools.action(description='Ask human for help with a question')
-async def ask_human(question: str) -> ActionResult:
- answer = input(f'{question} > ')
- return ActionResult(extracted_content=f'The human responded with: {answer}')
-```
-
-```python
-agent = Agent(task='...', llm=llm, tools=tools)
-```
-
-- **`description`** *(required)* - What the tool does, the LLM uses this to decide when to call it.
-- **`allowed_domains`** - List of domains where tool can run (e.g. `['*.example.com']`), defaults to all domains
-
-The Agent fills your function parameters based on their names, type hints, & defaults.
-
-
-**Common Pitfall**: Parameter names must match exactly! Use `browser_session: BrowserSession` (not `browser: Browser`).
-The agent injects special parameters by **name matching**, so using incorrect names will cause your tool to fail silently.
-See [Available Objects](#available-objects) below for the correct parameter names.
-
-
-
-## Available Objects
-
-Your function has access to these objects:
-
-- **`browser_session: BrowserSession`** - Current browser session for CDP access
-- **`cdp_client`** - Direct Chrome DevTools Protocol client
-- **`page_extraction_llm: BaseChatModel`** - The LLM you pass into agent. This can be used to do a custom llm call here.
-- **`file_system: FileSystem`** - File system access
-- **`available_file_paths: list[str]`** - Available files for upload/processing
-- **`has_sensitive_data: bool`** - Whether action contains sensitive data
-
-
-## Browser Interaction Examples
-
-You can use `browser_session` to directly interact with page elements using CSS selectors:
-
-```python
-from browser_use import Tools, Agent, ActionResult, BrowserSession
-
-tools = Tools()
-
-@tools.action(description='Click the submit button using CSS selector')
-async def click_submit_button(browser_session: BrowserSession):
- # Get the current page
- page = await browser_session.must_get_current_page()
-
- # Get element(s) by CSS selector
- elements = await page.get_elements_by_css_selector('button[type="submit"]')
-
- if not elements:
- return ActionResult(extracted_content='No submit button found')
-
- # Click the first matching element
- await elements[0].click()
-
- return ActionResult(extracted_content='Submit button clicked!')
-```
-
-
-Available methods on `Page`:
-- `get_elements_by_css_selector(selector: str)` - Returns list of matching elements
-- `get_element_by_prompt(prompt: str, llm)` - Returns element or None using LLM
-- `must_get_element_by_prompt(prompt: str, llm)` - Returns element or raises error
-
-Available methods on `Element`:
-- `click()` - Click the element
-- `type(text: str)` - Type text into the element
-- `get_text()` - Get element text content
-- See `browser_use/actor/element.py` for more methods
-
-## Pydantic Input
-
-You can use Pydantic for the tool parameters:
-
-```python
-from pydantic import BaseModel
-
-class Cars(BaseModel):
- name: str = Field(description='The name of the car, e.g. "Toyota Camry"')
- price: int = Field(description='The price of the car as int in USD, e.g. 25000')
-
-@tools.action(description='Save cars to file')
-def save_cars(cars: list[Cars]) -> str:
- with open('cars.json', 'w') as f:
- json.dump(cars, f)
- return f'Saved {len(cars)} cars to file'
-
-task = "find cars and save them to file"
-```
-## Domain Restrictions
-
-Limit tools to specific domains:
-
-```python
-@tools.action(
- description='Fill out banking forms',
- allowed_domains=['https://mybank.com']
-)
-def fill_bank_form(account_number: str) -> str:
- # Only works on mybank.com
- return f'Filled form for account {account_number}'
-```
-
-## Advanced Example
-
-For a comprehensive example of custom tools with Playwright integration, see:
-**[Playwright Integration Example](https://github.com/browser-use/browser-use/blob/main/examples/browser/playwright_integration.py)**
-
-This shows how to create custom actions that use Playwright's precise browser automation alongside Browser-Use.
-
-## Common Pitfalls
-
-
-The agent injects special parameters **by name**, not by type. Using incorrect parameter names is the most common cause of tools failing silently.
-
-
-### ❌ Wrong: Using `browser: Browser`
-
-```python
-from browser_use import Tools, ActionResult, Browser
-
-@tools.action('My action')
-def my_action(browser: Browser) -> ActionResult: # WRONG!
- # This will NOT receive the browser session
- pass
-```
-
-### ✅ Correct: Using `browser_session: BrowserSession`
-
-```python
-from browser_use import Tools, ActionResult, BrowserSession
-
-@tools.action('My action')
-async def my_action(browser_session: BrowserSession) -> ActionResult: # CORRECT!
- page = await browser_session.must_get_current_page()
- # Now you have access to the browser
- return ActionResult(extracted_content='Done')
-```
-
-### Key Points
-
-1. **Use `browser_session: BrowserSession`** - not `browser: Browser`
-2. **Use `async` functions** - recommended for consistency with browser operations
-3. **Return `ActionResult`** - not plain strings (though strings work, `ActionResult` provides more control)
-4. **Parameter names must match exactly** - see [Available Objects](#available-objects) for the full list of injectable parameters
diff --git a/docs/customize/tools/available.mdx b/docs/customize/tools/available.mdx
deleted file mode 100644
index 5248a9f94..000000000
--- a/docs/customize/tools/available.mdx
+++ /dev/null
@@ -1,48 +0,0 @@
----
-title: "Available Tools"
-description: "Here is the [source code](https://github.com/browser-use/browser-use/blob/main/browser_use/tools/service.py) for the default tools:"
-icon: "list"
-mode: "wide"
----
-
-
-
-
-### Navigation & Browser Control
-- **`search`** - Search queries (DuckDuckGo, Google, Bing)
-- **`navigate`** - Navigate to URLs
-- **`go_back`** - Go back in browser history
-- **`wait`** - Wait for specified seconds
-
-### Page Interaction
-- **`click`** - Click elements by their index
-- **`input`** - Input text into form fields
-- **`upload_file`** - Upload files to file inputs
-- **`scroll`** - Scroll the page up/down
-- **`find_text`** - Scroll to specific text on page
-- **`send_keys`** - Send special keys (Enter, Escape, etc.)
-
-### JavaScript Execution
-- **`evaluate`** - Execute custom JavaScript code on the page (for advanced interactions, shadow DOM, custom selectors, data extraction)
-
-### Tab Management
-- **`switch`** - Switch between browser tabs
-- **`close`** - Close browser tabs
-
-### Content Extraction
-- **`extract`** - Extract data from webpages using LLM
-
-### Visual Analysis
-- **`screenshot`** - Request a screenshot in your next browser state for visual confirmation
-
-### Form Controls
-- **`dropdown_options`** - Get dropdown option values
-- **`select_dropdown`** - Select dropdown options
-
-### File Operations
-- **`write_file`** - Write content to files
-- **`read_file`** - Read file contents
-- **`replace_file`** - Replace text in files
-
-### Task Completion
-- **`done`** - Complete the task (always available)
diff --git a/docs/customize/tools/basics.mdx b/docs/customize/tools/basics.mdx
deleted file mode 100644
index 233d0f1c0..000000000
--- a/docs/customize/tools/basics.mdx
+++ /dev/null
@@ -1,36 +0,0 @@
----
-title: "Basics"
-description: "Tools are the functions that the agent has to interact with the world."
-icon: "play"
-mode: "wide"
----
-
-
-## Quick Example
-
-
-```python
-from browser_use import Tools, ActionResult, BrowserSession
-
-tools = Tools()
-
-@tools.action('Ask human for help with a question')
-async def ask_human(question: str, browser_session: BrowserSession) -> ActionResult:
- answer = input(f'{question} > ')
- return ActionResult(extracted_content=f'The human responded with: {answer}')
-
-agent = Agent(
- task='Ask human for help',
- llm=llm,
- tools=tools,
-)
-```
-
-
-**Important**: The parameter must be named exactly `browser_session` with type `BrowserSession` (not `browser: Browser`).
-The agent injects parameters by name matching, so using the wrong name will cause your tool to fail silently.
-
-
-
-Use `browser_session` parameter in tools for deterministic [Actor](/customize/actor/basics) actions.
-
diff --git a/docs/customize/tools/remove.mdx b/docs/customize/tools/remove.mdx
deleted file mode 100644
index 804a74c55..000000000
--- a/docs/customize/tools/remove.mdx
+++ /dev/null
@@ -1,14 +0,0 @@
----
-title: "Remove Tools"
-description: "You can exclude default tools:"
-icon: "minus"
-mode: "wide"
----
-
-
-```python
-from browser_use import Tools
-
-tools = Tools(exclude_actions=['search', 'wait'])
-agent = Agent(task='...', llm=llm, tools=tools)
-```
diff --git a/docs/customize/tools/response.mdx b/docs/customize/tools/response.mdx
deleted file mode 100644
index 53f3ef6e5..000000000
--- a/docs/customize/tools/response.mdx
+++ /dev/null
@@ -1,79 +0,0 @@
----
-title: "Tool Response"
-description: ""
-icon: "arrow-turn-down-left"
-mode: "wide"
----
-
-Tools return results using `ActionResult` or simple strings.
-
-## Return Types
-
-```python
-@tools.action('My tool')
-def my_tool() -> str:
- return "Task completed successfully"
-
-@tools.action('Advanced tool')
-def advanced_tool() -> ActionResult:
- return ActionResult(
- extracted_content="Main result",
- long_term_memory="Remember this info",
- error="Something went wrong",
- is_done=True,
- success=True,
- attachments=["file.pdf"],
- )
-```
-
-## ActionResult Properties
-
-- `extracted_content` (default: `None`) - Main result passed to LLM, this is equivalent to returning a string.
-- `include_extracted_content_only_once` (default: `False`) - Set to `True` for large content to include it only once in the LLM input.
-- `long_term_memory` (default: `None`) - This is always included in the LLM input for all future steps.
-- `error` (default: `None`) - Error message, we catch exceptions and set this automatically. This is always included in the LLM input.
-- `is_done` (default: `False`) - Tool completes entire task
-- `success` (default: `None`) - Task success (only valid with `is_done=True`)
-- `attachments` (default: `None`) - Files to show user
-- `metadata` (default: `None`) - Debug/observability data
-
-## Why `extracted_content` and `long_term_memory`?
-With this you control the context for the LLM.
-
-### 1. Include short content always in context
-```python
-def simple_tool() -> str:
- return "Hello, world!" # Keep in context for all future steps
-```
-
-### 2. Show long content once, remember subset in context
-```python
-return ActionResult(
- extracted_content="[500 lines of product data...]", # Shows to LLM once
- include_extracted_content_only_once=True, # Never show full output again
- long_term_memory="Found 50 products" # Only this in future steps
-)
-```
-We save the full `extracted_content` to files which the LLM can read in future steps.
-
-### 3. Dont show long content, remember subset in context
-```python
-return ActionResult(
- extracted_content="[500 lines of product data...]", # The LLM never sees this because `long_term_memory` overrides it and `include_extracted_content_only_once` is not used
- long_term_memory="Saved user's favorite products", # This is shown to the LLM in future steps
-)
-```
-
-## Terminating the Agent
-
-Set `is_done=True` to stop the agent completely. Use when your tool finishes the entire task:
-
-```python
-@tools.action(description='Complete the task')
-def finish_task() -> ActionResult:
- return ActionResult(
- extracted_content="Task completed!",
- is_done=True, # Stops the agent
- success=True # Task succeeded
- )
-```
diff --git a/docs/development.mdx b/docs/development.mdx
deleted file mode 100644
index 18b7c432f..000000000
--- a/docs/development.mdx
+++ /dev/null
@@ -1,129 +0,0 @@
----
-title: 'Development'
-description: 'Preview changes locally to update your docs'
-mode: "wide"
----
-
-
- **Prerequisite**: Please install Node.js (version 19 or higher) before proceeding.
-
-
-Follow these steps to install and run Mintlify on your operating system:
-
-**Step 1**: Install Mintlify:
-
-
-
-```bash npm
-npm i -g mintlify
-```
-
-```bash yarn
-yarn global add mintlify
-```
-
-
-
-**Step 2**: Navigate to the docs directory (where the `mint.json` file is located) and execute the following command:
-
-```bash
-mintlify dev
-```
-
-A local preview of your documentation will be available at `http://localhost:3000`.
-
-### Custom Ports
-
-By default, Mintlify uses port 3000. You can customize the port Mintlify runs on by using the `--port` flag. To run Mintlify on port 3333, for instance, use this command:
-
-```bash
-mintlify dev --port 3333
-```
-
-If you attempt to run Mintlify on a port that's already in use, it will use the next available port:
-
-```md
-Port 3000 is already in use. Trying 3001 instead.
-```
-
-## Mintlify Versions
-
-Please note that each CLI release is associated with a specific version of Mintlify. If your local website doesn't align with the production version, please update the CLI:
-
-
-
-```bash npm
-npm i -g mintlify@latest
-```
-
-```bash yarn
-yarn global upgrade mintlify
-```
-
-
-
-## Validating Links
-
-The CLI can assist with validating reference links made in your documentation. To identify any broken links, use the following command:
-
-```bash
-mintlify broken-links
-```
-
-## Deployment
-
-
- Unlimited editors available under the [Pro
- Plan](https://mintlify.com/pricing) and above.
-
-
-If the deployment is successful, you should see the following:
-
-
-
-
-
-## Code Formatting
-
-We suggest using extensions on your IDE to recognize and format MDX. If you're a VSCode user, consider the [MDX VSCode extension](https://marketplace.visualstudio.com/items?itemName=unifiedjs.vscode-mdx) for syntax highlighting, and [Prettier](https://marketplace.visualstudio.com/items?itemName=esbenp.prettier-vscode) for code formatting.
-
-## Troubleshooting
-
-
-
-
- This may be due to an outdated version of node. Try the following:
- 1. Remove the currently-installed version of mintlify: `npm remove -g mintlify`
- 2. Upgrade to Node v19 or higher.
- 3. Reinstall mintlify: `npm install -g mintlify`
-
-
-
-
- Solution: Go to the root of your device and delete the \~/.mintlify folder. Afterwards, run `mintlify dev` again.
-
-
-
-Curious about what changed in the CLI version? [Check out the CLI changelog.](https://www.npmjs.com/package/mintlify?activeTab=versions)
-
-# Development Workflow
-
-## Branches
-- **`stable`**: Mirrors the latest stable release. This branch is updated only when a new stable release is published (every few weeks).
-- **`main`**: The primary development branch. This branch is updated frequently (every hour or more).
-
-## Tags
-- **`x.x.x`**: Stable release tags. These are created for stable releases and updated every few weeks.
-- **`x.x.xrcXX`**: Pre-release tags. These are created for unstable pre-releases and updated every Friday at 5 PM UTC.
-
-## Workflow Summary
-1. **Push to `main`**:
- - Runs pre-commit hooks to fix formatting.
- - Executes tests to ensure code quality.
-
-2. **Release a new version**:
- - If the tag is a pre-release (`x.x.xrcXX`), the package is pushed to PyPI as a pre-release.
- - If the tag is a stable release (`x.x.x`), the package is pushed to PyPI as a stable release, and the `stable` branch is updated to match the release.
-
-3. **Scheduled Pre-Releases**:
- - Every Friday at 5 PM UTC, a new pre-release tag (`x.x.xrcXX`) is created from the `main` branch and pushed to the repository.
diff --git a/docs/development/get-help.mdx b/docs/development/get-help.mdx
deleted file mode 100644
index 825af919b..000000000
--- a/docs/development/get-help.mdx
+++ /dev/null
@@ -1,11 +0,0 @@
----
-title: "Get Help"
-description: "More than 20k developers help each other"
-icon: "circle-question"
-mode: "wide"
----
-
-
-1. Check our [GitHub Issues](https://github.com/browser-use/browser-use/issues)
-2. Ask in our [Discord community](https://link.browser-use.com/discord)
-3. Get support for your enterprise with support@browser-use.com
diff --git a/docs/development/monitoring/costs.mdx b/docs/development/monitoring/costs.mdx
deleted file mode 100644
index 069c8bf60..000000000
--- a/docs/development/monitoring/costs.mdx
+++ /dev/null
@@ -1,29 +0,0 @@
----
-title: "Costs"
-description: "Track token usage and API costs for your browser automation tasks"
-icon: "dollar-sign"
-mode: "wide"
----
-
-## Cost Tracking
-
-To track token usage and costs, enable cost calculation:
-
-```python
-from browser_use import Agent, ChatBrowserUse
-
-agent = Agent(
- task="Search for latest news about AI",
- llm=ChatBrowserUse(),
- calculate_cost=True # Enable cost tracking
-)
-
-history = await agent.run()
-
-# Get usage from history
-print(f"Token usage: {history.usage}")
-
-# Or get from usage summary
-usage_summary = await agent.token_cost_service.get_usage_summary()
-print(f"Usage summary: {usage_summary}")
-```
diff --git a/docs/development/monitoring/observability.mdx b/docs/development/monitoring/observability.mdx
deleted file mode 100644
index 937bad736..000000000
--- a/docs/development/monitoring/observability.mdx
+++ /dev/null
@@ -1,101 +0,0 @@
----
-title: "Observability"
-description: "Trace Browser Use's agent execution steps and capture browser session recording"
-icon: "eye"
-mode: "wide"
----
-
-## Overview
-
-Browser Use has a native integration with [Laminar](https://laminar.sh) - open-source platform for monitoring and analyzing error patterns in AI agents.
-Laminar SDK automatically captures **agent execution steps, costs and browser session recordings** of Browser Use agent.
-Browser session recordings allows developers to see full video replay of the browser session, which is useful for debugging Browser Use agent.
-
-## Setup
-
-Install Laminar python SDK.
-```bash
-pip install lmnr
-```
-
-Register on [Laminar Cloud](https://laminar.sh) or [self-host Laminar](https://github.com/lmnr-ai/lmnr), create a project and get the project API key from your project settings. Set the `LMNR_PROJECT_API_KEY` environment variable.
-```bash
-export LMNR_PROJECT_API_KEY=
-```
-
-## Usage
-
-Then, you simply initialize the Laminar at the top of your project and both Browser Use agent traces and session recordings will be automatically captured.
-
-```python {7-9}
-from browser_use import Agent, ChatGoogle
-import asyncio
-
-from lmnr import Laminar
-import os
-
-# At initialization time, Laminar auto-instruments
-# Browser Use and any browser you use (local or remote)
-Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY'))
-
-async def main():
- agent = Agent(
- task="go to ycombinator.com, summarize 3 startups from the latest batch",
- llm=ChatGoogle(model="gemini-2.5-flash"),
- )
- await agent.run()
-
-asyncio.run(main())
-```
-
-## Viewing Traces
-
-You can view traces in the Laminar UI by going to the traces tab in your project.
-When you select a trace, you can see both the browser session recording and the agent execution steps.
-
-Timeline of the browser session is synced with the agent execution steps.
-In the trace view, you can also see the agent's current step, the tool it's using, and the tool's input and output.
-
-
-
-## Laminar
-
-To learn more about how you can trace and evaluate your Browser Use agent with Laminar, check out [Laminar docs](https://docs.lmnr.ai).
-
-## Browser Use Cloud Authentication
-
-Browser Use can sync your agent runs to the cloud for easy viewing and sharing. Authentication is required to protect your data.
-
-### Quick Setup
-
-```bash
-# Authenticate once to enable cloud sync for all future runs
-browser-use auth
-# Or if using module directly:
-python -m browser_use.cli auth
-```
-
-**Note**: Cloud sync is enabled by default. If you've disabled it, you can re-enable with `export BROWSER_USE_CLOUD_SYNC=true`.
-
-### Manual Authentication
-
-```python
-# Authenticate from code after task completion
-from browser_use import Agent
-
-agent = Agent(task="your task")
-await agent.run()
-
-# Later, authenticate for future runs
-await agent.authenticate_cloud_sync()
-```
-
-### Reset Authentication
-
-```bash
-# Force re-authentication with a different account
-rm ~/.config/browseruse/cloud_auth.json
-browser-use auth
-```
-
-**Note**: Authentication uses OAuth Device Flow - you must complete the auth process while the command is running. Links expire when the polling stops.
diff --git a/docs/development/monitoring/openlit.mdx b/docs/development/monitoring/openlit.mdx
deleted file mode 100644
index 27f015ec8..000000000
--- a/docs/development/monitoring/openlit.mdx
+++ /dev/null
@@ -1,130 +0,0 @@
----
-title: "OpenLIT"
-description: "Complete observability for Browser Use with OpenLIT tracing"
-icon: "chart-line"
-mode: "wide"
----
-
-## Overview
-
-Browser Use has native integration with [OpenLIT](https://github.com/openlit/openlit) - an open-source opentelemetry-native platform that provides complete, granular traces for every task your browser-use agent performs—from high-level agent invocations down to individual browser actions.
-
-Read more about OpenLIT in the [OpenLIT docs](https://docs.openlit.io).
-
-## Setup
-
-Install OpenLIT alongside Browser Use:
-
-```bash
-pip install openlit browser-use
-```
-
-## Usage
-
-OpenLIT provides automatic, comprehensive instrumentation with **zero code changes** beyond initialization:
-
-```python {5-6}
-from browser_use import Agent, Browser, ChatOpenAI
-import asyncio
-import openlit
-
-# Initialize OpenLIT - that's it!
-openlit.init()
-
-async def main():
- browser = Browser()
-
- llm = ChatOpenAI(
- model="gpt-4o",
- )
-
- agent = Agent(
- task="Find the number trending post on Hacker news",
- llm=llm,
- browser=browser,
- )
-
- history = await agent.run()
- return history
-
-if __name__ == "__main__":
- history = asyncio.run(main())
-```
-
-## Viewing Traces
-
-OpenLIT provides a powerful dashboard where you can:
-
-### Monitor Execution Flows
-See the complete execution tree with timing information for every span. Click on any `invoke_model` span to see the exact prompt sent to the LLM and the complete response with agent reasoning.
-
-### Track Costs and Token Usage
-- Cost breakdown by agent, task, and model
-- Token usage per LLM call with full input/output visibility
-- Compare costs across different LLM providers
-- Identify expensive prompts and optimize them
-
-### Debug Failures with Agent Thoughts
-When an automation fails, you can:
-- See exactly which step failed
-- Read the agent's thinking at the failure point
-- Check the browser state and available elements
-- Analyze whether the failure was due to bad reasoning or bad information
-- Fix the root cause with full context
-
-### Performance Optimization
-- Identify slow steps (LLM calls vs browser actions vs HTTP requests)
-- Compare execution times across runs
-- Optimize max_steps and max_actions_per_step
-- Track HTTP request latency for page navigations
-
-## Configuration
-
-### Custom OpenTelemetry Endpoint Configuration
-
-```python
-import openlit
-
-# Configure custom OTLP endpoints
-openlit.init(
- otlp_endpoint="http://localhost:4318",
- application_name="my-browser-automation",
- environment="production"
-)
-```
-
-### Environment Variables
-
-You can also configure OpenLIT via environment variables:
-
-```bash
-export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4318"
-export OTEL_SERVICE_NAME="browser-automation"
-export OTEL_ENVIRONMENT="production"
-```
-
-### Self-Hosted OpenLIT
-
-If you prefer to keep your data on-premises:
-
-```bash
-# Using Docker
-docker run -d \
- -p 4318:4318 \
- -p 3000:3000 \
- openlit/openlit:latest
-
-# Access dashboard at http://localhost:3000
-```
-
-## Integration with Existing Tools
-
-OpenLIT uses OpenTelemetry under the hood, so it integrates seamlessly with:
-- **Jaeger** - Distributed tracing visualization
-- **Prometheus** - Metrics collection and alerting
-- **Grafana** - Custom dashboards and analytics
-- **Datadog** - APM and log management
-- **New Relic** - Full-stack observability
-- **Elastic APM** - Application performance monitoring
-
-Simply configure OpenLIT to export to your existing OTLP-compatible endpoint.
diff --git a/docs/development/monitoring/telemetry.mdx b/docs/development/monitoring/telemetry.mdx
deleted file mode 100644
index 580e4097b..000000000
--- a/docs/development/monitoring/telemetry.mdx
+++ /dev/null
@@ -1,31 +0,0 @@
----
-title: "Telemetry"
-description: "Understanding Browser Use's telemetry"
-icon: "chart-mixed"
-mode: "wide"
----
-
-## Overview
-
-Browser Use is free under the MIT license. To help us continue improving the library, we collect anonymous usage data with [PostHog](https://posthog.com) . This information helps us understand how the library is used, fix bugs more quickly, and prioritize new features.
-
-
-## Opting Out
-
-You can disable telemetry by setting the environment variable:
-
-```bash .env
-ANONYMIZED_TELEMETRY=false
-```
-
-Or in your Python code:
-
-```python
-import os
-os.environ["ANONYMIZED_TELEMETRY"] = "false"
-```
-
-
- Even when enabled, telemetry has zero impact on the library's performance. Code is available in [Telemetry
- Service](https://github.com/browser-use/browser-use/tree/main/browser_use/telemetry).
-
diff --git a/docs/development/n8n-integration.mdx b/docs/development/n8n-integration.mdx
deleted file mode 100644
index d89c7b26c..000000000
--- a/docs/development/n8n-integration.mdx
+++ /dev/null
@@ -1,123 +0,0 @@
----
-title: 'n8n Integration'
-description: 'Learn how to integrate Browser Use with n8n workflows'
-mode: "wide"
----
-
-# Browser Use n8n Integration
-
-Browser Use can be integrated with [n8n](https://n8n.io), a workflow automation platform, using our community node. This integration allows you to trigger browser automation tasks directly from your n8n workflows.
-
-## Installing the n8n Community Node
-
-There are several ways to install the Browser Use community node in n8n:
-
-### Using n8n Desktop or Cloud
-
-1. Navigate to **Settings > Community Nodes**
-2. Click on **Install**
-3. Enter `n8n-nodes-browser-use` in the **Name** field
-4. Click **Install**
-
-### Using a Self-hosted n8n Instance
-
-Run the following command in your n8n installation directory:
-
-```bash
-npm install n8n-nodes-browser-use
-```
-
-### For Development
-
-If you want to develop with the n8n node:
-
-1. Clone the repository:
- ```bash
- git clone https://github.com/draphonix/n8n-nodes-browser-use.git
- ```
-2. Install dependencies:
- ```bash
- cd n8n-nodes-browser-use
- npm install
- ```
-3. Build the code:
- ```bash
- npm run build
- ```
-4. Link to your n8n installation:
- ```bash
- npm link
- ```
-5. In your n8n installation directory:
- ```bash
- npm link n8n-nodes-browser-use
- ```
-
-## Setting Up Browser Use Cloud API Credentials
-
-To use the Browser Use node in n8n, you need to configure API credentials:
-
-1. Sign up for an account at [Browser Use Cloud](https://cloud.browser-use.com/new-api-key)
-2. Navigate to the Settings or API section
-3. Generate or copy your API key
-4. In n8n, create a new credential:
- - Go to **Credentials** tab
- - Click **Create New**
- - Select **Browser Use Cloud API**
- - Enter your API key
- - Save the credential
-
-## Using the Browser Use Node
-
-Once installed, you can add the Browser Use node to your workflows:
-
-1. In your workflow editor, search for "Browser Use" in the nodes panel
-2. Add the node to your workflow
-3. Set-up the credentials
-4. Choose your saved credentials
-5. Select an operation:
- - **Run Task**: Execute a browser automation task with natural language instructions
- - **Get Task**: Retrieve task details
- - **Get Task Status**: Check task execution status
- - **Pause/Resume/Stop Task**: Control running tasks
- - **Get Task Media**: Retrieve screenshots, videos, or PDFs
- - **List Tasks**: Get a list of tasks
-
-### Example: Running a Browser Task
-
-Here's a simple example of how to use the Browser Use node to run a browser task:
-
-1. Add the Browser Use node to your workflow
-2. Select the "Run Task" operation
-3. In the "Instructions" field, enter a natural language description of what you want the browser to do, for example:
- ```
- Go to example.com, take a screenshot of the homepage, and extract all the main heading texts
- ```
-4. Optionally enable "Save Browser Data" to preserve cookies and session information
-5. Connect the node to subsequent nodes to process the results
-
-## Workflow Examples
-
-The Browser Use n8n node enables various automation scenarios:
-
-- **Web Scraping**: Extract data from websites on a schedule
-- **Form Filling**: Automate data entry across web applications
-- **Monitoring**: Check website status and capture visual evidence
-- **Report Generation**: Generate PDFs or screenshots of web dashboards
-- **Multi-step Processes**: Chain browser tasks together using session persistence
-
-## Troubleshooting
-
-If you encounter issues with the Browser Use node:
-
-- Verify your API key is valid and has sufficient credits
-- Check that your instructions are clear and specific
-- For complex tasks, consider breaking them into multiple steps
-- Refer to the [Browser Use documentation](https://docs.browser-use.com) for instruction best practices
-
-## Resources
-
-- [n8n Community Nodes Documentation](https://docs.n8n.io/integrations/community-nodes/)
-- [Browser Use Documentation](https://docs.browser-use.com)
-- [Browser Use Cloud](https://cloud.browser-use.com)
-- [n8n-nodes-browser-use GitHub Repository](https://github.com/draphonix/n8n-nodes-browser-use)
diff --git a/docs/development/roadmap.mdx b/docs/development/roadmap.mdx
deleted file mode 100644
index 4ff49e8bc..000000000
--- a/docs/development/roadmap.mdx
+++ /dev/null
@@ -1,8 +0,0 @@
----
-title: "Roadmap"
-description: "Future plans and upcoming features for Browser Use"
-icon: "road"
-mode: "wide"
----
-
-Big things coming soon!
diff --git a/docs/development/setup/contribution-guide.mdx b/docs/development/setup/contribution-guide.mdx
deleted file mode 100644
index b736a1349..000000000
--- a/docs/development/setup/contribution-guide.mdx
+++ /dev/null
@@ -1,37 +0,0 @@
----
-title: "Contribution Guide"
-description: ""
-icon: "handshake"
-mode: "wide"
----
-
-## Mission
-
-- Make developers happy
-- Do more clicks than human
-- Tell your computer what to do, and it gets it done.
-- Make agents faster and more reliable.
-
-
-## What to work on?
-
-- This space is moving fast. We have 10 ideas daily. Let's exchange some.
-- Browse our [GitHub Issues](https://github.com/browser-use/browser-use/issues)
-- Check out our most active issues on [Discord](https://discord.gg/zXJJHtJf3k)
-- Get inspiration in [`#showcase-your-work`](https://discord.com/channels/1303749220842340412/1305549200678850642) channel
-
-
-## What makes a great PR?
-
-1. Why do we need this PR?
-2. Include a demo screenshot/gif
-3. Make sure the PR passes all CI tests
-4. Keep your PR focused on a single feature
-
-
-## How?
-1. Fork the repository
-2. Create a new branch for your feature
-3. Submit a PR
-
-We are overwhelmed with Issues. Feel free to bump your issues/PRs with comments periodically if you need faster feedback.
diff --git a/docs/development/setup/local-setup.mdx b/docs/development/setup/local-setup.mdx
deleted file mode 100644
index 3737eab76..000000000
--- a/docs/development/setup/local-setup.mdx
+++ /dev/null
@@ -1,49 +0,0 @@
----
-title: "Local Setup"
-description: "We're excited to have you join our community of contributors. "
-icon: "laptop-code"
-mode: "wide"
----
-
-## Welcome to Browser Use Development!
-
-```bash
-git clone https://github.com/browser-use/browser-use
-cd browser-use
-uv sync --all-extras --dev
-# or pip install -U git+https://github.com/browser-use/browser-use.git@main
-```
-
-## Configuration
-
-Set up your environment variables:
-
-```bash
-# Copy the example environment file
-cp .env.example .env
-
-# set logging level
-# BROWSER_USE_LOGGING_LEVEL=debug
-```
-
-
-## Helper Scripts
-For common development tasks
-```bash
-# Complete setup script - installs uv, creates a venv, and installs dependencies
-./bin/setup.sh
-
-# Run all pre-commit hooks (formatting, linting, type checking)
-./bin/lint.sh
-
-# Run the core test suite that's executed in CI
-./bin/test.sh
-```
-
-
-
-## Run examples
-
-```bash
-uv run examples/simple.py
-```
diff --git a/docs/docs.json b/docs/docs.json
deleted file mode 100644
index f26a5db8c..000000000
--- a/docs/docs.json
+++ /dev/null
@@ -1,350 +0,0 @@
-{
- "$schema": "https://mintlify.com/docs.json",
- "theme": "aspen",
- "name": "Browser Use",
- "colors": {
- "primary": "#FE750E",
- "light": "#FE750E",
- "dark": "#FE750E"
- },
- "background": {
- "color": {
- "light": "#FFFFFF",
- "dark": "#09090B"
- }
- },
- "favicon": "/favicon.ico",
- "contextual": {
- "options": [
- "copy",
- "view"
- ]
- },
- "fonts": {
- "family": "Geist"
- },
- "integrations": {
- "posthog": {
- "apiKey": "phc_F8JMNjW1i2KbGUTaW1unnDdLSPCoyc52SGRU0JecaUh"
- }
- },
- "redirects": [
- {
- "source": "/customize/supported-models",
- "destination": "/supported-models"
- },
- {
- "source": "/customize/agent/supported-models",
- "destination": "/supported-models"
- },
- {
- "source": "/customize/agent-settings",
- "destination": "/customize/agent/all-parameters"
- },
- {
- "source": "/customize/browser-settings",
- "destination": "/customize/browser/all-parameters"
- },
- {
- "source": "/customize/custom-functions",
- "destination": "/customize/tools/add"
- },
- {
- "source": "/customize/system-prompt",
- "destination": "/customize/agent/all-parameters#system-messages"
- },
- {
- "source": "/development/evaluations",
- "destination": "/development/setup/contribution-guide"
- },
- {
- "source": "/cli",
- "destination": "/quickstart"
- },
- {
- "source": "/development/local-setup",
- "destination": "/development/setup/local-setup"
- },
- {
- "source": "/development/contribution-guide",
- "destination": "/development/setup/contribution-guide"
- },
- {
- "source": "/development/telemetry",
- "destination": "/development/monitoring/telemetry"
- },
- {
- "source": "/development/observability",
- "destination": "/development/monitoring/observability"
- },
- {
- "source": "/development/hooks",
- "destination": "/customize/hooks"
- },
- {
- "source": "/customize/mcp-server",
- "destination": "/customize/integrations/mcp-server"
- },
- {
- "source": "/customize/examples/chain-agents",
- "destination": "/customize/examples/follow-up-tasks"
- },
- {
- "source": "/customize/examples/fast-agent",
- "destination": "/examples/templates/fast-agent"
- },
- {
- "source": "/customize/examples/follow-up-tasks",
- "destination": "/examples/templates/follow-up-tasks"
- },
- {
- "source": "/customize/examples/parallel-browser",
- "destination": "/examples/templates/parallel-browser"
- },
- {
- "source": "/customize/examples/playwright-integration",
- "destination": "/examples/templates/playwright-integration"
- },
- {
- "source": "/customize/examples/sensitive-data",
- "destination": "/examples/templates/sensitive-data"
- },
- {
- "source": "/customize/examples/secure",
- "destination": "/examples/templates/secure"
- },
- {
- "source": "/customize/examples/more-examples",
- "destination": "/examples/templates/more-examples"
- },
- {
- "source": "/customize/examples/ad-use",
- "destination": "/examples/apps/ad-use"
- },
- {
- "source": "/customize/examples/vibetest-use",
- "destination": "/examples/apps/vibetest-use"
- },
- {
- "source": "/customize/examples/prompting-guide",
- "destination": "/customize/agent/prompting-guide"
- },
- {
- "source": "/api-reference/browser-profiles/get-browser-profile",
- "destination": "https://docs.cloud.browser-use.com/api-reference/v-2-api-current/profiles/get-profile-profiles-profile-id-get"
- },
- {
- "source": "/cloud/v2/quickstart",
- "destination": "https://docs.cloud.browser-use.com/get-started/quickstart"
- },
- {
- "source": "/cloud/v1/*",
- "destination": "https://docs.cloud.browser-use.com"
- }
- ],
- "navigation": {
- "groups": [
- {
- "group": "Get Started",
- "pages": [
- "introduction",
- "quickstart",
- "quickstart_llm",
- "supported-models",
- "production"
- ]
- },
- {
- "group": "Customize",
- "pages": [
- {
- "group": "Agent",
- "icon": "robot",
- "isDefaultOpen": true,
- "pages": [
- "customize/agent/basics",
- "customize/agent/prompting-guide",
- "customize/agent/output-format",
- "customize/agent/all-parameters"
- ]
- },
- {
- "group": "Code Agent",
- "icon": "code",
- "isDefaultOpen": false,
- "pages": [
- "customize/code-agent/basics",
- "customize/code-agent/output-format",
- "customize/code-agent/exporting",
- "customize/code-agent/all-parameters",
- "customize/code-agent/example-products"
- ]
- },
- {
- "group": "Browser",
- "icon": "window",
- "isDefaultOpen": false,
- "pages": [
- "customize/browser/basics",
- "customize/browser/real-browser",
- "customize/browser/remote",
- "customize/browser/all-parameters"
- ]
- },
- {
- "group": "Tools",
- "icon": "wrench",
- "isDefaultOpen": false,
- "pages": [
- "customize/tools/basics",
- "customize/tools/available",
- "customize/tools/add",
- "customize/tools/remove",
- "customize/tools/response"
- ]
- },
- {
- "group": "Skills",
- "icon": "sparkles",
- "isDefaultOpen": false,
- "pages": [
- "customize/skills/basics"
- ]
- },
- {
- "group": "Actor",
- "icon": "terminal",
- "isDefaultOpen": false,
- "pages": [
- "customize/actor/basics",
- "customize/actor/examples",
- "customize/actor/all-parameters"
- ]
- },
- {
- "group": "Integration",
- "icon": "plug",
- "isDefaultOpen": false,
- "pages": [
- "customize/integrations/docs-mcp",
- "customize/integrations/mcp-server"
- ]
- },
- {
- "group": "Sandbox",
- "icon": "box",
- "isDefaultOpen": false,
- "pages": [
- "customize/sandbox/quickstart",
- "customize/sandbox/events",
- "customize/sandbox/all-parameters"
- ]
- }
- ]
- },
- {
- "group": "Examples",
- "pages": [
- {
- "group": "Templates",
- "icon": "folder",
- "pages": [
- "examples/templates/fast-agent",
- "examples/templates/follow-up-tasks",
- "examples/templates/parallel-browser",
- "examples/templates/playwright-integration",
- "examples/templates/sensitive-data",
- "examples/templates/secure",
- "examples/templates/more-examples"
- ]
- },
- {
- "group": "Apps",
- "icon": "box-open",
- "pages": [
- "examples/apps/ad-use",
- "examples/apps/vibetest-use",
- "examples/apps/news-use",
- "examples/apps/msg-use"
- ]
- }
- ]
- },
- {
- "group": "Development",
- "pages": [
- {
- "group": "Contribution",
- "icon": "github",
- "isDefaultOpen": true,
- "pages": [
- "development/setup/local-setup",
- "development/setup/contribution-guide"
- ]
- },
- {
- "group": "Advanced",
- "icon": "gear",
- "isDefaultOpen": false,
- "pages": [
- "customize/hooks"
- ]
- },
- {
- "group": "Monitoring",
- "icon": "chart-mixed",
- "isDefaultOpen": false,
- "pages": [
- "development/monitoring/observability",
- "development/monitoring/openlit",
- "development/monitoring/telemetry",
- "development/monitoring/costs"
- ]
- },
- "development/get-help"
- ]
- }
- ]
- },
- "logo": {
- "light": "/logo/light.svg",
- "dark": "/logo/dark.svg"
- },
- "api": {
- "playground": {
- "display": "interactive"
- },
- "examples": {
- "languages": [
- "javascript",
- "curl",
- "python"
- ],
- "required": true
- }
- },
- "navbar": {
- "links": [
- {
- "label": "Github",
- "href": "https://github.com/browser-use/browser-use"
- },
- {
- "label": "Discord",
- "href": "https://link.browser-use.com/discord"
- }
- ],
- "primary": {
- "type": "button",
- "label": "Browser Use Cloud",
- "href": "https://cloud.browser-use.com"
- }
- },
- "footer": {
- "socials": {
- "x": "https://x.com/browser_use",
- "github": "https://github.com/browser-use/browser-use",
- "linkedin": "https://linkedin.com/company/browser-use"
- }
- }
-}
diff --git a/docs/examples/apps/ad-use.mdx b/docs/examples/apps/ad-use.mdx
deleted file mode 100644
index 5849ef769..000000000
--- a/docs/examples/apps/ad-use.mdx
+++ /dev/null
@@ -1,95 +0,0 @@
----
-title: "Ad-Use (Ad Generator)"
-description: "Generate Instagram image ads and TikTok video ads from landing pages using browser agents, Google's Nano Banana 🍌, and Veo3."
-icon: "image"
-mode: "wide"
----
-
-
-This demo requires browser-use v0.7.6+.
-
-
-
-
-
-## Features
-
-1. Agent visits your target website
-2. Captures brand name, tagline, and key selling points
-3. Takes a clean screenshot for design reference
-4. Creates scroll-stopping Instagram image ads with 🍌
-5. Generates viral TikTok video ads with Veo3
-6. Supports parallel generation of multiple ads
-
-## Setup
-
-Make sure the newest version of browser-use is installed (with screenshot functionality):
-```bash
-pip install -U browser-use
-```
-
-Export your Gemini API key, get it from: [Google AI Studio](https://makersuite.google.com/app/apikey)
-```
-export GOOGLE_API_KEY='your-google-api-key-here'
-```
-
-Clone the repo and cd into the app folder
-```bash
-git clone https://github.com/browser-use/browser-use.git
-cd browser-use/examples/apps/ad-use
-```
-
-## Normal Usage
-
-```bash
-# Basic - Generate Instagram image ad (default)
-python ad_generator.py --url https://www.apple.com/iphone-16-pro/
-
-# Generate TikTok video ad with Veo3
-python ad_generator.py --tiktok --url https://www.apple.com/iphone-16-pro/
-
-# Generate multiple ads in parallel
-python ad_generator.py --instagram --count 3 --url https://www.apple.com/iphone-16-pro/
-python ad_generator.py --tiktok --count 2 --url https://www.apple.com/iphone-16-pro/
-
-# Debug Mode - See the browser in action
-python ad_generator.py --url https://www.apple.com/iphone-16-pro/ --debug
-```
-
-## Command Line Options
-
-- `--url`: Landing page URL to analyze
-- `--instagram`: Generate Instagram image ad (default if no flag specified)
-- `--tiktok`: Generate TikTok video ad using Veo3
-- `--count N`: Generate N ads in parallel (default: 1)
-- `--debug`: Show browser window and enable verbose logging
-
-## Programmatic Usage
-```python
-import asyncio
-from ad_generator import create_ad_from_landing_page
-
-async def main():
- results = await create_ad_from_landing_page(
- url="https://your-landing-page.com",
- debug=False
- )
- print(f"Generated ads: {results}")
-
-asyncio.run(main())
-```
-
-## Output
-
-Generated ads are saved in the `output/` directory with:
-- **PNG image files** (ad_timestamp.png) - Instagram ads generated with Gemini 2.5 Flash Image
-- **MP4 video files** (ad_timestamp.mp4) - TikTok ads generated with Veo3
-- **Analysis files** (analysis_timestamp.txt) - Browser agent analysis and prompts used
-- **Landing page screenshots** (landing_page_timestamp.png) - Reference screenshots
-
-## Source Code
-
-Full implementation: [https://github.com/browser-use/browser-use/tree/main/examples/apps/ad-use](https://github.com/browser-use/browser-use/tree/main/examples/apps/ad-use)
diff --git a/docs/examples/apps/msg-use.mdx b/docs/examples/apps/msg-use.mdx
deleted file mode 100644
index 1b4a87ed6..000000000
--- a/docs/examples/apps/msg-use.mdx
+++ /dev/null
@@ -1,124 +0,0 @@
----
-title: "Msg-Use (WhatsApp Sender)"
-description: "AI-powered WhatsApp message scheduler using browser agents and Gemini. Schedule personalized messages in natural language."
-icon: "message"
-mode: "wide"
----
-
-
-This demo requires browser-use v0.7.7+.
-
-
-
-
-
-## Features
-
-1. Agent logs into WhatsApp Web automatically
-2. Parses natural language scheduling instructions
-3. Composes personalized messages using AI
-4. Schedules messages for future delivery or sends immediately
-5. Persistent session (no repeated QR scanning)
-
-## Setup
-
-Make sure the newest version of browser-use is installed:
-```bash
-pip install -U browser-use
-```
-
-Export your Gemini API key, get it from: [Google AI Studio](https://makersuite.google.com/app/apikey)
-```bash
-export GOOGLE_API_KEY='your-gemini-api-key-here'
-```
-
-Clone the repo and cd into the app folder
-```bash
-git clone https://github.com/browser-use/browser-use.git
-cd browser-use/examples/apps/msg-use
-```
-
-## Initial Login
-
-First-time setup requires QR code scanning:
-```bash
-python login.py
-```
-- Scan QR code when browser opens
-- Session will be saved for future use
-
-## Normal Usage
-
-1. **Edit your schedule** in `messages.txt`:
-```
-- Send "Hi" to Magnus on the 13.06 at 18:15
-- Tell hinge date (Camila) at 20:00 that I miss her
-- Send happy birthday message to sister on the 15.06
-- Remind mom to pick up the car next tuesday
-```
-
-2. **Test mode** - See what will be sent:
-```bash
-python scheduler.py --test
-```
-
-3. **Run scheduler**:
-```bash
-python scheduler.py
-
-# Debug Mode - See the browser in action
-python scheduler.py --debug
-
-# Auto Mode - Respond to unread messages every ~30 minutes
-python scheduler.py --auto
-```
-
-## Programmatic Usage
-
-```python
-import asyncio
-from scheduler import schedule_messages
-
-async def main():
- messages = [
- "Send hello to John at 15:30",
- "Remind Sarah about meeting tomorrow at 9am"
- ]
- await schedule_messages(messages, debug=False)
-
-asyncio.run(main())
-```
-
-## Example Output
-
-The scheduler processes natural language and outputs structured results:
-
-```json
-[
- {
- "contact": "Magnus",
- "original_message": "Hi",
- "composed_message": "Hi",
- "scheduled_time": "2025-06-13 18:15"
- },
- {
- "contact": "Camila",
- "original_message": "I miss her",
- "composed_message": "I miss you ❤️",
- "scheduled_time": "2025-06-14 20:00"
- },
- {
- "contact": "sister",
- "original_message": "happy birthday message",
- "composed_message": "Happy birthday! 🎉 Wishing you an amazing day, sis! Hope you have the best birthday ever! ❤️🎂🎈",
- "scheduled_time": "2025-06-15 09:00"
- }
-]
-```
-
-## Source Code
-
-Full implementation: [https://github.com/browser-use/browser-use/tree/main/examples/apps/msg-use](https://github.com/browser-use/browser-use/tree/main/examples/apps/msg-use)
diff --git a/docs/examples/apps/news-use.mdx b/docs/examples/apps/news-use.mdx
deleted file mode 100644
index 02f8f1a2b..000000000
--- a/docs/examples/apps/news-use.mdx
+++ /dev/null
@@ -1,133 +0,0 @@
----
-title: "News-Use (News Monitor)"
-description: "Monitor news websites and extract articles with sentiment analysis using browser agents and Google Gemini."
-icon: "newspaper"
-mode: "wide"
----
-
-
-This demo requires browser-use v0.7.7+.
-
-
-
-
-
-## Features
-
-1. Agent visits any news website automatically
-2. Finds and clicks the most recent headline article
-3. Extracts title, URL, posting time, and full content
-4. Generates short/long summaries with sentiment analysis
-5. Persistent deduplication across monitoring sessions
-
-## Setup
-
-Make sure the newest version of browser-use is installed:
-```bash
-pip install -U browser-use
-```
-
-Export your Gemini API key, get it from: [Google AI Studio](https://makersuite.google.com/app/apikey)
-```bash
-export GOOGLE_API_KEY='your-google-api-key-here'
-```
-
-Clone the repo, cd to the app
-```bash
-git clone https://github.com/browser-use/browser-use.git
-cd browser-use/examples/apps/news-use
-```
-
-## Usage Examples
-
-```bash
-# One-time extraction - Get the latest article and exit
-python news_monitor.py --once
-
-# Monitor Bloomberg continuously (default)
-python news_monitor.py
-
-# Monitor TechCrunch every 60 seconds
-python news_monitor.py --url https://techcrunch.com --interval 60
-
-# Debug mode - See browser in action
-python news_monitor.py --once --debug
-```
-
-## Output Format
-
-Articles are displayed with timestamp, sentiment emoji, and summary:
-
-```
-[2025-09-11 02:49:21] - 🟢 - Klarna's IPO raises $1.4B, benefiting existing investors
-[2025-09-11 02:54:15] - 🔴 - Tech layoffs continue as major firms cut workforce
-[2025-09-11 02:59:33] - 🟡 - Federal Reserve maintains interest rates unchanged
-```
-
-**Sentiment Indicators:**
-- 🟢 **Positive** - Good news, growth, success stories
-- 🟡 **Neutral** - Factual reporting, announcements, updates
-- 🔴 **Negative** - Challenges, losses, negative events
-
-## Data Persistence
-
-All extracted articles are saved to `news_data.json` with complete metadata:
-
-```json
-{
- "hash": "a1b2c3d4...",
- "pulled_at": "2025-09-11T02:49:21Z",
- "data": {
- "title": "Klarna's IPO pops, raising $1.4B",
- "url": "https://techcrunch.com/2025/09/11/klarna-ipo/",
- "posting_time": "12:11 PM PDT · September 10, 2025",
- "short_summary": "Klarna's IPO raises $1.4B, benefiting existing investors like Sequoia.",
- "long_summary": "Fintech Klarna successfully IPO'd on the NYSE...",
- "sentiment": "positive"
- }
-}
-```
-
-## Programmatic Usage
-
-```python
-import asyncio
-from news_monitor import extract_latest_article
-
-async def main():
- # Extract latest article from any news site
- result = await extract_latest_article(
- site_url="https://techcrunch.com",
- debug=False
- )
-
- if result["status"] == "success":
- article = result["data"]
- print(f"📰 {article['title']}")
- print(f"😊 Sentiment: {article['sentiment']}")
- print(f"📝 Summary: {article['short_summary']}")
-
-asyncio.run(main())
-```
-
-## Advanced Configuration
-
-```python
-# Custom monitoring with filters
-async def monitor_with_filters():
- while True:
- result = await extract_latest_article("https://bloomberg.com")
- if result["status"] == "success":
- article = result["data"]
- # Only alert on negative market news
- if article["sentiment"] == "negative" and "market" in article["title"].lower():
- send_alert(article)
- await asyncio.sleep(300) # Check every 5 minutes
-```
-
-## Source Code
-
-Full implementation: [https://github.com/browser-use/browser-use/tree/main/examples/apps/news-use](https://github.com/browser-use/browser-use/tree/main/examples/apps/news-use)
diff --git a/docs/examples/apps/vibetest-use.mdx b/docs/examples/apps/vibetest-use.mdx
deleted file mode 100644
index dbc829af7..000000000
--- a/docs/examples/apps/vibetest-use.mdx
+++ /dev/null
@@ -1,95 +0,0 @@
----
-title: "Vibetest-Use (Automated QA)"
-description: "Run multi-agent Browser-Use tests to catch UI bugs, broken links, and accessibility issues before they ship."
-icon: "bug"
-mode: "wide"
----
-
-
-Requires **browser-use < v0.5.0** and Playwright Chromium. Currently getting an update to v0.7.6+.
-
-
-
-
-
-## Features
-
-1. Launches multiple headless (or visible) Browser-Use agents in parallel
-2. Crawls your site and records screenshots, broken links & a11y issues
-3. Works on production URLs *and* `localhost` dev servers
-4. Simple natural-language prompts via MCP in Cursor / Claude Code
-
-## Quick Start
-
-```bash
-
-# 1. Clone repo
-git clone https://github.com/browser-use/vibetest-use.git
-cd vibetest-use
-
-# 2. Create & activate env
-uv venv --python 3.11
-source .venv/bin/activate
-
-# 3. Install project
-uv pip install -e .
-
-# 4. Install browser runtime once
-uvx browser-use install
-```
-
-### 1) Claude Code
-
-```bash
-# Register the MCP server
-claude mcp add vibetest /full/path/to/vibetest-use/.venv/bin/vibetest-mcp \
- -e GOOGLE_API_KEY="your_api_key"
-
-# Inside a Claude chat
-> /mcp
-# ⎿ MCP Server Status
-# • vibetest: connected
-```
-
-### 2) Cursor (manual MCP entry)
-
-1. Open **Settings → MCP**
-2. Click **Add Server** and paste:
-
-```json
-{
- "mcpServers": {
- "vibetest": {
- "command": "/full/path/to/vibetest-use/.venv/bin/vibetest-mcp",
- "env": {
- "GOOGLE_API_KEY": "your_api_key"
- }
- }
- }
-}
-```
-
-## Basic Prompts
-```
-> Vibetest my website with 5 agents: browser-use.com
-> Run vibetest on localhost:3000
-> Run a headless vibetest on localhost:8080 with 10 agents
-```
-
-### Parameters
-* **URL** – any `https` or `http` host or `localhost:port`
-* **Agents** – `3` by default; more agents = deeper coverage
-* **Headless** – say *headless* to hide the browser, omit to watch it live
-
-## Requirements
-
-* Python 3.11+
-* Google API key (Gemini flash used for analysis)
-* Cursor / Claude with MCP support
-
-## Source Code
-
-Full implementation: [https://github.com/browser-use/vibetest-use](https://github.com/browser-use/vibetest-use)
diff --git a/docs/examples/templates/fast-agent.mdx b/docs/examples/templates/fast-agent.mdx
deleted file mode 100644
index 1bdd72c98..000000000
--- a/docs/examples/templates/fast-agent.mdx
+++ /dev/null
@@ -1,97 +0,0 @@
----
-title: "Fast Agent"
-description: "Optimize agent performance for maximum speed and efficiency."
-icon: "bolt"
-mode: "wide"
----
-
-```python
-import asyncio
-from dotenv import load_dotenv
-load_dotenv()
-
-from browser_use import Agent, BrowserProfile
-
-# Speed optimization instructions for the model
-SPEED_OPTIMIZATION_PROMPT = """
-Speed optimization instructions:
-- Be extremely concise and direct in your responses
-- Get to the goal as quickly as possible
-- Use multi-action sequences whenever possible to reduce steps
-"""
-
-
-async def main():
- # 1. Use fast LLM - Llama 4 on Groq for ultra-fast inference
- from browser_use import ChatGroq
-
- llm = ChatGroq(
- model='meta-llama/llama-4-maverick-17b-128e-instruct',
- temperature=0.0,
- )
- # from browser_use import ChatGoogle
-
- # llm = ChatGoogle(model='gemini-flash-lite-latest')
-
- # 2. Create speed-optimized browser profile
- browser_profile = BrowserProfile(
- minimum_wait_page_load_time=0.1,
- wait_between_actions=0.1,
- headless=False,
- )
-
- # 3. Define a speed-focused task
- task = """
- 1. Go to reddit https://www.reddit.com/search/?q=browser+agent&type=communities
- 2. Click directly on the first 5 communities to open each in new tabs
- 3. Find out what the latest post is about, and switch directly to the next tab
- 4. Return the latest post summary for each page
- """
-
- # 4. Create agent with all speed optimizations
- agent = Agent(
- task=task,
- llm=llm,
- flash_mode=True, # Disables thinking in the LLM output for maximum speed
- browser_profile=browser_profile,
- extend_system_message=SPEED_OPTIMIZATION_PROMPT,
- )
-
- await agent.run()
-
-
-if __name__ == '__main__':
- asyncio.run(main())
-```
-
-## Speed Optimization Techniques
-
-### 1. Fast LLM Models
-```python
-# Groq - Ultra-fast inference
-from browser_use import ChatGroq
-llm = ChatGroq(model='meta-llama/llama-4-maverick-17b-128e-instruct')
-
-# Google Gemini Flash - Optimized for speed
-from browser_use import ChatGoogle
-llm = ChatGoogle(model='gemini-flash-lite-latest')
-```
-
-### 2. Browser Optimizations
-```python
-browser_profile = BrowserProfile(
- minimum_wait_page_load_time=0.1, # Reduce wait time
- wait_between_actions=0.1, # Faster action execution
- headless=True, # No GUI overhead
-)
-```
-
-### 3. Agent Optimizations
-```python
-agent = Agent(
- task=task,
- llm=llm,
- flash_mode=True, # Skip LLM thinking process
- extend_system_message=SPEED_PROMPT, # Optimize LLM behavior
-)
-```
diff --git a/docs/examples/templates/follow-up-tasks.mdx b/docs/examples/templates/follow-up-tasks.mdx
deleted file mode 100644
index e971341b4..000000000
--- a/docs/examples/templates/follow-up-tasks.mdx
+++ /dev/null
@@ -1,48 +0,0 @@
----
-title: "Follow up tasks"
-description: "Follow up tasks with the same browser session."
-icon: "link"
-mode: "wide"
----
-
-## Chain Agent Tasks
-
-Keep your browser session alive and chain multiple tasks together. Perfect for conversational workflows or multi-step processes.
-
-```python
-from dotenv import load_dotenv
-
-from browser_use import Agent, Browser
-
-
-load_dotenv()
-
-import asyncio
-
-
-async def main():
- browser = Browser(keep_alive=True)
-
- await browser.start()
-
- agent = Agent(task='search for browser-use.', browser_session=browser)
- await agent.run(max_steps=2)
- agent.add_new_task('return the title of first result')
- await agent.run()
-
- await browser.kill()
-
-asyncio.run(main())
-```
-
-## How It Works
-
-1. **Persistent Browser**: `BrowserProfile(keep_alive=True)` prevents browser from closing between tasks
-2. **Task Chaining**: Use `agent.add_new_task()` to add follow-up tasks
-3. **Context Preservation**: Agent maintains memory and browser state across tasks
-4. **Interactive Flow**: Perfect for conversational interfaces
-5. **Break down long flows**: If you have very long flows, you can keep the browser alive and send new agents to it.
-
-
-The browser session remains active throughout the entire chain, preserving all cookies, local storage, and page state.
-
diff --git a/docs/examples/templates/more-examples.mdx b/docs/examples/templates/more-examples.mdx
deleted file mode 100644
index 53d9596dc..000000000
--- a/docs/examples/templates/more-examples.mdx
+++ /dev/null
@@ -1,14 +0,0 @@
----
-title: "More Examples"
-description: "Explore additional examples and use cases on GitHub."
-icon: "arrow-up-right-from-square"
-mode: "wide"
----
-
-### 🔗 Browse All Examples
-
-**[View Complete Examples Directory →](https://github.com/browser-use/browser-use/tree/main/examples)**
-
-### 🤝 Contributing Examples
-
-Have a great use case? **[Submit a pull request](https://github.com/browser-use/browser-use/pulls)** with your example!
diff --git a/docs/examples/templates/parallel-browser.mdx b/docs/examples/templates/parallel-browser.mdx
deleted file mode 100644
index 5a6a3b296..000000000
--- a/docs/examples/templates/parallel-browser.mdx
+++ /dev/null
@@ -1,47 +0,0 @@
----
-title: "Parallel Agents"
-description: "Run multiple agents in parallel with separate browser instances"
-icon: "copy"
----
-
-```python
-import asyncio
-from browser_use import Agent, Browser, ChatOpenAI
-
-async def main():
- # Create 3 separate browser instances
- browsers = [
- Browser(
- user_data_dir=f'./temp-profile-{i}',
- headless=False,
- )
- for i in range(3)
- ]
-
- # Create 3 agents with different tasks
- agents = [
- Agent(
- task='Search for "browser automation" on Google',
- browser=browsers[0],
- llm=ChatOpenAI(model='gpt-4.1-mini'),
- ),
- Agent(
- task='Search for "AI agents" on DuckDuckGo',
- browser=browsers[1],
- llm=ChatOpenAI(model='gpt-4.1-mini'),
- ),
- Agent(
- task='Visit Wikipedia and search for "web scraping"',
- browser=browsers[2],
- llm=ChatOpenAI(model='gpt-4.1-mini'),
- ),
- ]
-
- # Run all agents in parallel
- tasks = [agent.run() for agent in agents]
- results = await asyncio.gather(*tasks, return_exceptions=True)
-
- print('🎉 All agents completed!')
-```
-
-> **Note:** This is experimental, and agents might conflict each other.
diff --git a/docs/examples/templates/playwright-integration.mdx b/docs/examples/templates/playwright-integration.mdx
deleted file mode 100644
index 216a8a944..000000000
--- a/docs/examples/templates/playwright-integration.mdx
+++ /dev/null
@@ -1,393 +0,0 @@
----
-title: "Playwright Integration"
-description: "Advanced example showing Playwright and Browser-Use working together"
-icon: "wand-magic-sparkles"
-mode: "wide"
----
-
-## Key Features
-
-1. Browser-Use and Playwright sharing the same Chrome instance via CDP
-2. Take actions with Playwright and continue with Browser-Use actions
-3. Let the agent call Playwright functions like screenshot or click on selectors for deterministic steps
-
-
-## Installation
-
-```bash
-uv pip install playwright aiohttp
-```
-
-## Full Example
-
-```python
-import asyncio
-import os
-import subprocess
-import sys
-import tempfile
-
-from pydantic import BaseModel, Field
-
-# Check for required dependencies first - before other imports
-try:
- import aiohttp # type: ignore
- from playwright.async_api import Browser, Page, async_playwright # type: ignore
-except ImportError as e:
- print(f'❌ Missing dependencies for this example: {e}')
- print('This example requires: playwright aiohttp')
- print('Install with: uv add playwright aiohttp')
- print('Also run: playwright install chromium')
- sys.exit(1)
-
-from browser_use import Agent, BrowserSession, ChatOpenAI, Tools
-from browser_use.agent.views import ActionResult
-
-# Global Playwright browser instance - shared between custom actions
-playwright_browser: Browser | None = None
-playwright_page: Page | None = None
-
-
-# Custom action parameter models
-class PlaywrightFillFormAction(BaseModel):
- """Parameters for Playwright form filling action."""
-
- customer_name: str = Field(..., description='Customer name to fill')
- phone_number: str = Field(..., description='Phone number to fill')
- email: str = Field(..., description='Email address to fill')
- size_option: str = Field(..., description='Size option (small/medium/large)')
-
-
-class PlaywrightScreenshotAction(BaseModel):
- """Parameters for Playwright screenshot action."""
-
- filename: str = Field(default='playwright_screenshot.png', description='Filename for screenshot')
- quality: int | None = Field(default=None, description='JPEG quality (1-100), only for .jpg/.jpeg files')
-
-
-class PlaywrightGetTextAction(BaseModel):
- """Parameters for getting text using Playwright selectors."""
-
- selector: str = Field(..., description='CSS selector to get text from. Use "title" for page title.')
-
-
-async def start_chrome_with_debug_port(port: int = 9222):
- """
- Start Chrome with remote debugging enabled.
- Returns the Chrome process.
- """
- # Create temporary directory for Chrome user data
- user_data_dir = tempfile.mkdtemp(prefix='chrome_cdp_')
-
- # Chrome launch command
- chrome_paths = [
- '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', # macOS
- '/usr/bin/google-chrome', # Linux
- '/usr/bin/chromium-browser', # Linux Chromium
- 'chrome', # Windows/PATH
- 'chromium', # Generic
- ]
-
- chrome_exe = None
- for path in chrome_paths:
- if os.path.exists(path) or path in ['chrome', 'chromium']:
- try:
- # Test if executable works
- test_proc = await asyncio.create_subprocess_exec(
- path, '--version', stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
- )
- await test_proc.wait()
- chrome_exe = path
- break
- except Exception:
- continue
-
- if not chrome_exe:
- raise RuntimeError('❌ Chrome not found. Please install Chrome or Chromium.')
-
- # Chrome command arguments
- cmd = [
- chrome_exe,
- f'--remote-debugging-port={port}',
- f'--user-data-dir={user_data_dir}',
- '--no-first-run',
- '--no-default-browser-check',
- '--disable-extensions',
- 'about:blank', # Start with blank page
- ]
-
- # Start Chrome process
- process = await asyncio.create_subprocess_exec(*cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-
- # Wait for Chrome to start and CDP to be ready
- cdp_ready = False
- for _ in range(20): # 20 second timeout
- try:
- async with aiohttp.ClientSession() as session:
- async with session.get(
- f'http://localhost:{port}/json/version', timeout=aiohttp.ClientTimeout(total=1)
- ) as response:
- if response.status == 200:
- cdp_ready = True
- break
- except Exception:
- pass
- await asyncio.sleep(1)
-
- if not cdp_ready:
- process.terminate()
- raise RuntimeError('❌ Chrome failed to start with CDP')
-
- return process
-
-
-async def connect_playwright_to_cdp(cdp_url: str):
- """
- Connect Playwright to the same Chrome instance Browser-Use is using.
- This enables custom actions to use Playwright functions.
- """
- global playwright_browser, playwright_page
-
- playwright = await async_playwright().start()
- playwright_browser = await playwright.chromium.connect_over_cdp(cdp_url)
-
- # Get or create a page
- if playwright_browser and playwright_browser.contexts and playwright_browser.contexts[0].pages:
- playwright_page = playwright_browser.contexts[0].pages[0]
- elif playwright_browser:
- context = await playwright_browser.new_context()
- playwright_page = await context.new_page()
-
-
-# Create custom tools that use Playwright functions
-tools = Tools()
-
-
-@tools.registry.action(
- "Fill out a form using Playwright's precise form filling capabilities. This uses Playwright selectors for reliable form interaction.",
- param_model=PlaywrightFillFormAction,
-)
-async def playwright_fill_form(params: PlaywrightFillFormAction, browser_session: BrowserSession):
- """
- Custom action that uses Playwright to fill forms with high precision.
- This demonstrates how to create Browser-Use actions that leverage Playwright's capabilities.
- """
- try:
- if not playwright_page:
- return ActionResult(error='Playwright not connected. Run setup first.')
-
- # Filling form with Playwright's precise selectors
-
- # Wait for form to be ready and fill basic fields
- await playwright_page.wait_for_selector('input[name="custname"]', timeout=10000)
- await playwright_page.fill('input[name="custname"]', params.customer_name)
- await playwright_page.fill('input[name="custtel"]', params.phone_number)
- await playwright_page.fill('input[name="custemail"]', params.email)
-
- # Handle size selection - check if it's a select dropdown or radio buttons
- size_select = playwright_page.locator('select[name="size"]')
- size_radio = playwright_page.locator(f'input[name="size"][value="{params.size_option}"]')
-
- if await size_select.count() > 0:
- # It's a select dropdown
- await playwright_page.select_option('select[name="size"]', params.size_option)
- elif await size_radio.count() > 0:
- # It's radio buttons
- await playwright_page.check(f'input[name="size"][value="{params.size_option}"]')
- else:
- raise ValueError(f'Could not find size input field for value: {params.size_option}')
-
- # Get form data to verify it was filled
- form_data = {}
- form_data['name'] = await playwright_page.input_value('input[name="custname"]')
- form_data['phone'] = await playwright_page.input_value('input[name="custtel"]')
- form_data['email'] = await playwright_page.input_value('input[name="custemail"]')
-
- # Get size value based on input type
- if await size_select.count() > 0:
- form_data['size'] = await playwright_page.input_value('select[name="size"]')
- else:
- # For radio buttons, find the checked one
- checked_radio = playwright_page.locator('input[name="size"]:checked')
- if await checked_radio.count() > 0:
- form_data['size'] = await checked_radio.get_attribute('value')
- else:
- form_data['size'] = 'none selected'
-
- success_msg = f'✅ Form filled successfully with Playwright: {form_data}'
-
- return ActionResult(
- extracted_content=success_msg, include_in_memory=True, long_term_memory=f'Filled form with: {form_data}'
- )
-
- except Exception as e:
- error_msg = f'❌ Playwright form filling failed: {str(e)}'
- return ActionResult(error=error_msg)
-
-
-@tools.registry.action(
- "Take a screenshot using Playwright's screenshot capabilities with high quality and precision.",
- param_model=PlaywrightScreenshotAction,
-)
-async def playwright_screenshot(params: PlaywrightScreenshotAction, browser_session: BrowserSession):
- """
- Custom action that uses Playwright's advanced screenshot features.
- """
- try:
- if not playwright_page:
- return ActionResult(error='Playwright not connected. Run setup first.')
-
- # Taking screenshot with Playwright
-
- # Use Playwright's screenshot with full page capture
- screenshot_kwargs = {'path': params.filename, 'full_page': True}
-
- # Add quality parameter only for JPEG files
- if params.quality is not None and params.filename.lower().endswith(('.jpg', '.jpeg')):
- screenshot_kwargs['quality'] = params.quality
-
- await playwright_page.screenshot(**screenshot_kwargs)
-
- success_msg = f'✅ Screenshot saved as {params.filename} using Playwright'
-
- return ActionResult(
- extracted_content=success_msg, include_in_memory=True, long_term_memory=f'Screenshot saved: {params.filename}'
- )
-
- except Exception as e:
- error_msg = f'❌ Playwright screenshot failed: {str(e)}'
- return ActionResult(error=error_msg)
-
-
-@tools.registry.action(
- "Extract text from elements using Playwright's powerful CSS selectors and XPath support.", param_model=PlaywrightGetTextAction
-)
-async def playwright_get_text(params: PlaywrightGetTextAction, browser_session: BrowserSession):
- """
- Custom action that uses Playwright's advanced text extraction with CSS selectors and XPath.
- """
- try:
- if not playwright_page:
- return ActionResult(error='Playwright not connected. Run setup first.')
-
- # Extracting text with Playwright selectors
-
- # Handle special selectors
- if params.selector.lower() == 'title':
- # Use page.title() for title element
- text_content = await playwright_page.title()
- result_data = {
- 'selector': 'title',
- 'text_content': text_content,
- 'inner_text': text_content,
- 'tag_name': 'TITLE',
- 'is_visible': True,
- }
- else:
- # Use Playwright's robust element selection and text extraction
- element = playwright_page.locator(params.selector).first
-
- if await element.count() == 0:
- error_msg = f'❌ No element found with selector: {params.selector}'
- return ActionResult(error=error_msg)
-
- text_content = await element.text_content()
- inner_text = await element.inner_text()
-
- # Get additional element info
- tag_name = await element.evaluate('el => el.tagName')
- is_visible = await element.is_visible()
-
- result_data = {
- 'selector': params.selector,
- 'text_content': text_content,
- 'inner_text': inner_text,
- 'tag_name': tag_name,
- 'is_visible': is_visible,
- }
-
- success_msg = f'✅ Extracted text using Playwright: {result_data}'
-
- return ActionResult(
- extracted_content=str(result_data),
- include_in_memory=True,
- long_term_memory=f'Extracted from {params.selector}: {result_data["text_content"]}',
- )
-
- except Exception as e:
- error_msg = f'❌ Playwright text extraction failed: {str(e)}'
- return ActionResult(error=error_msg)
-
-
-async def main():
- """
- Main function demonstrating Browser-Use + Playwright integration with custom actions.
- """
- print('🚀 Advanced Playwright + Browser-Use Integration with Custom Actions')
-
- chrome_process = None
- try:
- # Step 1: Start Chrome with CDP debugging
- chrome_process = await start_chrome_with_debug_port()
- cdp_url = 'http://localhost:9222'
-
- # Step 2: Connect Playwright to the same Chrome instance
- await connect_playwright_to_cdp(cdp_url)
-
- # Step 3: Create Browser-Use session connected to same Chrome
- browser_session = BrowserSession(cdp_url=cdp_url)
-
- # Step 4: Create AI agent with our custom Playwright-powered tools
- agent = Agent(
- task="""
- Please help me demonstrate the integration between Browser-Use and Playwright:
-
- 1. First, navigate to https://httpbin.org/forms/post
- 2. Use the 'playwright_fill_form' action to fill the form with these details:
- - Customer name: "Alice Johnson"
- - Phone: "555-9876"
- - Email: "alice@demo.com"
- - Size: "large"
- 3. Take a screenshot using the 'playwright_screenshot' action and save it as "form_demo.png"
- 4. Extract the title of the page using 'playwright_get_text' action with selector "title"
- 5. Finally, submit the form and tell me what happened
-
- This demonstrates how Browser-Use AI can orchestrate tasks while using Playwright's precise capabilities for specific operations.
- """,
- llm=ChatOpenAI(model='gpt-4.1-mini'),
- tools=tools, # Our custom tools with Playwright actions
- browser_session=browser_session,
- )
-
- print('🎯 Starting AI agent with custom Playwright actions...')
-
- # Step 5: Run the agent - it will use both Browser-Use actions and our custom Playwright actions
- result = await agent.run()
-
- # Keep browser open briefly to see results
- print(f'✅ Integration demo completed! Result: {result}')
- await asyncio.sleep(2) # Brief pause to see results
-
- except Exception as e:
- print(f'❌ Error: {e}')
- raise
-
- finally:
- # Clean up resources
- if playwright_browser:
- await playwright_browser.close()
-
- if chrome_process:
- chrome_process.terminate()
- try:
- await asyncio.wait_for(chrome_process.wait(), 5)
- except TimeoutError:
- chrome_process.kill()
-
- print('✅ Cleanup complete')
-
-
-if __name__ == '__main__':
- # Run the advanced integration demo
- asyncio.run(main())
-```
diff --git a/docs/examples/templates/secure.mdx b/docs/examples/templates/secure.mdx
deleted file mode 100644
index 8a8af634e..000000000
--- a/docs/examples/templates/secure.mdx
+++ /dev/null
@@ -1,65 +0,0 @@
----
-title: "Secure Setup"
-description: "Azure OpenAI with data privacy and security configuration."
-icon: "shield-check"
-mode: "wide"
----
-
-## Secure Setup with Azure OpenAI
-
-Enterprise-grade security with Azure OpenAI, data privacy protection, and restricted browser access.
-
-```python
-import asyncio
-import os
-from dotenv import load_dotenv
-load_dotenv()
-os.environ['ANONYMIZED_TELEMETRY'] = 'false'
-from browser_use import Agent, BrowserProfile, ChatAzureOpenAI
-
-# Azure OpenAI configuration
-api_key = os.getenv('AZURE_OPENAI_KEY')
-azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
-llm = ChatAzureOpenAI(model='gpt-4.1-mini', api_key=api_key, azure_endpoint=azure_endpoint)
-
-# Secure browser configuration
-browser_profile = BrowserProfile(
- allowed_domains=['*google.com', 'browser-use.com'],
- enable_default_extensions=False
-)
-
-# Sensitive data filtering
-sensitive_data = {'company_name': 'browser-use'}
-
-# Create secure agent
-agent = Agent(
- task='Find the founders of the sensitive company_name',
- llm=llm,
- browser_profile=browser_profile,
- sensitive_data=sensitive_data
-)
-
-async def main():
- await agent.run(max_steps=10)
-
-asyncio.run(main())
-```
-
-## Security Features
-
-**Azure OpenAI:**
-- NOT used to train OpenAI models
-- NOT shared with other customers
-- Hosted entirely within Azure
-- 30-day retention (or zero with Limited Access Program)
-
-**Browser Security:**
-- `allowed_domains`: Restrict navigation to trusted sites
-- `enable_default_extensions=False`: Disable potentially dangerous extensions
-- `sensitive_data`: Filter sensitive information from LLM input
-
-
-
-
-For enterprise deployments contact support@browser-use.com.
-
diff --git a/docs/examples/templates/sensitive-data.mdx b/docs/examples/templates/sensitive-data.mdx
deleted file mode 100644
index 940fd6e64..000000000
--- a/docs/examples/templates/sensitive-data.mdx
+++ /dev/null
@@ -1,46 +0,0 @@
----
-title: "Sensitive Data"
-description: "Handle secret information securely and avoid sending PII & passwords to the LLM."
-icon: "shield"
-mode: "wide"
----
-
-
-```python
-import os
-from browser_use import Agent, Browser, ChatOpenAI
-os.environ['ANONYMIZED_TELEMETRY'] = "false"
-
-
-company_credentials = {'x_user': 'your-real-username@email.com', 'x_pass': 'your-real-password123'}
-
-# Option 1: Secrets available for all websites
-sensitive_data = company_credentials
-
-# Option 2: Secrets per domain with regex
-# sensitive_data = {
-# 'https://*.example-staging.com': company_credentials,
-# 'http*://test.example.com': company_credentials,
-# 'https://example.com': company_credentials,
-# 'https://google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'},
-# }
-
-
-agent = Agent(
- task='Log into example.com with username x_user and password x_pass',
- sensitive_data=sensitive_data,
- use_vision=False, # Disable vision to prevent LLM seeing sensitive data in screenshots
- llm=ChatOpenAI(model='gpt-4.1-mini'),
-)
-async def main():
-await agent.run()
-```
-
-## How it Works
-1. **Text Filtering**: The LLM only sees placeholders (`x_user`, `x_pass`), we filter your sensitive data from the input text.
-2. **DOM Actions**: Real values are injected directly into form fields after the LLM call
-
-## Best Practices
-- Use `Browser(allowed_domains=[...])` to restrict navigation
-- Set `use_vision=False` to prevent screenshot leaks
-- Use `storage_state='./auth.json'` for login cookies instead of passwords when possible
diff --git a/docs/favicon.ico b/docs/favicon.ico
deleted file mode 100644
index 39e9faefc..000000000
Binary files a/docs/favicon.ico and /dev/null differ
diff --git a/docs/favicon.svg b/docs/favicon.svg
deleted file mode 100644
index 59f98742e..000000000
--- a/docs/favicon.svg
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/docs/images/browser-use-banner-dark.png b/docs/images/browser-use-banner-dark.png
deleted file mode 100644
index 5e49537d8..000000000
Binary files a/docs/images/browser-use-banner-dark.png and /dev/null differ
diff --git a/docs/images/browser-use-banner.png b/docs/images/browser-use-banner.png
deleted file mode 100644
index d2f48a9ef..000000000
Binary files a/docs/images/browser-use-banner.png and /dev/null differ
diff --git a/docs/images/checks-passed.png b/docs/images/checks-passed.png
deleted file mode 100644
index bc4d9ea71..000000000
Binary files a/docs/images/checks-passed.png and /dev/null differ
diff --git a/docs/images/cloud-banner-dark.png b/docs/images/cloud-banner-dark.png
deleted file mode 100644
index 276c5326c..000000000
Binary files a/docs/images/cloud-banner-dark.png and /dev/null differ
diff --git a/docs/images/cloud-banner-js.png b/docs/images/cloud-banner-js.png
deleted file mode 100644
index 7d5f68d4d..000000000
Binary files a/docs/images/cloud-banner-js.png and /dev/null differ
diff --git a/docs/images/cloud-banner-python.png b/docs/images/cloud-banner-python.png
deleted file mode 100644
index 953a6b26c..000000000
Binary files a/docs/images/cloud-banner-python.png and /dev/null differ
diff --git a/docs/images/cloud-banner.png b/docs/images/cloud-banner.png
deleted file mode 100644
index 78bf5e12f..000000000
Binary files a/docs/images/cloud-banner.png and /dev/null differ
diff --git a/docs/images/laminar.png b/docs/images/laminar.png
deleted file mode 100644
index 5d6f3cdf6..000000000
Binary files a/docs/images/laminar.png and /dev/null differ
diff --git a/docs/introduction.mdx b/docs/introduction.mdx
deleted file mode 100644
index 28e7ceda1..000000000
--- a/docs/introduction.mdx
+++ /dev/null
@@ -1,31 +0,0 @@
----
-title: "Introduction"
-description: "Automate browser tasks in plain text. "
-icon: "book-open"
----
-
-
-
-
-
-
- Open-source Python library.
-
-
- Scale up with our cloud.
-
-
-
diff --git a/docs/logo/dark.svg b/docs/logo/dark.svg
deleted file mode 100644
index bce2e5d82..000000000
--- a/docs/logo/dark.svg
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
-
-
diff --git a/docs/logo/light.svg b/docs/logo/light.svg
deleted file mode 100644
index 8ac36dcf9..000000000
--- a/docs/logo/light.svg
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
-
-
diff --git a/docs/production.mdx b/docs/production.mdx
deleted file mode 100644
index 61f93de9f..000000000
--- a/docs/production.mdx
+++ /dev/null
@@ -1,66 +0,0 @@
----
-title: "Going to Production"
-description: "Deploy your local Browser-Use code to production with `@sandbox` wrapper, and scale to millions of agents"
-icon: "rocket-launch"
----
-
-## 1. Basic Deployment
-
-Wrap your existing local code with `@sandbox()`:
-
-```python
-from browser_use import Browser, sandbox, ChatBrowserUse
-from browser_use.agent.service import Agent
-import asyncio
-
-@sandbox()
-async def my_task(browser: Browser):
- agent = Agent(task="Find the top HN post", browser=browser, llm=ChatBrowserUse())
- await agent.run()
-
-# Just call it like any async function
-asyncio.run(my_task())
-```
-
-
-That's it - your code now runs in production at scale. We handle agents, browsers, persistence, and LLMs.
-
-## 2. Add Proxies for Stealth
-
-Use country-specific proxies to bypass captchas, Cloudflare, and geo-restrictions:
-
-```python
-@sandbox(cloud_proxy_country_code='us') # Route through US proxy
-async def stealth_task(browser: Browser):
- agent = Agent(task="Your task", browser=browser, llm=ChatBrowserUse())
- await agent.run()
-```
-
-## 3. Sync Local Cookies to Cloud
-
-To use your local authentication in production:
-
-**First**, create an API key at [cloud.browser-use.com/new-api-key](https://cloud.browser-use.com/new-api-key) or follow the instruction on [Cloud - Profiles](https://cloud.browser-use.com/dashboard/settings?tab=profiles)
-
-**Then**, sync your local cookies:
-
-```bash
-export BROWSER_USE_API_KEY=your_key && curl -fsSL https://browser-use.com/profile.sh | sh
-```
-
-This opens a browser where you log into your accounts. You'll get a `profile_id`.
-
-**Finally**, use it in production:
-
-```python
-@sandbox(cloud_profile_id='your-profile-id')
-async def authenticated_task(browser: Browser):
- agent = Agent(task="Your authenticated task", browser=browser, llm=ChatBrowserUse())
- await agent.run()
-```
-
-Your cloud browser is already logged in!
-
----
-
-For more sandbox parameters and events, see [Sandbox Quickstart](/customize/sandbox/quickstart).
diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx
deleted file mode 100644
index 560663921..000000000
--- a/docs/quickstart.mdx
+++ /dev/null
@@ -1,163 +0,0 @@
----
-title: "Human Quickstart"
-description: ""
-icon: "rocket"
----
-
-To get started with Browser Use you need to install the package and create an `.env` file with your API key.
-
-
-`ChatBrowserUse` offers the [fastest and most cost-effective models](https://browser-use.com/posts/speed-matters/), completing tasks 3-5x faster. Get started with $10 of [free LLM credits](https://cloud.browser-use.com/new-api-key).
-
-
-## 1. Installing Browser-Use
-
-
-```bash create environment
-pip install uv
-uv venv --python 3.12
-```
-```bash activate environment
-source .venv/bin/activate
-# On Windows use `.venv\Scripts\activate`
-```
-```bash install browser-use & chromium
-uv pip install browser-use
-uvx browser-use install
-```
-
-
-## 2. Choose your favorite LLM
-Create a `.env` file and add your API key.
-
-
-We recommend using ChatBrowserUse which is optimized for browser automation tasks (highest accuracy + fastest speed + lowest token cost). Don't have one? We give you **$10** to try it out [here](https://cloud.browser-use.com/new-api-key).
-
-
-```bash .env
-touch .env
-```
-
-On Windows, use `echo. > .env`
-
-Then add your API key to the file.
-
-
-```bash Browser Use
-# add your key to .env file
-BROWSER_USE_API_KEY=
-# Get 10$ of free credits at https://cloud.browser-use.com/new-api-key
-```
-```bash Google
-# add your key to .env file
-GOOGLE_API_KEY=
-# Get your free Gemini API key from https://aistudio.google.com/app/u/1/apikey?pli=1.
-```
-```bash OpenAI
-# add your key to .env file
-OPENAI_API_KEY=
-```
-```bash Anthropic
-# add your key to .env file
-ANTHROPIC_API_KEY=
-```
-
-
-See [Supported Models](/supported-models) for more.
-
-## 3. Run your first agent
-
-
-```python Browser Use
-from browser_use import Agent, ChatBrowserUse
-from dotenv import load_dotenv
-import asyncio
-
-load_dotenv()
-
-async def main():
- llm = ChatBrowserUse()
- task = "Find the number 1 post on Show HN"
- agent = Agent(task=task, llm=llm)
- await agent.run()
-
-if __name__ == "__main__":
- asyncio.run(main())
-```
-```python Google
-from browser_use import Agent, ChatGoogle
-from dotenv import load_dotenv
-import asyncio
-
-load_dotenv()
-
-async def main():
- llm = ChatGoogle(model="gemini-flash-latest")
- task = "Find the number 1 post on Show HN"
- agent = Agent(task=task, llm=llm)
- await agent.run()
-
-if __name__ == "__main__":
- asyncio.run(main())
-```
-```python OpenAI
-from browser_use import Agent, ChatOpenAI
-from dotenv import load_dotenv
-import asyncio
-
-load_dotenv()
-
-async def main():
- llm = ChatOpenAI(model="gpt-4.1-mini")
- task = "Find the number 1 post on Show HN"
- agent = Agent(task=task, llm=llm)
- await agent.run()
-
-if __name__ == "__main__":
- asyncio.run(main())
-```
-```python Anthropic
-from browser_use import Agent, ChatAnthropic
-from dotenv import load_dotenv
-import asyncio
-
-load_dotenv()
-
-async def main():
- llm = ChatAnthropic(model='claude-sonnet-4-0', temperature=0.0)
- task = "Find the number 1 post on Show HN"
- agent = Agent(task=task, llm=llm)
- await agent.run()
-
-if __name__ == "__main__":
- asyncio.run(main())
-```
-
-
- Custom browsers can be configured in one line. Check out browsers for more.
-
-## 4. Going to Production
-
-Sandboxes are the **easiest way to run Browser-Use in production**. We handle agents, browsers, persistence, auth, cookies, and LLMs. It's also the **fastest way to deploy** - the agent runs right next to the browser, so latency is minimal.
-
-To run in production with authentication, just add `@sandbox` to your function:
-
-```python
-import asyncio
-from browser_use import Browser, sandbox, ChatBrowserUse
-from browser_use.agent.service import Agent
-
-@sandbox(cloud_profile_id='your-profile-id')
-async def production_task(browser: Browser):
- agent = Agent(
- task="Your authenticated task",
- browser=browser,
- llm=ChatBrowserUse(),
- )
- await agent.run()
-
-if __name__ == "__main__":
- asyncio.run(production_task())
-```
-
-See [Going to Production](/production) for how to sync your cookies to the cloud.
diff --git a/docs/quickstart_llm.mdx b/docs/quickstart_llm.mdx
deleted file mode 100644
index 4b8e2b483..000000000
--- a/docs/quickstart_llm.mdx
+++ /dev/null
@@ -1,11 +0,0 @@
----
-title: "LLM Quickstart"
-description: ""
-icon: "brain"
----
-
-
-
-1. Copy all content [🔗 from here](https://github.com/browser-use/browser-use/blob/main/AGENTS.md) (~9k tokens)
-2. Paste it into your project
-3. Prompt your coding agent (Cursor, Claude, etc.) "Help me get started with Browser Use"
diff --git a/docs/supported-models.mdx b/docs/supported-models.mdx
deleted file mode 100644
index a38a28054..000000000
--- a/docs/supported-models.mdx
+++ /dev/null
@@ -1,455 +0,0 @@
----
-title: "Supported Models"
-description: "Choose your favorite LLM"
-icon: "microchip-ai"
-
----
-
-### Browser Use [example](https://github.com/browser-use/browser-use/blob/main/examples/models/browser_use_llm.py)
-
-`ChatBrowserUse()` is our optimized in-house model, matching the accuracy of top models while completing tasks **3-5x** faster. [See our blog post→](https://browser-use.com/posts/speed-matters)
-
-```python
-from browser_use import Agent, ChatBrowserUse
-
-# Initialize the model (defaults to bu-latest)
-llm = ChatBrowserUse()
-
-# Or use the premium model
-llm = ChatBrowserUse(model='bu-2-0')
-
-# Create agent with the model
-agent = Agent(
- task="...", # Your task here
- llm=llm
-)
-```
-
-Required environment variables:
-
-```bash .env
-BROWSER_USE_API_KEY=
-```
-
-Get your API key from the [Browser Use Cloud](https://cloud.browser-use.com/new-api-key). New signups get \$10 free credit via OAuth or \$1 via email.
-
-#### Available Models
-
-- `bu-latest` or `bu-1-0`: Default model
-- `bu-2-0`: Latest premium model with improved capabilities
-
-#### Pricing
-
-ChatBrowserUse offers competitive pricing per 1 million tokens:
-
-**bu-1-0 / bu-latest (Default)**
-
-| Token Type | Price per 1M tokens |
-|------------|---------------------|
-| Input tokens | $0.20 |
-| Cached tokens | $0.02 |
-| Output tokens | $2.00 |
-
-**bu-2-0 (Premium)**
-
-| Token Type | Price per 1M tokens |
-|------------|---------------------|
-| Input tokens | $0.60 |
-| Cached tokens | $0.06 |
-| Output tokens | $3.50 |
-
-
-### Google Gemini [example](https://github.com/browser-use/browser-use/blob/main/examples/models/gemini.py)
-
-
-`GEMINI_API_KEY` is deprecated and should be named `GOOGLE_API_KEY` as of 2025-05.
-
-
-```python
-from browser_use import Agent, ChatGoogle
-from dotenv import load_dotenv
-
-# Read GOOGLE_API_KEY into env
-load_dotenv()
-
-# Initialize the model
-llm = ChatGoogle(model='gemini-flash-latest')
-
-# Create agent with the model
-agent = Agent(
- task="Your task here",
- llm=llm
-)
-```
-
-Required environment variables:
-
-```bash .env
-GOOGLE_API_KEY=
-```
-
-
-### OpenAI [example](https://github.com/browser-use/browser-use/blob/main/examples/models/gpt-4.1.py)
-
-`O3` model is recommended for best accuracy.
-
-```python
-from browser_use import Agent, ChatOpenAI
-
-# Initialize the model
-llm = ChatOpenAI(
- model="o3",
-)
-
-# Create agent with the model
-agent = Agent(
- task="...", # Your task here
- llm=llm
-)
-```
-
-Required environment variables:
-
-```bash .env
-OPENAI_API_KEY=
-```
-
-
- You can use any OpenAI compatible model by passing the model name to the
- `ChatOpenAI` class using a custom URL (or any other parameter that would go
- into the normal OpenAI API call).
-
-
-### Anthropic [example](https://github.com/browser-use/browser-use/blob/main/examples/models/claude-4-sonnet.py)
-
-```python
-from browser_use import Agent, ChatAnthropic
-
-# Initialize the model
-llm = ChatAnthropic(
- model="claude-sonnet-4-0",
-)
-
-# Create agent with the model
-agent = Agent(
- task="...", # Your task here
- llm=llm
-)
-```
-
-And add the variable:
-
-```bash .env
-ANTHROPIC_API_KEY=
-```
-
-### Azure OpenAI [example](https://github.com/browser-use/browser-use/blob/main/examples/models/azure_openai.py)
-
-```python
-from browser_use import Agent, ChatAzureOpenAI
-from pydantic import SecretStr
-import os
-
-# Initialize the model
-llm = ChatAzureOpenAI(
- model="o4-mini",
-)
-
-# Create agent with the model
-agent = Agent(
- task="...", # Your task here
- llm=llm
-)
-```
-
-Required environment variables:
-
-```bash .env
-AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
-AZURE_OPENAI_API_KEY=
-```
-
-#### Using the Responses API (for GPT-5.1 Codex models)
-
-
-Azure OpenAI now requires `api_version >= 2025-03-01-preview` for certain models like `gpt-5.1-codex-mini`.
-These models only support the [Responses API](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/responses) instead of the Chat Completions API.
-
-
-Browser Use automatically detects and uses the Responses API for these models:
-- `gpt-5.1-codex`, `gpt-5.1-codex-mini`, `gpt-5.1-codex-max`
-- `gpt-5-codex`, `codex-mini-latest`
-- `computer-use-preview`
-
-```python
-from browser_use import Agent, ChatAzureOpenAI
-
-# Auto-detection (recommended) - uses Responses API for gpt-5.1-codex-mini
-llm = ChatAzureOpenAI(
- model="gpt-5.1-codex-mini",
- api_version="2025-03-01-preview", # Required for Responses API
-)
-
-# Or explicitly enable/disable Responses API for any model
-llm = ChatAzureOpenAI(
- model="gpt-4o",
- api_version="2025-03-01-preview",
- use_responses_api=True, # Force Responses API (True/False/'auto')
-)
-
-agent = Agent(
- task="...",
- llm=llm
-)
-```
-
-The `use_responses_api` parameter accepts:
-- `'auto'` (default): Automatically uses Responses API for models that require it
-- `True`: Force use of the Responses API
-- `False`: Force use of the Chat Completions API
-
-### AWS Bedrock [example](https://github.com/browser-use/browser-use/blob/main/examples/models/aws.py)
-
-AWS Bedrock provides access to multiple model providers through a single API. We support both a general AWS Bedrock client and provider-specific convenience classes.
-
-#### General AWS Bedrock (supports all providers)
-
-```python
-from browser_use import Agent, ChatAWSBedrock
-
-# Works with any Bedrock model (Anthropic, Meta, AI21, etc.)
-llm = ChatAWSBedrock(
- model="anthropic.claude-3-5-sonnet-20240620-v1:0", # or any Bedrock model
- aws_region="us-east-1",
-)
-
-# Create agent with the model
-agent = Agent(
- task="Your task here",
- llm=llm
-)
-```
-
-#### Anthropic Claude via AWS Bedrock (convenience class)
-
-```python
-from browser_use import Agent, ChatAnthropicBedrock
-
-# Anthropic-specific class with Claude defaults
-llm = ChatAnthropicBedrock(
- model="anthropic.claude-3-5-sonnet-20240620-v1:0",
- aws_region="us-east-1",
-)
-
-# Create agent with the model
-agent = Agent(
- task="Your task here",
- llm=llm
-)
-```
-
-#### AWS Authentication
-
-Required environment variables:
-
-```bash .env
-AWS_ACCESS_KEY_ID=
-AWS_SECRET_ACCESS_KEY=
-AWS_DEFAULT_REGION=us-east-1
-```
-
-You can also use AWS profiles or IAM roles instead of environment variables. The implementation supports:
-
-- Environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_DEFAULT_REGION`)
-- AWS profiles and credential files
-- IAM roles (when running on EC2)
-- Session tokens for temporary credentials
-- AWS SSO authentication (`aws_sso_auth=True`)
-
-## Groq [example](https://github.com/browser-use/browser-use/blob/main/examples/models/llama4-groq.py)
-
-```python
-from browser_use import Agent, ChatGroq
-
-llm = ChatGroq(model="meta-llama/llama-4-maverick-17b-128e-instruct")
-
-agent = Agent(
- task="Your task here",
- llm=llm
-)
-```
-
-Required environment variables:
-
-```bash .env
-GROQ_API_KEY=
-```
-
-## Oracle Cloud Infrastructure (OCI) [example](https://github.com/browser-use/browser-use/blob/main/examples/models/oci_models.py)
-
-OCI provides access to various generative AI models including Meta Llama, Cohere, and other providers through their Generative AI service.
-
-```python
-from browser_use import Agent, ChatOCIRaw
-
-# Initialize the OCI model
-llm = ChatOCIRaw(
- model_id="ocid1.generativeaimodel.oc1.us-chicago-1.amaaaaaask7dceya...",
- service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
- compartment_id="ocid1.tenancy.oc1..aaaaaaaayeiis5uk2nuubznrekd...",
- provider="meta", # or "cohere"
- temperature=0.7,
- max_tokens=800,
- top_p=0.9,
- auth_type="API_KEY",
- auth_profile="DEFAULT"
-)
-
-# Create agent with the model
-agent = Agent(
- task="Your task here",
- llm=llm
-)
-```
-
-Required setup:
-1. Set up OCI configuration file at `~/.oci/config`
-2. Have access to OCI Generative AI models in your tenancy
-3. Install the OCI Python SDK: `uv add oci` or `pip install oci`
-
-Authentication methods supported:
-- `API_KEY`: Uses API key authentication (default)
-- `INSTANCE_PRINCIPAL`: Uses instance principal authentication
-- `RESOURCE_PRINCIPAL`: Uses resource principal authentication
-
-## Ollama
-
-1. Install Ollama: https://github.com/ollama/ollama
-2. Run `ollama serve` to start the server
-3. In a new terminal, install the model you want to use: `ollama pull llama3.1:8b` (this has 4.9GB)
-
-```python
-from browser_use import Agent, ChatOllama
-
-llm = ChatOllama(model="llama3.1:8b")
-```
-
-## Langchain
-
-[Example](https://github.com/browser-use/browser-use/blob/main/examples/models/langchain) on how to use Langchain with Browser Use.
-
-## Qwen [example](https://github.com/browser-use/browser-use/blob/main/examples/models/qwen.py)
-
-Currently, only `qwen-vl-max` is recommended for Browser Use. Other Qwen models, including `qwen-max`, have issues with the action schema format.
-Smaller Qwen models may return incorrect action schema formats (e.g., `actions: [{"navigate": "google.com"}]` instead of `[{"navigate": {"url": "google.com"}}]`). If you want to use other models, add concrete examples of the correct action format to your prompt.
-
-```python
-from browser_use import Agent, ChatOpenAI
-from dotenv import load_dotenv
-import os
-
-load_dotenv()
-
-# Get API key from https://modelstudio.console.alibabacloud.com/?tab=playground#/api-key
-api_key = os.getenv('ALIBABA_CLOUD')
-base_url = 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1'
-
-llm = ChatOpenAI(model='qwen-vl-max', api_key=api_key, base_url=base_url)
-
-agent = Agent(
- task="Your task here",
- llm=llm,
- use_vision=True
-)
-```
-
-Required environment variables:
-
-```bash .env
-ALIBABA_CLOUD=
-```
-
-## ModelScope [example](https://github.com/browser-use/browser-use/blob/main/examples/models/modelscope_example.py)
-
-```python
-from browser_use import Agent, ChatOpenAI
-from dotenv import load_dotenv
-import os
-
-load_dotenv()
-
-# Get API key from https://www.modelscope.cn/docs/model-service/API-Inference/intro
-api_key = os.getenv('MODELSCOPE_API_KEY')
-base_url = 'https://api-inference.modelscope.cn/v1/'
-
-llm = ChatOpenAI(model='Qwen/Qwen2.5-VL-72B-Instruct', api_key=api_key, base_url=base_url)
-
-agent = Agent(
- task="Your task here",
- llm=llm,
- use_vision=True
-)
-```
-
-Required environment variables:
-
-```bash .env
-MODELSCOPE_API_KEY=
-```
-
-### Vercel AI Gateway [example](https://github.com/browser-use/browser-use/blob/main/examples/models/vercel_ai_gateway.py)
-
-Vercel AI Gateway provides an OpenAI-compatible API endpoint that acts as a proxy to various AI providers, with features like rate limiting, caching, and monitoring.
-
-To see all available models, visit: https://ai-gateway.vercel.sh/v1/models
-
-```python
-from browser_use import Agent, ChatVercel
-from dotenv import load_dotenv
-import os
-
-load_dotenv()
-
-# Get API key (https://vercel.com/ai-gateway)
-api_key = os.getenv('VERCEL_API_KEY')
-if not api_key:
- raise ValueError('VERCEL_API_KEY is not set')
-
-# Basic usage
-llm = ChatVercel(
- model='openai/gpt-4o',
- api_key=api_key,
-)
-
-# With provider options - control which providers are used and in what order
-# This will try Vertex AI first, then fall back to Anthropic if Vertex fails
-llm_with_provider_options = ChatVercel(
- model='anthropic/claude-sonnet-4',
- api_key=api_key,
- provider_options={
- 'gateway': {
- 'order': ['vertex', 'anthropic'] # Try Vertex AI first, then Anthropic
- }
- },
-)
-
-agent = Agent(
- task="Your task here",
- llm=llm
-)
-```
-
-Required environment variables:
-
-```bash .env
-VERCEL_API_KEY=
-```
-
-## Other models (DeepSeek, Novita, X...)
-
-We support all other models that can be called via OpenAI compatible API. We are open to PRs for more providers.
-
-**Examples available:**
-- [DeepSeek](https://github.com/browser-use/browser-use/blob/main/examples/models/deepseek-chat.py)
-- [Novita](https://github.com/browser-use/browser-use/blob/main/examples/models/novita.py)
-- [OpenRouter](https://github.com/browser-use/browser-use/blob/main/examples/models/openrouter.py)
diff --git a/examples/browser/custom_headers.py b/examples/browser/custom_headers.py
new file mode 100644
index 000000000..d1f4b5cd5
--- /dev/null
+++ b/examples/browser/custom_headers.py
@@ -0,0 +1,104 @@
+"""
+Custom HTTP Headers via a custom Watchdog.
+
+Creates a custom watchdog that listens to TabCreatedEvent and injects
+custom HTTP headers into every new tab using Network.setExtraHTTPHeaders.
+
+Note: The CDP EventRegistry only supports one handler per event method,
+so registering directly on Target.attachedToTarget would replace the
+internal SessionManager handler. Using the browser-use event system
+(TabCreatedEvent) avoids this and fires after the target is fully set up.
+
+Note: Network.setExtraHTTPHeaders is a full replacement (not additive).
+
+Verified by navigating to https://httpbin.org/headers in a new tab.
+"""
+
+import asyncio
+import os
+import sys
+from typing import ClassVar
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from bubus import BaseEvent
+from dotenv import load_dotenv
+
+load_dotenv()
+
+from browser_use import Agent, Browser, ChatBrowserUse
+from browser_use.browser.events import AgentFocusChangedEvent, TabCreatedEvent
+from browser_use.browser.watchdog_base import BaseWatchdog
+
+CUSTOM_HEADERS = {
+ 'X-Custom-Auth': 'Bearer my-secret-token',
+ 'X-Request-Source': 'browser-use-agent',
+ 'X-Trace-Id': 'example-trace-12345',
+}
+
+
+class CustomHeadersWatchdog(BaseWatchdog):
+ """Injects custom HTTP headers on every new tab and focus change.
+
+ Listens to both TabCreatedEvent (new tabs) and AgentFocusChangedEvent
+ (tab switches) because headers are bound to a CDP session, and sessions
+ can be recreated on cross-origin navigations or tab switches.
+ """
+
+ LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [TabCreatedEvent, AgentFocusChangedEvent]
+ EMITS: ClassVar[list[type[BaseEvent]]] = []
+
+ async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:
+ """Set extra headers when a new tab is created."""
+ try:
+ await self.browser_session.set_extra_headers(CUSTOM_HEADERS, target_id=event.target_id)
+ except Exception as e:
+ self.logger.debug(f'Could not set headers on {event.target_id[:8]}: {e}')
+
+ async def on_AgentFocusChangedEvent(self, event: AgentFocusChangedEvent) -> None:
+ """Re-apply headers when the agent switches to a different tab."""
+ try:
+ await self.browser_session.set_extra_headers(CUSTOM_HEADERS, target_id=event.target_id)
+ except Exception as e:
+ self.logger.debug(f'Could not set headers on {event.target_id[:8]}: {e}')
+
+
+async def main():
+ browser = Browser(headless=False)
+
+ # Start the browser so watchdogs are initialized
+ await browser.start()
+
+ # Attach our custom watchdog to the browser session
+ CustomHeadersWatchdog.model_rebuild()
+ headers_watchdog = CustomHeadersWatchdog(event_bus=browser.event_bus, browser_session=browser)
+ headers_watchdog.attach_to_session()
+
+ # The watchdog only fires for tabs created AFTER registration.
+ # To apply headers to an already-existing tab, call set_extra_headers():
+ #
+ # await browser.set_extra_headers(CUSTOM_HEADERS)
+ # await browser.set_extra_headers(CUSTOM_HEADERS, target_id=some_target_id)
+ #
+ # Keep in mind that setExtraHTTPHeaders is a full replacement – each
+ # call overwrites all previously set extra headers on that target.
+
+ # Run the agent – open httpbin.org/headers in a new tab so the
+ # watchdog fires and injects the custom headers.
+ agent = Agent(
+ task=(
+ 'Open https://httpbin.org/headers in two different tabs and extract the full JSON response. '
+ 'Look for the custom headers X-Custom-Auth, X-Request-Source, and X-Trace-Id in the output and compare the results.'
+ ),
+ llm=ChatBrowserUse(model='bu-2-0'),
+ browser=browser,
+ )
+
+ result = await agent.run()
+ print(result.final_result())
+
+ await browser.kill()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/browser/real_browser.py b/examples/browser/real_browser.py
index 0342204f0..0405dc430 100644
--- a/examples/browser/real_browser.py
+++ b/examples/browser/real_browser.py
@@ -1,3 +1,7 @@
+"""
+Connect to your existing Chrome browser so it's logged into your websites
+"""
+
import asyncio
import os
import sys
@@ -10,20 +14,30 @@ load_dotenv()
from browser_use import Agent, Browser, ChatGoogle
-# Connect to your existing Chrome browser
-browser = Browser(
- executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
- user_data_dir='~/Library/Application Support/Google/Chrome',
- profile_directory='Default',
-)
+
+def select_chrome_profile() -> str | None:
+ """Prompt user to select a Chrome profile."""
+ profiles = Browser.list_chrome_profiles()
+ if not profiles:
+ return None
+
+ print('Available Chrome profiles:')
+ for i, p in enumerate(profiles, 1):
+ print(f' {i}. {p["name"]}')
+
+ while True:
+ choice = input(f'\nSelect profile (1-{len(profiles)}): ').strip()
+ if choice.isdigit() and 1 <= int(choice) <= len(profiles):
+ return profiles[int(choice) - 1]['directory']
+ print('Invalid choice, try again.')
-# NOTE: You have to close all Chrome browsers before running this example so that we can launch chrome in debug mode.
async def main():
- # save storage state
+ profile = select_chrome_profile()
+ browser = Browser.from_system_chrome(profile_directory=profile)
+
agent = Agent(
llm=ChatGoogle(model='gemini-flash-latest'),
- # Google blocks this approach, so we use a different search engine
task='go to amazon.com and search for pens to draw on whiteboards',
browser=browser,
)
diff --git a/examples/browser/save_cookies.py b/examples/browser/save_cookies.py
index eef6565f5..efbea95c4 100644
--- a/examples/browser/save_cookies.py
+++ b/examples/browser/save_cookies.py
@@ -1,3 +1,10 @@
+"""
+Export cookies and storage state from your real Chrome browser
+
+This allows you to save your authenticated sessions for later use
+without needing to connect to the Chrome profile every time
+"""
+
import asyncio
import os
import sys
@@ -10,17 +17,32 @@ load_dotenv()
from browser_use import Browser
-# Connect to your existing Chrome browser
-browser = Browser(
- executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
- user_data_dir='~/Library/Application Support/Google/Chrome',
- profile_directory='Default',
-)
+
+def select_chrome_profile() -> str | None:
+ """Prompt user to select a Chrome profile."""
+ profiles = Browser.list_chrome_profiles()
+ if not profiles:
+ return None
+
+ print('Available Chrome profiles:')
+ for i, p in enumerate(profiles, 1):
+ print(f' {i}. {p["name"]}')
+
+ while True:
+ choice = input(f'\nSelect profile (1-{len(profiles)}): ').strip()
+ if choice.isdigit() and 1 <= int(choice) <= len(profiles):
+ return profiles[int(choice) - 1]['directory']
+ print('Invalid choice, try again.')
async def main():
+ profile = select_chrome_profile()
+ browser = Browser.from_system_chrome(profile_directory=profile)
+
await browser.start()
- await browser.export_storage_state('storage_state3.json')
+ await browser.export_storage_state('storage_state.json')
+ await browser.stop()
+ print('Storage state exported to storage_state.json')
if __name__ == '__main__':
diff --git a/examples/code_agent/extract_products.py b/examples/code_agent/extract_products.py
deleted file mode 100644
index 03dc9c6a2..000000000
--- a/examples/code_agent/extract_products.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""
-Example: Using code-use mode to extract products from multiple pages.
-
-This example demonstrates the new code-use mode, which works like a Jupyter notebook
-where the LLM writes Python code that gets executed in a persistent namespace.
-
-The agent can:
-- Navigate to pages
-- Extract data using JavaScript
-- Combine results from multiple pages
-- Save data to files
-- Export the session as a Jupyter notebook
-
-This solves the problem from the brainstorm where extraction of multiple items
-was difficult with the extract tool alone.
-"""
-
-import asyncio
-
-from lmnr import Laminar
-
-from browser_use.code_use import CodeAgent
-
-Laminar.initialize()
-
-
-async def main():
- task = """
-
-Go to https://www.flipkart.com. Continue collecting products from Flipkart in the following categories. I need approximately 50 products from:\n\n1. Books & Media (books, stationery) - 15 products\n2. Sports & Fitness (equipment, clothing, accessories) - 15 products \n3. Beauty & Personal Care (cosmetics, skincare, grooming) - 10 products\nAnd 2 other categories you find interesting.\nNavigate to these categories and collect products with:\n- Product URL (working link)\n- Product name/description\n- Actual price (MRP)\n- Deal price (current selling price) \n- Discount percentage\n\nFocus on products with good discounts and clear pricing. Target around 40 products total from these three categories.
-
- """
- # Create code-use agent (uses ChatBrowserUse automatically)
- agent = CodeAgent(
- task=task,
- max_steps=30,
- )
-
- try:
- # Run the agent
- print('Running code-use agent...')
- session = await agent.run()
-
- finally:
- await agent.close()
-
-
-if __name__ == '__main__':
- asyncio.run(main())
diff --git a/examples/code_agent/filter_webvoyager_dataset.py b/examples/code_agent/filter_webvoyager_dataset.py
deleted file mode 100644
index d7ddddbf7..000000000
--- a/examples/code_agent/filter_webvoyager_dataset.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import asyncio
-
-from browser_use.code_use import CodeAgent
-
-
-async def main():
- task = """
-Find the WebVoyager dataset, download it and create a new version where you remove all tasks which have older dates than today.
-"""
-
- # Create code-use agent
- agent = CodeAgent(
- task=task,
- max_steps=25,
- )
-
- try:
- # Run the agent
- print('Running code-use agent to filter WebVoyager dataset...')
- session = await agent.run()
-
- finally:
- await agent.close()
-
-
-if __name__ == '__main__':
- asyncio.run(main())
diff --git a/examples/features/csv_file_generation.py b/examples/features/csv_file_generation.py
new file mode 100644
index 000000000..efecc5480
--- /dev/null
+++ b/examples/features/csv_file_generation.py
@@ -0,0 +1,51 @@
+"""
+Generate CSV files with automatic normalization.
+
+The agent's file system automatically normalizes CSV output using Python's csv module,
+so fields containing commas, quotes, or empty values are properly handled per RFC 4180.
+This means the agent doesn't need to worry about manual quoting — it's fixed at the
+infrastructure level.
+
+Common LLM mistakes that are auto-corrected:
+- Unquoted fields containing commas (e.g. "San Francisco, CA" without quotes)
+- Unescaped double quotes inside fields
+- Inconsistent empty field handling
+- Stray blank lines
+"""
+
+import asyncio
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+from browser_use import Agent, ChatBrowserUse
+
+
+async def main():
+ agent = Agent(
+ task=(
+ 'Go to https://en.wikipedia.org/wiki/List_of_largest_cities and extract the top 10 cities. '
+ 'Create a CSV file called "top_cities.csv" with columns: rank, city name, country, population. '
+ 'Make sure to include all cities even if some data is missing — leave those cells empty.'
+ ),
+ llm=ChatBrowserUse(model='bu-2-0'),
+ )
+
+ history = await agent.run()
+
+ # Check the generated CSV file
+ if agent.file_system:
+ csv_file = agent.file_system.get_file('top_cities.csv')
+ if csv_file:
+ print('\nGenerated CSV content:')
+ print(csv_file.content)
+ print(f'\nFile saved to: {agent.file_system.get_dir() / csv_file.full_name}')
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/save_as_pdf.py b/examples/features/save_as_pdf.py
new file mode 100644
index 000000000..5b453e0d9
--- /dev/null
+++ b/examples/features/save_as_pdf.py
@@ -0,0 +1,46 @@
+"""
+Save any webpage as a PDF using the save_as_pdf action.
+
+The agent can save the current page as a PDF at any point during a task.
+Supports custom filenames, paper sizes (Letter, A4, Legal, A3, Tabloid),
+landscape orientation, and background printing.
+
+Setup:
+1. Get your API key from https://cloud.browser-use.com/new-api-key
+2. Set environment variable: export BROWSER_USE_API_KEY="your-key"
+"""
+
+import asyncio
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+from browser_use import Agent, ChatBrowserUse
+
+
+async def main():
+ agent = Agent(
+ task=(
+ 'Go to https://news.ycombinator.com and save the front page as a PDF named "hackernews". '
+ 'Then go to https://en.wikipedia.org/wiki/Web_browser and save just that article as a PDF in A4 format.'
+ ),
+ llm=ChatBrowserUse(model='bu-2-0'),
+ )
+
+ history = await agent.run()
+
+ # Print paths of any PDF files the agent saved
+ print('\nSaved files:')
+ for result in history.action_results():
+ if result.attachments:
+ for path in result.attachments:
+ print(f' {path}')
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/models/vercel_ai_gateway.py b/examples/models/vercel_ai_gateway.py
index 8f2f15979..b5d2a3e8b 100644
--- a/examples/models/vercel_ai_gateway.py
+++ b/examples/models/vercel_ai_gateway.py
@@ -6,7 +6,7 @@ requests to various AI providers. This allows you to use Vercel's infrastructure
for rate limiting, caching, and monitoring.
Prerequisites:
-1. Set VERCEL_API_KEY in your environment variables
+1. Set AI_GATEWAY_API_KEY in your environment variables (or rely on VERCEL_OIDC_TOKEN on Vercel)
To see all available models, visit: https://ai-gateway.vercel.sh/v1/models
"""
@@ -20,9 +20,9 @@ from browser_use import Agent, ChatVercel
load_dotenv()
-api_key = os.getenv('VERCEL_API_KEY')
+api_key = os.getenv('AI_GATEWAY_API_KEY') or os.getenv('VERCEL_OIDC_TOKEN')
if not api_key:
- raise ValueError('VERCEL_API_KEY is not set')
+ raise ValueError('AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN is not set')
# Basic usage
llm = ChatVercel(
@@ -33,11 +33,37 @@ llm = ChatVercel(
# Example with provider options - control which providers are used and in what order
# This will try Vertex AI first, then fall back to Anthropic if Vertex fails
llm_with_provider_options = ChatVercel(
- model='anthropic/claude-sonnet-4',
+ model='anthropic/claude-sonnet-4.5',
api_key=api_key,
provider_options={
'gateway': {
- 'order': ['vertex', 'anthropic'] # Try Vertex AI first, then Anthropic
+ 'order': ['vertex', 'anthropic'], # Try Vertex AI first, then Anthropic
+ }
+ },
+)
+
+# Example with reasoning and caching enabled, plus model fallbacks
+llm_reasoning_and_fallbacks = ChatVercel(
+ model='anthropic/claude-sonnet-4.5',
+ api_key=api_key,
+ reasoning={
+ 'anthropic': {'thinking': {'type': 'enabled', 'budgetTokens': 2000}},
+ },
+ model_fallbacks=[
+ 'openai/gpt-5.2',
+ 'google/gemini-2.5-flash',
+ ],
+ caching='auto',
+ provider_options={
+ 'gateway': {
+ # Example BYOK configuration; replace with your real keys if needed
+ 'byok': {
+ 'anthropic': [
+ {
+ 'apiKey': os.getenv('ANTHROPIC_API_KEY', ''),
+ }
+ ]
+ },
}
},
)
@@ -52,10 +78,16 @@ agent_with_provider_options = Agent(
llm=llm_with_provider_options,
)
+agent_with_reasoning_and_fallbacks = Agent(
+ task='Go to example.com and summarize the main content with detailed reasoning',
+ llm=llm_reasoning_and_fallbacks,
+)
+
async def main():
await agent.run(max_steps=10)
await agent_with_provider_options.run(max_steps=10)
+ await agent_with_reasoning_and_fallbacks.run(max_steps=10)
if __name__ == '__main__':
diff --git a/pyproject.toml b/pyproject.toml
index cad007fae..356876ecf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
name = "browser-use"
description = "Make websites accessible for AI agents"
authors = [{ name = "Gregor Zunic" }]
-version = "0.11.10a1"
+version = "0.12.6"
readme = "README.md"
requires-python = ">=3.11,<4.0"
classifiers = [
@@ -11,43 +11,41 @@ classifiers = [
"Operating System :: OS Independent",
]
dependencies = [
- "aiohttp>=3.13.3",
- "anyio>=4.9.0",
- "bubus>=1.5.6",
- "click>=8.1.8",
- "InquirerPy>=0.3.4",
- "rich>=14.0.0",
- "google-api-core>=2.25.0",
- "httpx>=0.28.1",
- "portalocker>=2.7.0,<3.0.0",
- "posthog>=3.7.0",
- "psutil>=7.0.0",
- "pydantic>=2.11.5",
- "pyobjc>=11.0; platform_system == 'darwin'",
- "python-dotenv>=1.0.1",
- "requests>=2.32.3",
- "screeninfo>=0.8.1; platform_system != 'darwin'",
- "typing-extensions>=4.12.2",
- "uuid7>=0.1.0",
- "authlib>=1.6.0",
- "google-genai>=1.50.0,<2.0.0",
- "openai>=2.7.2,<3.0.0",
- "anthropic>=0.72.1,<1.0.0",
- "groq>=0.30.0",
- "ollama>=0.5.1",
- "google-api-python-client>=2.174.0",
- "google-auth>=2.40.3",
- "google-auth-oauthlib>=1.2.2",
- "mcp>=1.10.1",
- "pypdf>=5.7.0",
- "reportlab>=4.0.0",
- "cdp-use>=1.4.4",
- "pyotp>=2.9.0",
- "pillow>=11.2.1",
- "cloudpickle>=3.1.1",
- "markdownify>=1.2.0",
- "python-docx>=1.2.0",
- "browser-use-sdk>=2.0.12",
+ "aiohttp==3.13.3",
+ "anyio==4.12.1",
+ "bubus==1.5.6",
+ "click==8.3.1",
+ "InquirerPy==0.3.4",
+ "rich==14.3.1",
+ "google-api-core==2.29.0",
+ "httpx==0.28.1",
+ "posthog==7.7.0",
+ "psutil==7.2.2",
+ "pydantic==2.12.5",
+ "pyobjc==12.1; platform_system == 'darwin'",
+ "python-dotenv==1.2.1",
+ "requests==2.33.0",
+ "screeninfo==0.8.1; platform_system != 'darwin'",
+ "typing-extensions==4.15.0",
+ "uuid7==0.1.0",
+ "google-genai==1.65.0",
+ "openai==2.16.0",
+ "anthropic==0.76.0",
+ "groq==1.0.0",
+ "ollama==0.6.1",
+ "google-api-python-client==2.188.0",
+ "google-auth==2.48.0",
+ "google-auth-oauthlib==1.2.4",
+ "mcp==1.26.0",
+ "pypdf==6.9.1",
+ "reportlab==4.4.9",
+ "cdp-use==1.4.5",
+ "pyotp==2.9.0",
+ "pillow==12.1.1",
+ "cloudpickle==3.1.2",
+ "markdownify==1.2.2",
+ "python-docx==1.2.0",
+ "browser-use-sdk==3.4.2",
]
# google-api-core: only used for Google LLM APIs
# pyperclip: only used for examples that use copy/paste
@@ -60,25 +58,24 @@ dependencies = [
# textual: used for terminal UI
[project.optional-dependencies]
-cli = ["textual>=3.2.0"]
-code = ["matplotlib>=3.9.0", "numpy>=2.3.2", "pandas>=2.2.0", "tabulate>=0.9.0"]
-aws = ["boto3>=1.38.45"]
-oci = ["oci>=2.126.4"]
-video = ["imageio[ffmpeg]>=2.37.0", "numpy>=2.3.2"]
+cli = ["textual==7.4.0"]
+aws = ["boto3==1.42.37"]
+oci = ["oci==2.166.0"]
+video = ["imageio[ffmpeg]==2.37.2", "numpy==2.4.1"]
examples = [
"agentmail==0.0.59",
# botocore: only needed for Bedrock Claude boto3 examples/models/bedrock_claude.py
- "botocore>=1.37.23",
- "imgcat>=0.6.0",
+ "botocore==1.42.37",
+ "imgcat==0.6.0",
# "stagehand-py>=0.3.6",
# "browserbase>=0.4.0",
- "langchain-openai>=0.3.26",
+ "langchain-openai==1.1.7",
]
eval = [
- "lmnr[all]==0.7.17",
- "anyio>=4.9.0",
- "psutil>=7.0.0",
- "datamodel-code-generator>=0.26.0",
+ "lmnr[all]==0.7.42",
+ "anyio==4.12.1",
+ "psutil==7.2.2",
+ "datamodel-code-generator==0.53.0",
]
cli-oci = ["browser-use[cli,oci]"]
all = ["browser-use[cli,examples,aws,oci]"]
@@ -89,7 +86,12 @@ all = ["browser-use[cli,examples,aws,oci]"]
[project.urls]
+Homepage = "https://browser-use.com"
+Documentation = "https://docs.browser-use.com"
Repository = "https://github.com/browser-use/browser-use"
+Telemetry = "https://docs.browser-use.com/development/monitoring/telemetry"
+"Terms of Service" = "https://browser-use.com/legal/terms-of-service"
+"Privacy Policy" = "https://browser-use.com/privacy/"
[project.scripts]
browser-use = "browser_use.skill_cli.main:main" # Fast CLI for browser automation
@@ -167,7 +169,6 @@ include = [
"!browser_use/**/tests/*.py",
"!browser_use/**/tests.py",
"browser_use/agent/system_prompts/*.md",
- "browser_use/code_use/system_prompt.md",
"browser_use/cli_templates/*.py",
"browser_use/py.typed",
"browser_use/dom/**/*.js",
@@ -212,23 +213,23 @@ allow-direct-references = true
# # "sys_platform == 'win32' and platform_machine == 'arm64'", # no pytorch wheels available yet
# ]
dev-dependencies = [
- "ruff>=0.11.2",
- "tokencost>=0.1.16",
- "build>=1.2.2",
- "pytest>=8.3.5",
- "pytest-asyncio>=1.0.0",
- "pytest-httpserver>=1.0.8",
- "fastapi>=0.115.8",
- "inngest>=0.4.19",
- "uvicorn>=0.34.0",
- "ipdb>=0.13.13",
- "pre-commit>=4.2.0",
- "codespell>=2.4.1",
- "pyright>=1.1.403",
- "ty>=0.0.1a1",
- "pytest-xdist>=3.7.0",
- "lmnr[all]==0.7.17",
+ "ruff==0.14.14",
+ "tokencost==0.1.26",
+ "build==1.4.0",
+ "pytest==9.0.2",
+ "pytest-asyncio==1.3.0",
+ "pytest-httpserver==1.1.3",
+ "fastapi==0.128.0",
+ "inngest==0.5.15",
+ "uvicorn==0.40.0",
+ "ipdb==0.13.13",
+ "pre-commit==4.5.1",
+ "codespell==2.4.1",
+ "pyright==1.1.408",
+ "ty==0.0.14",
+ "pytest-xdist==3.8.0",
+ "lmnr[all]==0.7.42",
# "pytest-playwright-asyncio>=0.7.0", # not actually needed I think
- "pytest-timeout>=2.4.0",
- "pydantic_settings>=2.10.1",
+ "pytest-timeout==2.4.0",
+ "pydantic_settings==2.12.0",
]
diff --git a/skills/browser-use/SKILL.md b/skills/browser-use/SKILL.md
index 5203041ac..c9942bb0d 100644
--- a/skills/browser-use/SKILL.md
+++ b/skills/browser-use/SKILL.md
@@ -6,852 +6,229 @@ allowed-tools: Bash(browser-use:*)
# Browser Automation with browser-use CLI
-The `browser-use` command provides fast, persistent browser automation. It maintains browser sessions across commands, enabling complex multi-step workflows.
+The `browser-use` command provides fast, persistent browser automation. A background daemon keeps the browser open across commands, giving ~50ms latency per call.
-## Installation
+## Prerequisites
```bash
-# Run without installing (recommended for one-off use)
-uvx "browser-use[cli]" open https://example.com
-
-# Or install permanently
-uv pip install "browser-use[cli]"
-
-# Install browser dependencies (Chromium)
-browser-use install
+browser-use doctor # Verify installation
```
-## Setup
-
-**One-line install (recommended)**
-```bash
-curl -fsSL https://browser-use.com/cli/install.sh | bash
-```
-
-This interactive installer lets you choose your installation mode and configures everything automatically.
-
-**Installation modes:**
-```bash
-curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only # Cloud browser only
-curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --local-only # Local browser only
-curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full # All modes
-```
-
-| Install Mode | Available Browsers | Default | Use Case |
-|--------------|-------------------|---------|----------|
-| `--remote-only` | remote | remote | Sandboxed agents, CI, no GUI |
-| `--local-only` | chromium, real | chromium | Local development |
-| `--full` | chromium, real, remote | chromium | Full flexibility |
-
-When only one mode is installed, it becomes the default and no `--browser` flag is needed.
-
-**Pass API key during install:**
-```bash
-curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only --api-key bu_xxx
-```
-
-**Verify installation:**
-```bash
-browser-use doctor
-```
-
-**Setup wizard (first-time configuration):**
-```bash
-browser-use setup # Interactive setup
-browser-use setup --mode local # Configure for local browser only
-browser-use setup --mode remote # Configure for cloud browser only
-browser-use setup --mode full # Configure all modes
-browser-use setup --api-key bu_xxx # Set API key during setup
-browser-use setup --yes # Skip interactive prompts
-```
-
-**Generate template files:**
-```bash
-browser-use init # Interactive template selection
-browser-use init --list # List available templates
-browser-use init --template basic # Generate specific template
-browser-use init --output my_script.py # Specify output file
-browser-use init --force # Overwrite existing files
-```
-
-**Manual cloudflared install (for tunneling):**
-```bash
-# macOS:
-brew install cloudflared
-
-# Linux:
-curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -o ~/.local/bin/cloudflared && chmod +x ~/.local/bin/cloudflared
-
-# Windows:
-winget install Cloudflare.cloudflared
-```
-
-## Quick Start
-
-```bash
-browser-use open https://example.com # Navigate to URL
-browser-use state # Get page elements with indices
-browser-use click 5 # Click element by index
-browser-use type "Hello World" # Type text
-browser-use screenshot # Take screenshot
-browser-use close # Close browser
-```
+For setup details, see https://github.com/browser-use/browser-use/blob/main/browser_use/skill_cli/README.md
## Core Workflow
-1. **Navigate**: `browser-use open ` - Opens URL (starts browser if needed)
-2. **Inspect**: `browser-use state` - Returns clickable elements with indices
-3. **Interact**: Use indices from state to interact (`browser-use click 5`, `browser-use input 3 "text"`)
-4. **Verify**: `browser-use state` or `browser-use screenshot` to confirm actions
-5. **Repeat**: Browser stays open between commands
+1. **Navigate**: `browser-use open <url>` — launches headless browser and opens page
+2. **Inspect**: `browser-use state` — returns clickable elements with indices
+3. **Interact**: use indices from state (`browser-use click 5`, `browser-use input 3 "text"`)
+4. **Verify**: `browser-use state` or `browser-use screenshot` to confirm
+5. **Repeat**: browser stays open between commands
+
+If a command fails, run `browser-use close` first to clear any broken session, then retry.
+
+To use the user's existing Chrome (preserves logins/cookies): run `browser-use connect` first.
+To use a cloud browser instead: run `browser-use cloud connect` first.
+After either, commands work the same way.
+
+### If `browser-use connect` fails
+
+When `browser-use connect` cannot find a running Chrome with remote debugging, prompt the user with two options:
+
+1. **Use their real Chrome browser** — they need to enable remote debugging first:
+ - Open `chrome://inspect/#remote-debugging` in Chrome, or relaunch Chrome with `--remote-debugging-port=9222`
+ - Then retry `browser-use connect`
+2. **Use managed Chromium with their Chrome profile** — no Chrome setup needed:
+ - Run `browser-use profile list` to show available profiles
+ - Ask which profile they want, then use `browser-use --profile "ProfileName" open <url>`
+ - This launches a separate Chromium instance with their profile data (cookies, logins, extensions)
+
+Let the user choose — don't assume one path over the other.
## Browser Modes
```bash
-browser-use --browser chromium open # Default: headless Chromium
-browser-use --browser chromium --headed open # Visible Chromium window
-browser-use --browser real open # User's Chrome with login sessions
-browser-use --browser remote open # Cloud browser (requires API key)
+browser-use open <url> # Default: headless Chromium (no setup needed)
+browser-use --headed open <url> # Visible window (for debugging)
+browser-use connect # Connect to user's Chrome (preserves logins/cookies)
+browser-use cloud connect # Cloud browser (zero-config, requires API key)
+browser-use --profile "Default" open <url> # Real Chrome with specific profile
```
-- **chromium**: Fast, isolated, headless by default
-- **real**: Uses your Chrome with cookies, extensions, logged-in sessions
-- **remote**: Cloud-hosted browser with proxy support (requires BROWSER_USE_API_KEY)
+After `connect` or `cloud connect`, all subsequent commands go to that browser — no extra flags needed.
## Commands
-### Navigation
```bash
+# Navigation
browser-use open # Navigate to URL
browser-use back # Go back in history
-browser-use scroll down # Scroll down
+browser-use scroll down # Scroll down (--amount N for pixels)
browser-use scroll up # Scroll up
-browser-use scroll down --amount 1000 # Scroll by specific pixels (default: 500)
-```
+browser-use tab list # List all tabs
+browser-use tab new [url] # Open a new tab (blank or with URL)
+browser-use tab switch <index> # Switch to tab by index
+browser-use tab close [index...] # Close one or more tabs
-### Page State
-```bash
-browser-use state # Get URL, title, and clickable elements
-browser-use screenshot # Take screenshot (outputs base64)
-browser-use screenshot path.png # Save screenshot to file
-browser-use screenshot --full path.png # Full page screenshot
-```
+# Page State — always run state first to get element indices
+browser-use state # URL, title, clickable elements with indices
+browser-use screenshot [path.png] # Screenshot (base64 if no path, --full for full page)
-### Interactions (use indices from `browser-use state`)
-```bash
-browser-use click # Click element
-browser-use type "text" # Type text into focused element
-browser-use input "text" # Click element, then type text
-browser-use keys "Enter" # Send keyboard keys
-browser-use keys "Control+a" # Send key combination
+# Interactions — use indices from state
+browser-use click <index> # Click element by index
+browser-use click <x> <y> # Click at pixel coordinates
+browser-use type "text" # Type into focused element
+browser-use input <index> "text" # Click element, clear existing text, then type
+browser-use input <index> "" # Clear a field without typing new text
+browser-use keys "Enter" # Send keyboard keys (also "Control+a", etc.)
browser-use select "option" # Select dropdown option
-```
-
-### Tab Management
-```bash
-browser-use switch # Switch to tab by index
-browser-use close-tab # Close current tab
-browser-use close-tab # Close specific tab
-```
-
-### JavaScript & Data
-```bash
-browser-use eval "document.title" # Execute JavaScript, return result
-browser-use extract "all product prices" # Extract data using LLM (requires API key)
-```
-
-### Cookies
-```bash
-browser-use cookies get # Get all cookies
-browser-use cookies get --url # Get cookies for specific URL
-browser-use cookies set # Set a cookie
-browser-use cookies set name val --domain .example.com --secure --http-only
-browser-use cookies set name val --same-site Strict # SameSite: Strict, Lax, or None
-browser-use cookies set name val --expires 1735689600 # Expiration timestamp
-browser-use cookies clear # Clear all cookies
-browser-use cookies clear --url # Clear cookies for specific URL
-browser-use cookies export # Export all cookies to JSON file
-browser-use cookies export --url # Export cookies for specific URL
-browser-use cookies import # Import cookies from JSON file
-```
-
-### Wait Conditions
-```bash
-browser-use wait selector "h1" # Wait for element to be visible
-browser-use wait selector ".loading" --state hidden # Wait for element to disappear
-browser-use wait selector "#btn" --state attached # Wait for element in DOM
-browser-use wait text "Success" # Wait for text to appear
-browser-use wait selector "h1" --timeout 5000 # Custom timeout in ms
-```
-
-### Additional Interactions
-```bash
-browser-use hover # Hover over element (triggers CSS :hover)
+browser-use upload <index> <path> # Upload file to file input
+browser-use hover <index> # Hover over element
browser-use dblclick # Double-click element
-browser-use rightclick # Right-click element (context menu)
-```
-
-### Information Retrieval
-```bash
-browser-use get title # Get page title
-browser-use get html # Get full page HTML
-browser-use get html --selector "h1" # Get HTML of specific element
-browser-use get text # Get text content of element
-browser-use get value # Get value of input/textarea
-browser-use get attributes # Get all attributes of element
-browser-use get bbox # Get bounding box (x, y, width, height)
-```
-
-### Python Execution (Persistent Session)
-```bash
-browser-use python "x = 42" # Set variable
-browser-use python "print(x)" # Access variable (outputs: 42)
-browser-use python "print(browser.url)" # Access browser object
-browser-use python --vars # Show defined variables
-browser-use python --reset # Clear Python namespace
-browser-use python --file script.py # Execute Python file
-```
-
-The Python session maintains state across commands. The `browser` object provides:
-- `browser.url` - Current page URL
-- `browser.title` - Page title
-- `browser.html` - Get page HTML
-- `browser.goto(url)` - Navigate
-- `browser.click(index)` - Click element
-- `browser.type(text)` - Type text
-- `browser.input(index, text)` - Click element, then type
-- `browser.keys(keys)` - Send keyboard keys (e.g., "Enter", "Control+a")
-- `browser.screenshot(path)` - Take screenshot
-- `browser.scroll(direction, amount)` - Scroll page
-- `browser.back()` - Go back in history
-- `browser.wait(seconds)` - Sleep/pause execution
-- `browser.extract(query)` - Extract data using LLM
-
-### Agent Tasks (Requires API Key)
-```bash
-browser-use run "Fill the contact form with test data" # Run AI agent
-browser-use run "Extract all product prices" --max-steps 50
-```
-
-Agent tasks use an LLM to autonomously complete complex browser tasks. Requires `BROWSER_USE_API_KEY` or configured LLM API key (OPENAI_API_KEY, ANTHROPIC_API_KEY, etc).
-
-#### Remote Mode Agent Options
-
-When using `--browser remote`, additional options are available:
-
-```bash
-# Basic remote task (uses US proxy by default)
-browser-use -b remote run "Search for AI news"
-
-# Specify LLM model
-browser-use -b remote run "task" --llm gpt-4o
-browser-use -b remote run "task" --llm claude-sonnet-4-20250514
-browser-use -b remote run "task" --llm gemini-2.0-flash
-
-# Proxy configuration (default: us)
-browser-use -b remote run "task" --proxy-country gb # UK proxy
-browser-use -b remote run "task" --proxy-country de # Germany proxy
-
-# Session reuse (run multiple tasks in same browser session)
-browser-use -b remote run "task 1" --keep-alive
-# Returns: session_id: abc-123
-browser-use -b remote run "task 2" --session-id abc-123
-
-# Execution modes
-browser-use -b remote run "task" --no-wait # Async, returns task_id immediately
-browser-use -b remote run "task" --stream # Stream status updates
-browser-use -b remote run "task" --flash # Fast execution mode
-
-# Advanced options
-browser-use -b remote run "task" --thinking # Extended reasoning mode
-browser-use -b remote run "task" --vision # Enable vision (default)
-browser-use -b remote run "task" --no-vision # Disable vision
-browser-use -b remote run "task" --wait # Wait for completion (default: async)
-
-# Use cloud profile (preserves cookies across sessions)
-browser-use -b remote run "task" --profile
-
-# Task configuration
-browser-use -b remote run "task" --start-url https://example.com # Start from specific URL
-browser-use -b remote run "task" --allowed-domain example.com # Restrict navigation (repeatable)
-browser-use -b remote run "task" --metadata key=value # Task metadata (repeatable)
-browser-use -b remote run "task" --secret API_KEY=xxx # Task secrets (repeatable)
-browser-use -b remote run "task" --skill-id skill-123 # Enable skills (repeatable)
-
-# Structured output and evaluation
-browser-use -b remote run "task" --structured-output '{"type":"object"}' # JSON schema for output
-browser-use -b remote run "task" --judge # Enable judge mode
-browser-use -b remote run "task" --judge-ground-truth "expected answer" # Expected answer for judge
-```
-
-### Task Management (Remote Mode)
-
-Manage cloud tasks when using remote mode:
-
-```bash
-browser-use task list # List recent tasks
-browser-use task list --limit 20 # Show more tasks
-browser-use task list --status running # Filter by status
-browser-use task list --session # Filter by session ID
-browser-use task list --json # JSON output
-
-browser-use task status # Get task status (token efficient)
-browser-use task status -c # Show all steps with reasoning
-browser-use task status -v # Show all steps with URLs + actions
-browser-use task status --last 5 # Show only last 5 steps
-browser-use task status --step 3 # Show specific step number
-browser-use task status --reverse # Show steps newest first
-
-browser-use task stop # Stop a running task
-browser-use task logs # Get task execution logs
-```
-
-**Token-efficient monitoring:** Default `task status` shows only the latest step. Use `-c` (compact) or `-v` (verbose) only when you need more context.
-
-### Cloud Session Management (Remote Mode)
-
-Manage cloud browser sessions:
-
-```bash
-browser-use session list # List cloud sessions
-browser-use session list --limit 20 # Show more sessions
-browser-use session list --status active # Filter by status
-browser-use session list --json # JSON output
-
-browser-use session get # Get session details
-browser-use session get --json
-
-browser-use session stop # Stop a session
-browser-use session stop --all # Stop all active sessions
-
-# Create a new cloud session manually
-browser-use session create # Create with defaults
-browser-use session create --profile # With cloud profile
-browser-use session create --proxy-country gb # With geographic proxy
-browser-use session create --start-url https://example.com # Start at URL
-browser-use session create --screen-size 1920x1080 # Custom screen size
-browser-use session create --keep-alive # Keep session alive
-browser-use session create --persist-memory # Persist memory between tasks
-
-# Share session publicly (for collaboration/debugging)
-browser-use session share # Create public share URL
-browser-use session share --delete # Delete public share
-```
-
-## Exposing Local Dev Servers
-
-If you're running a dev server locally and need a cloud browser to reach it, use Cloudflare tunnels:
-
-```bash
-# Start your dev server
-npm run dev & # localhost:3000
-
-# Expose it via Cloudflare tunnel
-browser-use tunnel 3000
-# → url: https://abc.trycloudflare.com
-
-# Now the cloud browser can reach your local server
-browser-use --browser remote open https://abc.trycloudflare.com
-```
-
-**Tunnel commands:**
-```bash
-browser-use tunnel # Start tunnel (returns URL)
-browser-use tunnel # Idempotent - returns existing URL
-browser-use tunnel list # Show active tunnels
-browser-use tunnel stop # Stop tunnel
-browser-use tunnel stop --all # Stop all tunnels
-```
-
-**Note:** Tunnels are independent of browser sessions. They persist across `browser-use close` and can be managed separately.
-
-Cloudflared is installed by `install.sh`. If missing, install manually (see Setup section).
-
-## Running Subagents (Remote Mode)
-
-Cloud sessions and tasks provide a powerful model for running **subagents** - autonomous browser agents that execute tasks in parallel.
-
-### Key Concepts
-
-- **Session = Agent**: Each cloud session is a browser agent with its own state (cookies, tabs, history)
-- **Task = Work**: Tasks are jobs given to an agent. An agent can run multiple tasks sequentially
-- **Parallel agents**: Run multiple sessions simultaneously for parallel work
-- **Session reuse**: While a session is alive, you can assign it more tasks
-- **Session lifecycle**: Once stopped, a session cannot be revived - start a new one
-
-### Basic Subagent Workflow
-
-```bash
-# 1. Start a subagent task (creates new session automatically)
-browser-use -b remote run "Search for AI news and summarize top 3 articles" --no-wait
-# Returns: task_id: task-abc, session_id: sess-123
-
-# 2. Check task progress
-browser-use task status task-abc
-# Shows: Status: running, or finished with output
-
-# 3. View execution logs
-browser-use task logs task-abc
-```
-
-### Running Parallel Subagents
-
-Launch multiple agents to work simultaneously:
-
-```bash
-# Start 3 parallel research agents
-browser-use -b remote run "Research competitor A pricing" --no-wait
-# → task_id: task-1, session_id: sess-a
-
-browser-use -b remote run "Research competitor B pricing" --no-wait
-# → task_id: task-2, session_id: sess-b
-
-browser-use -b remote run "Research competitor C pricing" --no-wait
-# → task_id: task-3, session_id: sess-c
-
-# Monitor all running tasks
-browser-use task list --status running
-# Shows all 3 tasks with their status
-
-# Check individual task results as they complete
-browser-use task status task-1
-browser-use task status task-2
-browser-use task status task-3
-```
-
-### Reusing an Agent for Multiple Tasks
-
-Keep a session alive to run sequential tasks in the same browser context:
-
-```bash
-# Start first task, keep session alive
-browser-use -b remote run "Log into example.com" --keep-alive --no-wait
-# → task_id: task-1, session_id: sess-123
-
-# Wait for login to complete...
-browser-use task status task-1
-# → Status: finished
-
-# Give the same agent another task (reuses login session)
-browser-use -b remote run "Navigate to settings and export data" --session-id sess-123 --no-wait
-# → task_id: task-2, session_id: sess-123 (same session!)
-
-# Agent retains cookies, login state, etc. from previous task
-```
-
-### Managing Active Agents
-
-```bash
-# List all active agents (sessions)
-browser-use session list --status active
-# Shows: sess-123 [active], sess-456 [active], ...
-
-# Get details on a specific agent
-browser-use session get sess-123
-# Shows: status, started time, live URL for viewing
-
-# Stop a specific agent
-browser-use session stop sess-123
-
-# Stop all agents at once
-browser-use session stop --all
-```
-
-### Stopping Tasks vs Sessions
-
-```bash
-# Stop a running task (session may continue if --keep-alive was used)
-browser-use task stop task-abc
-
-# Stop an entire agent/session (terminates all its tasks)
-browser-use session stop sess-123
-```
-
-### Custom Agent Configuration
-
-```bash
-# Default: US proxy, auto LLM selection
-browser-use -b remote run "task" --no-wait
-
-# Explicit configuration
-browser-use -b remote run "task" \
- --llm gpt-4o \
- --proxy-country gb \
- --keep-alive \
- --no-wait
-
-# With cloud profile (preserves cookies across sessions)
-browser-use -b remote run "task" --profile --no-wait
-```
-
-### Monitoring Subagents
-
-**Task status is designed for token efficiency.** Default output is minimal - only expand when needed:
-
-| Mode | Flag | Tokens | Use When |
-|------|------|--------|----------|
-| Default | (none) | Low | Polling progress |
-| Compact | `-c` | Medium | Need full reasoning |
-| Verbose | `-v` | High | Debugging actions |
-
-**Recommended workflow:**
-
-```bash
-# 1. Launch task
-browser-use -b remote run "task" --no-wait
-# → task_id: abc-123
-
-# 2. Poll with default (token efficient) - only latest step
-browser-use task status abc-123
-# ✅ abc-123... [finished] $0.009 15s
-# ... 1 earlier steps
-# 2. I found the information and extracted...
-
-# 3. ONLY IF task failed or need context: use --compact
-browser-use task status abc-123 -c
-
-# 4. ONLY IF debugging specific actions: use --verbose
-browser-use task status abc-123 -v
-```
-
-**For long tasks (50+ steps):**
-```bash
-browser-use task status -c --last 5 # Last 5 steps only
-browser-use task status -c --reverse # Newest first
-browser-use task status -v --step 10 # Inspect specific step
-```
-
-**Live view**: Watch an agent work in real-time:
-```bash
-browser-use session get
-# → Live URL: https://live.browser-use.com?wss=...
-# Open this URL in your browser to watch the agent
-```
-
-**Detect stuck tasks**: If cost/duration stops increasing, the task may be stuck:
-```bash
-browser-use task status
-# 🔄 abc-123... [started] $0.009 45s ← if cost doesn't change, task is stuck
-```
-
-**Logs**: Only available after task completes:
-```bash
-browser-use task logs # Works after task finishes
-```
-
-### Cleanup
-
-Always clean up sessions after parallel work:
-```bash
-# Stop all active agents
-browser-use session stop --all
-
-# Or stop specific sessions
-browser-use session stop
-```
-
-### Troubleshooting Subagents
-
-**Session reuse fails after `task stop`**:
-If you stop a task and try to reuse its session, the new task may get stuck at "created" status. Solution: create a new agent instead.
-```bash
-# This may fail:
-browser-use task stop
-browser-use -b remote run "new task" --session-id # Might get stuck
-
-# Do this instead:
-browser-use -b remote run "new task" --profile # Fresh session
-```
-
-**Task stuck at "started"**:
-- Check cost with `task status` - if not increasing, task is stuck
-- View live URL with `session get` to see what's happening
-- Stop the task and create a new agent
-
-**Sessions persist after tasks complete**:
-Tasks finishing doesn't auto-stop sessions. Clean up manually:
-```bash
-browser-use session list --status active # See lingering sessions
-browser-use session stop --all # Clean up
-```
-
-### Session Management
-```bash
+browser-use rightclick <index> # Right-click element
+
+# Data Extraction
+browser-use eval "js code" # Execute JavaScript, return result
+browser-use get title # Page title
+browser-use get html [--selector "h1"] # Page HTML (or scoped to selector)
+browser-use get text <index> # Element text content
+browser-use get value <index> # Input/textarea value
+browser-use get attributes <index> # Element attributes
+browser-use get bbox