Merge remote-tracking branch 'origin/main' into fix/handle-lmnr-type-error-on-import-4046

This commit is contained in:
Laith Weinberger
2026-04-15 18:33:31 -04:00
246 changed files with 15527 additions and 18254 deletions

View File

@@ -16,6 +16,8 @@ on:
description: Commit hash of the library to build the Cloud eval image for
required: false
permissions: {}
jobs:
trigger_cloud_eval_image_build:
runs-on: ubuntu-latest

View File

@@ -13,6 +13,9 @@ on:
- '.github/workflows/install-script.yml'
workflow_dispatch:
permissions:
contents: read
# Cancel in-progress runs when a new commit is pushed
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -26,16 +29,15 @@ env:
jobs:
# ===========================================================================
# Test install.sh with different modes on all platforms
# Test install.sh on all platforms
# ===========================================================================
test-install-sh-linux:
name: install.sh ${{ matrix.mode }} (Linux ${{ matrix.os }})
name: install.sh (Linux ${{ matrix.os }})
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, ubuntu-22.04]
mode: [--remote-only, --local-only, --full]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
@@ -45,8 +47,8 @@ jobs:
with:
python-version: '3.11'
- name: Run install.sh ${{ matrix.mode }}
run: bash browser_use/skill_cli/install.sh ${{ matrix.mode }}
- name: Run install.sh
run: bash browser_use/skill_cli/install.sh
- name: Add to PATH
run: |
@@ -58,65 +60,31 @@ jobs:
source ~/.browser-use-env/bin/activate
browser-use --help
- name: Verify install-config.json
run: |
cat ~/.browser-use/install-config.json
# Verify expected modes based on install flag
if [[ "${{ matrix.mode }}" == "--remote-only" ]]; then
grep -q '"remote"' ~/.browser-use/install-config.json
grep -q '"default_mode": "remote"' ~/.browser-use/install-config.json
elif [[ "${{ matrix.mode }}" == "--local-only" ]]; then
grep -q '"chromium"' ~/.browser-use/install-config.json
grep -q '"default_mode": "chromium"' ~/.browser-use/install-config.json
elif [[ "${{ matrix.mode }}" == "--full" ]]; then
grep -q '"chromium"' ~/.browser-use/install-config.json
grep -q '"remote"' ~/.browser-use/install-config.json
fi
- name: Verify Chromium installed (local/full only)
if: matrix.mode != '--remote-only'
- name: Verify Chromium installed
run: |
source ~/.browser-use-env/bin/activate
# Check playwright browsers are installed
uvx playwright install --dry-run chromium 2>&1 | grep -i "chromium" || true
# Verify chromium binary exists in playwright cache
ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chrome 2>/dev/null || \
ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \
echo "Chromium binary check completed"
- name: Verify cloudflared installed (remote/full only)
if: matrix.mode != '--local-only'
run: |
which cloudflared || ls ~/.local/bin/cloudflared
cloudflared --version
- name: Verify cloudflared NOT installed (local-only)
if: matrix.mode == '--local-only'
run: |
if command -v cloudflared &> /dev/null; then
echo "ERROR: cloudflared should not be installed in local-only mode"
exit 1
fi
echo "Confirmed: cloudflared not installed (expected for local-only)"
- name: Run browser-use doctor
run: |
source ~/.browser-use-env/bin/activate
browser-use doctor
test-install-sh-macos:
name: install.sh ${{ matrix.mode }} (macOS ${{ matrix.os }})
name: install.sh (macOS ${{ matrix.os }})
strategy:
fail-fast: false
matrix:
os: [macos-latest, macos-14]
mode: [--remote-only, --local-only, --full]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: Run install.sh ${{ matrix.mode }}
run: bash browser_use/skill_cli/install.sh ${{ matrix.mode }}
- name: Run install.sh
run: bash browser_use/skill_cli/install.sh
- name: Add to PATH
run: |
@@ -128,22 +96,7 @@ jobs:
source ~/.browser-use-env/bin/activate
browser-use --help
- name: Verify install-config.json
run: |
cat ~/.browser-use/install-config.json
if [[ "${{ matrix.mode }}" == "--remote-only" ]]; then
grep -q '"remote"' ~/.browser-use/install-config.json
grep -q '"default_mode": "remote"' ~/.browser-use/install-config.json
elif [[ "${{ matrix.mode }}" == "--local-only" ]]; then
grep -q '"chromium"' ~/.browser-use/install-config.json
grep -q '"default_mode": "chromium"' ~/.browser-use/install-config.json
elif [[ "${{ matrix.mode }}" == "--full" ]]; then
grep -q '"chromium"' ~/.browser-use/install-config.json
grep -q '"remote"' ~/.browser-use/install-config.json
fi
- name: Verify Chromium installed (local/full only)
if: matrix.mode != '--remote-only'
- name: Verify Chromium installed
run: |
source ~/.browser-use-env/bin/activate
# Check playwright cache for chromium
@@ -151,32 +104,13 @@ jobs:
ls ~/Library/Caches/ms-playwright/chromium-*/Chromium.app 2>/dev/null || \
echo "Chromium binary check completed"
- name: Verify cloudflared installed (remote/full only)
if: matrix.mode != '--local-only'
run: |
which cloudflared || ls ~/.local/bin/cloudflared
cloudflared --version
- name: Verify cloudflared NOT installed (local-only)
if: matrix.mode == '--local-only'
run: |
if command -v cloudflared &> /dev/null; then
echo "ERROR: cloudflared should not be installed in local-only mode"
exit 1
fi
echo "Confirmed: cloudflared not installed (expected for local-only)"
- name: Run browser-use doctor
run: |
source ~/.browser-use-env/bin/activate
browser-use doctor
test-install-sh-windows:
name: install.sh ${{ matrix.mode }} (Windows)
strategy:
fail-fast: false
matrix:
mode: [--remote-only, --local-only, --full]
name: install.sh (Windows)
runs-on: windows-latest
defaults:
run:
@@ -192,8 +126,8 @@ jobs:
with:
python-version: '3.11'
- name: Run install.sh ${{ matrix.mode }}
run: bash browser_use/skill_cli/install.sh ${{ matrix.mode }}
- name: Run install.sh
run: bash browser_use/skill_cli/install.sh
- name: Add to PATH
run: |
@@ -205,18 +139,6 @@ jobs:
source ~/.browser-use-env/Scripts/activate
browser-use --help
- name: Verify install-config.json
run: |
cat ~/.browser-use/install-config.json
if [[ "${{ matrix.mode }}" == "--remote-only" ]]; then
grep -q '"remote"' ~/.browser-use/install-config.json
elif [[ "${{ matrix.mode }}" == "--local-only" ]]; then
grep -q '"chromium"' ~/.browser-use/install-config.json
elif [[ "${{ matrix.mode }}" == "--full" ]]; then
grep -q '"chromium"' ~/.browser-use/install-config.json
grep -q '"remote"' ~/.browser-use/install-config.json
fi
- name: Run browser-use doctor
run: |
source ~/.browser-use-env/Scripts/activate
@@ -245,7 +167,7 @@ jobs:
# Install from current branch
uv pip install .
- name: Run browser-use install (installs Chromium only, not cloudflared)
- name: Run browser-use install (installs Chromium)
run: |
source .venv/bin/activate
browser-use install
@@ -262,9 +184,6 @@ jobs:
ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \
echo "Chromium check completed"
# Note: browser-use install only installs Chromium, not cloudflared
# Users should install cloudflared separately if needed for tunneling
- name: Run browser-use doctor
run: |
source .venv/bin/activate
@@ -295,7 +214,6 @@ jobs:
- name: Test uvx with local wheel
run: |
# Install the wheel we just built
WHEEL=$(ls dist/*.whl)
uvx --from "$WHEEL" browser-use --help
@@ -310,8 +228,6 @@ jobs:
ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \
echo "Chromium check completed"
# Note: browser-use install only installs Chromium, not cloudflared
- name: Test uvx browser-use doctor
run: |
WHEEL=$(ls dist/*.whl)
@@ -345,7 +261,5 @@ jobs:
ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \
echo "Chromium check completed"
# Note: browser-use install only installs Chromium, not cloudflared
- name: Test uvx browser-use doctor
run: uvx "browser-use[cli]" doctor

View File

@@ -16,6 +16,9 @@ on:
pull_request:
workflow_dispatch:
permissions:
contents: read
jobs:
lint-syntax:
name: syntax-errors
@@ -35,7 +38,8 @@ jobs:
- uses: astral-sh/setup-uv@v5
with:
enable-cache: true
- run: uv sync --dev --all-extras # install extras for examples to avoid pyright missing imports errors
- run: uv python install 3.11
- run: uv sync --dev --all-extras --python 3.11
- run: uv run --no-sync pre-commit run --all-files --show-diff-on-failure
lint-typecheck:

View File

@@ -15,6 +15,9 @@ on:
- '*'
workflow_dispatch:
permissions:
contents: read
jobs:
build:
name: pip-build

View File

@@ -12,7 +12,7 @@ jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v9
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9
with:
# General settings
repo-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -1,3 +1,6 @@
default_language_version:
python: python3.11
repos:
- repo: https://github.com/asottile/yesqa
rev: v1.5.0

View File

@@ -36,7 +36,7 @@ uv sync
To get started with Browser Use you need to install the package and create an `.env` file with your API key.
<Note icon="key" color="#FFC107" iconType="regular">
`ChatBrowserUse` offers the [fastest and most cost-effective models](https://browser-use.com/posts/speed-matters/), completing tasks 3-5x faster. Get started with \$10 of [free LLM credits](https://cloud.browser-use.com/new-api-key).
`ChatBrowserUse` offers the [fastest and most cost-effective models](https://browser-use.com/posts/speed-matters/), completing tasks 3-5x faster. Get your API key at [cloud.browser-use.com](https://cloud.browser-use.com/new-api-key).
</Note>
## 1. Installing Browser-Use
@@ -61,7 +61,7 @@ uvx browser-use install
Create a `.env` file and add your API key.
<Callout icon="key" iconType="regular">
We recommend using ChatBrowserUse which is optimized for browser automation tasks (highest accuracy + fastest speed + lowest token cost). Don't have one? We give you **\$10** to try it out [here](https://cloud.browser-use.com/new-api-key).
We recommend using ChatBrowserUse which is optimized for browser automation tasks (highest accuracy + fastest speed + lowest token cost). Get your API key [here](https://cloud.browser-use.com/new-api-key).
</Callout>
```bash .env theme={null}
@@ -76,7 +76,7 @@ Then add your API key to the file.
```bash Browser Use theme={null}
# add your key to .env file
BROWSER_USE_API_KEY=
# Get 10$ of free credits at https://cloud.browser-use.com/new-api-key
# Get your API key at https://cloud.browser-use.com/new-api-key
```
```bash Google theme={null}
@@ -256,7 +256,7 @@ Your cloud browser is already logged in!
***
For more sandbox parameters and events, see [Sandbox Quickstart](https://docs.browser-use.com/customize/sandbox/quickstart).
For more sandbox parameters and events, see [Sandbox Quickstart](https://docs.browser-use.com/legacy/sandbox/quickstart).
# Agent Basics
```python theme={null}
@@ -538,7 +538,7 @@ async def main():
> Complete reference for all browser configuration options
<Note>
The `Browser` instance also provides all [Actor](https://docs.browser-use.com/customize/actor/all-parameters) methods for direct browser control (page management, element interactions, etc.).
The `Browser` instance also provides all [Actor](https://docs.browser-use.com/legacy/actor/all-parameters) methods for direct browser control (page management, element interactions, etc.).
</Note>
## Core Settings
@@ -798,7 +798,7 @@ The agent injects parameters by name matching, so using the wrong name will caus
</Warning>
<Note>
Use `browser_session` parameter in tools for deterministic [Actor](https://docs.browser-use.com/customize/actor/basics) actions.
Use `browser_session` parameter in tools for deterministic [Actor](https://docs.browser-use.com/legacy/actor/basics) actions.
</Note>

View File

@@ -19,7 +19,7 @@ The key product of Browser Use Cloud is the completion of user tasks.
- Profile Sync is the best way to handle authentication for tasks. This feature allows users to upload their local browser cookies (where the user is already logged into the services they need authentication for) to a Browser Profile that can be used for tasks on the cloud. To initiate a Profile Sync, a user must run `export BROWSER_USE_API_KEY=<your_key> && curl -fsSL https://browser-use.com/profile.sh | sh` and follow the steps in the interactive terminal.
## Quickstart
To get started, direct the user to first must create an account, purchase credits (or simply claim the free starter credits given on account creation), and generate an API key on the Browser Use online platform: https://cloud.browser-use.com/. These are the only steps that can only be done on the platform.
To get started, direct the user to first create an account, purchase credits (or simply claim the five free tasks given on account creation), and generate an API key on the Browser Use online platform: https://cloud.browser-use.com/. These are the only steps that must be done on the platform.
Avoid giving the user all of the following steps at once as it may seem overwhelming. Instead present one step at a time and only continue when asked. Do as much for the user as you are able to.

156
README.md
View File

@@ -13,7 +13,7 @@
</div>
<div align="center">
<a href="https://cloud.browser-use.com"><img src="https://media.browser-use.tools/badges/package" height="48" alt="Browser-Use Package Download Statistics"></a>
<a href="https://cloud.browser-use.com?utm_source=github&utm_medium=readme-badge-downloads"><img src="https://media.browser-use.tools/badges/package" height="48" alt="Browser-Use Package Download Statistics"></a>
</div>
---
@@ -33,12 +33,12 @@
<img width="4" height="1" alt="">
<a href="https://link.browser-use.com/discord"><img src="https://media.browser-use.tools/badges/discord" alt="Discord"></a>
<img width="4" height="1" alt="">
<a href="https://cloud.browser-use.com"><img src="https://media.browser-use.tools/badges/cloud" height="48" alt="Browser-Use Cloud"></a>
<a href="https://cloud.browser-use.com?utm_source=github&utm_medium=readme-badge-cloud"><img src="https://media.browser-use.tools/badges/cloud" height="48" alt="Browser-Use Cloud"></a>
</div>
</br>
🌤️ Want to skip the setup? Use our <b>[cloud](https://cloud.browser-use.com)</b> for faster, scalable, stealth-enabled browser automation!
🌤️ Want to skip the setup? Use our <b>[cloud](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-skip-setup)</b> for faster, scalable, stealth-enabled browser automation!
# 🤖 LLM Quickstart
@@ -49,77 +49,99 @@
# 👋 Human Quickstart
**1. Create environment with [uv](https://docs.astral.sh/uv/) (Python>=3.11):**
**1. Create environment and install Browser-Use with [uv](https://docs.astral.sh/uv/) (Python>=3.11):**
```bash
uv init
uv init && uv add browser-use && uv sync
# uvx browser-use install # Run if you don't have Chromium installed
```
**2. Install Browser-Use package:**
```bash
# We ship every day - use the latest version!
uv add browser-use
uv sync
```
**3. Get your API key from [Browser Use Cloud](https://cloud.browser-use.com/new-api-key) and add it to your `.env` file (new signups get $10 free credits):**
**2. [Optional] Get your API key from [Browser Use Cloud](https://cloud.browser-use.com/new-api-key?utm_source=github&utm_medium=readme-quickstart-api-key):**
```
# .env
BROWSER_USE_API_KEY=your-key
# GOOGLE_API_KEY=your-key
# ANTHROPIC_API_KEY=your-key
```
**4. Install Chromium browser:**
```bash
uvx browser-use install
```
**5. Run your first agent:**
**3. Run your first agent:**
```python
from browser_use import Agent, Browser, ChatBrowserUse
# from browser_use import ChatGoogle # ChatGoogle(model='gemini-3-flash-preview')
# from browser_use import ChatAnthropic # ChatAnthropic(model='claude-sonnet-4-6')
import asyncio
async def example():
async def main():
browser = Browser(
# use_cloud=True, # Uncomment to use a stealth browser on Browser Use Cloud
# use_cloud=True, # Use a stealth browser on Browser Use Cloud
)
llm = ChatBrowserUse()
agent = Agent(
task="Find the number of stars of the browser-use repo",
llm=llm,
llm=ChatBrowserUse(),
# llm=ChatGoogle(model='gemini-3-flash-preview'),
# llm=ChatAnthropic(model='claude-sonnet-4-6'),
browser=browser,
)
history = await agent.run()
return history
await agent.run()
if __name__ == "__main__":
history = asyncio.run(example())
asyncio.run(main())
```
Check out the [library docs](https://docs.browser-use.com) and the [cloud docs](https://docs.cloud.browser-use.com) for more!
Check out the [library docs](https://docs.browser-use.com/open-source/introduction) and the [cloud docs](https://docs.cloud.browser-use.com?utm_source=github&utm_medium=readme-cloud-docs) for more!
<br/>
# 🔥 Deploy on Sandboxes
# Open Source vs Cloud
We handle agents, browsers, persistence, auth, cookies, and LLMs. The agent runs right next to the browser for minimal latency.
<picture>
<source media="(prefers-color-scheme: light)" srcset="static/accuracy_by_model_light.png">
<source media="(prefers-color-scheme: dark)" srcset="static/accuracy_by_model_dark.png">
<img alt="BU Bench V1 - LLM Success Rates" src="static/accuracy_by_model_light.png" width="100%">
</picture>
```python
from browser_use import Browser, sandbox, ChatBrowserUse
from browser_use.agent.service import Agent
import asyncio
We benchmark Browser Use across 100 real-world browser tasks. Full benchmark is open source: **[browser-use/benchmark](https://github.com/browser-use/benchmark)**.
@sandbox()
async def my_task(browser: Browser):
agent = Agent(task="Find the top HN post", browser=browser, llm=ChatBrowserUse())
await agent.run()
**Use the Open-Source Agent**
- You need [custom tools](https://docs.browser-use.com/customize/tools/basics) or deep code-level integration
- We recommend pairing with our [cloud browsers](https://docs.browser-use.com/open-source/customize/browser/remote) for leading stealth, proxy rotation, and scaling
- Or self-host the open-source agent fully on your own machines
# Just call it like any async function
asyncio.run(my_task())
```
**Use the [Fully-Hosted Cloud Agent](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-hosted-agent) (recommended)**
- Much more powerful agent for complex tasks (see plot above)
- Easiest way to start and scale
- Best stealth with proxy rotation and captcha solving
- 1000+ integrations (Gmail, Slack, Notion, and more)
- Persistent filesystem and memory
See [Going to Production](https://docs.browser-use.com/production) for more details.
<br/>
# Demos
### 📋 Form-Filling
#### Task = "Fill in this job application with my resume and information."
![Job Application Demo](https://github.com/user-attachments/assets/57865ee6-6004-49d5-b2c2-6dff39ec2ba9)
[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/apply_to_job.py)
### 🍎 Grocery-Shopping
#### Task = "Put this list of items into my instacart."
https://github.com/user-attachments/assets/a6813fa7-4a7c-40a6-b4aa-382bf88b1850
[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/buy_groceries.py)
### 💻 Personal-Assistant.
#### Task = "Help me find parts for a custom PC."
https://github.com/user-attachments/assets/ac34f75c-057a-43ef-ad06-5b2c9d42bf06
[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/pcpartpicker.py)
### 💡See [more examples here ↗](https://docs.browser-use.com/examples) and give us a star!
<br/>
@@ -170,35 +192,6 @@ curl -o ~/.claude/skills/browser-use/SKILL.md \
<br/>
# Demos
### 📋 Form-Filling
#### Task = "Fill in this job application with my resume and information."
![Job Application Demo](https://github.com/user-attachments/assets/57865ee6-6004-49d5-b2c2-6dff39ec2ba9)
[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/apply_to_job.py)
### 🍎 Grocery-Shopping
#### Task = "Put this list of items into my instacart."
https://github.com/user-attachments/assets/a6813fa7-4a7c-40a6-b4aa-382bf88b1850
[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/buy_groceries.py)
### 💻 Personal-Assistant.
#### Task = "Help me find parts for a custom PC."
https://github.com/user-attachments/assets/ac34f75c-057a-43ef-ad06-5b2c9d42bf06
[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/pcpartpicker.py)
### 💡See [more examples here ↗](https://docs.browser-use.com/examples) and give us a star!
<br/>
## Integrations, hosting, custom tools, MCP, and more on our [Docs ↗](https://docs.browser-use.com)
<br/>
@@ -218,6 +211,15 @@ We optimized **ChatBrowserUse()** specifically for browser automation tasks. On
For other LLM providers, see our [supported models documentation](https://docs.browser-use.com/supported-models).
</details>
<details>
<summary><b>Should I use the Browser Use system prompt with the open-source preview model?</b></summary>
Yes. If you use `ChatBrowserUse(model='browser-use/bu-30b-a3b-preview')` with a normal `Agent(...)`, Browser Use still sends its default agent system prompt for you.
You do **not** need to add a separate custom "Browser Use system message" just because you switched to the open-source preview model. Only use `extend_system_message` or `override_system_message` when you intentionally want to customize the default behavior for your task.
If you want the best default speed/accuracy, we still recommend the newer hosted `bu-*` models. If you want the open-source preview model, the setup stays the same apart from the `model=` value.
</details>
<details>
<summary><b>Can I use custom tools with the agent?</b></summary>
@@ -249,6 +251,12 @@ agent = Agent(
Yes! Browser-Use is open source and free to use. You only need to choose an LLM provider (like OpenAI, Google, ChatBrowserUse, or run local models with Ollama).
</details>
<details>
<summary><b>Terms of Service</b></summary>
This open-source library is licensed under the MIT License. For Browser Use services & data policy, see our [Terms of Service](https://browser-use.com/legal/terms-of-service) and [Privacy Policy](https://browser-use.com/privacy/).
</details>
<details>
<summary><b>How do I handle authentication?</b></summary>
@@ -263,7 +271,7 @@ These examples show how to maintain sessions and handle authentication seamlessl
<details>
<summary><b>How do I solve CAPTCHAs?</b></summary>
For CAPTCHA handling, you need better browser fingerprinting and proxies. Use [Browser Use Cloud](https://cloud.browser-use.com) which provides stealth browsers designed to avoid detection and CAPTCHA challenges.
For CAPTCHA handling, you need better browser fingerprinting and proxies. Use [Browser Use Cloud](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-faq-captcha) which provides stealth browsers designed to avoid detection and CAPTCHA challenges.
</details>
<details>
@@ -271,7 +279,7 @@ For CAPTCHA handling, you need better browser fingerprinting and proxies. Use [B
Chrome can consume a lot of memory, and running many agents in parallel can be tricky to manage.
For production use cases, use our [Browser Use Cloud API](https://cloud.browser-use.com) which handles:
For production use cases, use our [Browser Use Cloud API](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-faq-production) which handles:
- Scalable browser infrastructure
- Memory management
- Proxy rotation

View File

@@ -52,7 +52,6 @@ if TYPE_CHECKING:
from browser_use.agent.views import ActionModel, ActionResult, AgentHistoryList
from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.browser import BrowserSession as Browser
from browser_use.code_use.service import CodeAgent
from browser_use.dom.service import DomService
from browser_use.llm import models
from browser_use.llm.anthropic.chat import ChatAnthropic
@@ -60,6 +59,7 @@ if TYPE_CHECKING:
from browser_use.llm.browser_use.chat import ChatBrowserUse
from browser_use.llm.google.chat import ChatGoogle
from browser_use.llm.groq.chat import ChatGroq
from browser_use.llm.litellm.chat import ChatLiteLLM
from browser_use.llm.mistral.chat import ChatMistral
from browser_use.llm.oci_raw.chat import ChatOCIRaw
from browser_use.llm.ollama.chat import ChatOllama
@@ -72,8 +72,6 @@ if TYPE_CHECKING:
_LAZY_IMPORTS = {
# Agent service (heavy due to dependencies)
# 'Agent': ('browser_use.agent.service', 'Agent'),
# Code-use agent (Jupyter notebook-like execution)
'CodeAgent': ('browser_use.code_use.service', 'CodeAgent'),
'Agent': ('browser_use.agent.service', 'Agent'),
# System prompt (moderate weight due to agent.views imports)
'SystemPrompt': ('browser_use.agent.prompts', 'SystemPrompt'),
@@ -95,6 +93,7 @@ _LAZY_IMPORTS = {
'ChatAnthropic': ('browser_use.llm.anthropic.chat', 'ChatAnthropic'),
'ChatBrowserUse': ('browser_use.llm.browser_use.chat', 'ChatBrowserUse'),
'ChatGroq': ('browser_use.llm.groq.chat', 'ChatGroq'),
'ChatLiteLLM': ('browser_use.llm.litellm.chat', 'ChatLiteLLM'),
'ChatMistral': ('browser_use.llm.mistral.chat', 'ChatMistral'),
'ChatAzureOpenAI': ('browser_use.llm.azure.chat', 'ChatAzureOpenAI'),
'ChatOCIRaw': ('browser_use.llm.oci_raw.chat', 'ChatOCIRaw'),
@@ -131,8 +130,6 @@ def __getattr__(name: str):
__all__ = [
'Agent',
'CodeAgent',
# 'CodeAgent',
'BrowserSession',
'Browser', # Alias for BrowserSession
'BrowserProfile',
@@ -148,6 +145,7 @@ __all__ = [
'ChatAnthropic',
'ChatBrowserUse',
'ChatGroq',
'ChatLiteLLM',
'ChatMistral',
'ChatAzureOpenAI',
'ChatOCIRaw',

View File

@@ -8,7 +8,7 @@ from bubus import BaseEvent
from pydantic import Field, field_validator
from uuid_extensions import uuid7str
MAX_STRING_LENGTH = 100000 # 100K chars ~ 25k tokens should be enough
MAX_STRING_LENGTH = 500000 # 500K chars ~ 125k tokens should be enough
MAX_URL_LENGTH = 100000
MAX_TASK_LENGTH = 100000
MAX_COMMENT_LENGTH = 2000
@@ -38,6 +38,8 @@ class UpdateAgentTaskEvent(BaseEvent):
raise ValueError('Agent must have _task_start_time attribute')
done_output = agent.history.final_result() if agent.history else None
if done_output and len(done_output) > MAX_STRING_LENGTH:
done_output = done_output[:MAX_STRING_LENGTH]
return cls(
id=str(agent.task_id),
user_id='', # To be filled by cloud handler

View File

@@ -108,7 +108,6 @@ def create_history_gif(
font_name = os.path.join(CONFIG.WIN_FONT_DIR, font_name + '.ttf')
regular_font = ImageFont.truetype(font_name, font_size)
title_font = ImageFont.truetype(font_name, title_font_size)
goal_font = ImageFont.truetype(font_name, goal_font_size)
font_loaded = True
break
except OSError:
@@ -121,8 +120,6 @@ def create_history_gif(
regular_font = ImageFont.load_default()
title_font = ImageFont.load_default()
goal_font = regular_font
# Load logo if requested
logo = None
if show_logo:
@@ -236,8 +233,6 @@ def _create_task_frame(
# Start with base font size (regular + 16)
base_font_size = regular_font.size + 16
min_font_size = max(regular_font.size - 10, 16) # Don't go below 16pt
max_font_size = base_font_size # Cap at the base font size
# Calculate dynamic font size based on text length and complexity
# Longer texts get progressively smaller fonts
text_length = len(task)

View File

@@ -88,6 +88,8 @@ def construct_judge_messages(
)
)
current_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
# System prompt for judge - conditionally add ground truth section
ground_truth_section = ''
if ground_truth:
@@ -168,7 +170,7 @@ Set `reached_captcha` to true if:
- **evaluate for action** - For each key step of the trace, double check whether the action that the agent tried to perform actually happened. If the required action did not actually occur, the verdict should be false.
- **screenshot is not entire content** - The agent has the entire DOM content, but the screenshot is only part of the content. If the agent extracts information from the page, but you do not see it in the screenshot, you can assume this information is there.
- **Penalize poor tool usage** - Wrong tools, inefficient approaches, ignoring available information.
- **ignore unexpected dates and times** - These agent traces are from varying dates, you can assume the dates the agent uses for search or filtering are correct.
- **current date/time is {current_date}** - content with recent dates is real, not fabricated.
- **IMPORTANT**: be very picky about the user's request - Have very high standard for the agent completing the task exactly to the user's request.
- **IMPORTANT**: be initially doubtful of the agent's self reported success, be sure to verify that its methods are valid and fulfill the user's desires to a tee.
@@ -221,54 +223,3 @@ Evaluate this agent execution given the criteria and respond with the exact JSON
SystemMessage(content=system_prompt),
UserMessage(content=content_parts),
]
def construct_simple_judge_messages(
task: str,
final_result: str,
) -> list[BaseMessage]:
"""Construct lightweight judge messages to validate agent success claims.
Always runs regardless of use_judge setting. Text-only — no screenshots,
no trajectory. Just task + final result.
"""
task_truncated = _truncate_text(task, 20000)
final_result_truncated = _truncate_text(final_result, 20000)
current_date = datetime.now(timezone.utc).strftime('%Y-%m-%d')
system_prompt = f"""You are a strict verifier checking whether a browser automation agent actually completed its task.
Today's date is {current_date}. The agent ran recently — dates near today are expected and NOT fabricated.
Given the task and the agent's final response, determine if the response genuinely satisfies ALL requirements.
Check for these common failure patterns:
1. **Incorrect data**: Wrong number of items, missing filters/criteria, wrong format
2. **Unverified actions**: Agent claims to have submitted a form, posted a comment, or saved a file but there's no evidence
3. **Incomplete results**: Some requirements from the task are not addressed in the response
4. **Fabricated content**: Data that looks plausible but wasn't actually extracted from any page. NOTE: dates and times close to today's date ({current_date}) are NOT fabricated — the agent browses live websites and extracts real-time content.
5. **Partial completion reported as success**: Response acknowledges failure or blockers (captcha, access denied, etc.) but still claims success
Respond with EXACTLY this JSON structure:
{{
"is_correct": true or false,
"reason": "Brief explanation if not correct, empty string if correct"
}}
Be strict: if the response doesn't clearly satisfy every requirement, set is_correct to false."""
user_prompt = f"""<task>
{task_truncated or 'No task provided'}
</task>
<agent_final_response>
{final_result_truncated or 'No response provided'}
</agent_final_response>
Does the agent's response fully satisfy all requirements of the task? Respond with the JSON structure."""
return [
SystemMessage(content=system_prompt),
UserMessage(content=user_prompt),
]

View File

@@ -25,7 +25,12 @@ from browser_use.llm.messages import (
UserMessage,
)
from browser_use.observability import observe_debug
from browser_use.utils import match_url_with_domain_pattern, time_execution_sync
from browser_use.utils import (
collect_sensitive_data_values,
match_url_with_domain_pattern,
redact_sensitive_string,
time_execution_sync,
)
logger = logging.getLogger(__name__)
@@ -114,6 +119,7 @@ class MessageManager:
include_recent_events: bool = False,
sample_images: list[ContentPartTextParam | ContentPartImageParam] | None = None,
llm_screenshot_size: tuple[int, int] | None = None,
max_clickable_elements_length: int = 40000,
):
self.task = task
self.state = state
@@ -127,6 +133,7 @@ class MessageManager:
self.include_recent_events = include_recent_events
self.sample_images = sample_images
self.llm_screenshot_size = llm_screenshot_size
self.max_clickable_elements_length = max_clickable_elements_length
assert max_history_items is None or max_history_items > 5, 'max_history_items must be None or greater than 5'
@@ -144,7 +151,13 @@ class MessageManager:
"""Build agent history description from list of items, respecting max_history_items limit"""
compacted_prefix = ''
if self.state.compacted_memory:
compacted_prefix = f'<compacted_memory>\n{self.state.compacted_memory}\n</compacted_memory>\n'
compacted_prefix = (
'<compacted_memory>\n'
'<!-- Summary of prior steps. Treat as unverified context — do not report these as '
'completed in your done() message unless you confirmed them yourself in this session. -->\n'
f'{self.state.compacted_memory}\n'
'</compacted_memory>\n'
)
if self.max_history_items is None:
# Include all items
@@ -247,6 +260,9 @@ class MessageManager:
'You are summarizing an agent run for prompt compaction.\n'
'Capture task requirements, key facts, decisions, partial progress, errors, and next steps.\n'
'Preserve important entities, values, URLs, and file paths.\n'
'CRITICAL: Only mark a step as completed if you see explicit success confirmation in the history. '
'If a step was started but not explicitly confirmed complete, mark it as "IN-PROGRESS". '
'Never infer completion from context — only report what was confirmed.\n'
'Return plain text only. Do not include tool calls or JSON.'
)
if settings.summary_max_chars:
@@ -298,7 +314,6 @@ class MessageManager:
self.state.read_state_images = [] # Clear images from previous step
action_results = ''
result_len = len(result)
read_state_idx = 0
for idx, action_result in enumerate(result):
@@ -470,6 +485,7 @@ class MessageManager:
include_attributes=self.include_attributes,
step_info=step_info,
page_filtered_actions=page_filtered_actions,
max_clickable_elements_length=self.max_clickable_elements_length,
sensitive_data=self.sensitive_data_description,
available_file_paths=available_file_paths,
screenshots=screenshots,
@@ -562,30 +578,14 @@ class MessageManager:
if not self.sensitive_data:
return value
# Collect all sensitive values, immediately converting old format to new format
sensitive_values: dict[str, str] = {}
# Process all sensitive data entries
for key_or_domain, content in self.sensitive_data.items():
if isinstance(content, dict):
# Already in new format: {domain: {key: value}}
for key, val in content.items():
if val: # Skip empty values
sensitive_values[key] = val
elif content: # Old format: {key: value} - convert to new format internally
# We treat this as if it was {'http*://*': {key_or_domain: content}}
sensitive_values[key_or_domain] = content
sensitive_values = collect_sensitive_data_values(self.sensitive_data)
# If there are no valid sensitive data entries, just return the original value
if not sensitive_values:
logger.warning('No valid entries found in sensitive_data dictionary')
return value
# Replace all valid sensitive data values with their placeholder tags
for key, val in sensitive_values.items():
value = value.replace(val, f'<secret>{key}</secret>')
return value
return redact_sensitive_string(value, sensitive_values)
if isinstance(message.content, str):
message.content = replace_sensitive(message.content)

View File

@@ -157,6 +157,7 @@ class AgentMessagePrompt:
'images': 0,
'interactive_elements': 0,
'total_elements': 0,
'text_chars': 0,
}
if not self.browser_state.dom_state or not self.browser_state.dom_state._root:
@@ -203,6 +204,9 @@ class AgentMessagePrompt:
else:
stats['shadow_open'] += 1
elif original.node_type == NodeType.TEXT_NODE:
stats['text_chars'] += len(original.node_value.strip())
elif original.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
# Shadow DOM fragment - these are the actual shadow roots
# But don't double-count since we count them at the host level above
@@ -224,6 +228,9 @@ class AgentMessagePrompt:
stats_text = '<page_stats>'
if page_stats['total_elements'] < 10:
stats_text += 'Page appears empty (SPA not loaded?) - '
# Skeleton screen: many elements but almost no text = loading placeholders
elif page_stats['total_elements'] > 20 and page_stats['text_chars'] < page_stats['total_elements'] * 5:
stats_text += 'Page appears to show skeleton/placeholder content (still loading?) - '
stats_text += f'{page_stats["links"]} links, {page_stats["interactive_elements"]} interactive, '
stats_text += f'{page_stats["iframes"]} iframes'
if page_stats['shadow_open'] > 0 or page_stats['shadow_closed'] > 0:
@@ -252,14 +259,11 @@ class AgentMessagePrompt:
pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
has_content_above = pages_above > 0
has_content_below = pages_below > 0
total_pages = pi.page_height / pi.viewport_height if pi.viewport_height > 0 else 0
current_page_position = pi.scroll_y / max(pi.page_height - pi.viewport_height, 1)
page_info_text = '<page_info>'
page_info_text += f'{pages_above:.1f} above, '
page_info_text += f'{pages_below:.1f} below '
page_info_text += f'{pages_above:.1f} pages above, {pages_below:.1f} pages below'
if pages_below > 0.2:
page_info_text += ' — scroll down to reveal more content'
page_info_text += '</page_info>\n'
# , at {current_page_position:.0%} of page
if elements_text != '':
if not has_content_above:
elements_text = f'[Start of page]\n{elements_text}'

View File

@@ -36,7 +36,7 @@ from pydantic import BaseModel, ValidationError
from uuid_extensions import uuid7str
from browser_use import Browser, BrowserProfile, BrowserSession
from browser_use.agent.judge import construct_judge_messages, construct_simple_judge_messages
from browser_use.agent.judge import construct_judge_messages
# Lazy import for gif to avoid heavy agent.views import at startup
# from browser_use.agent.gif import create_history_gif
@@ -59,7 +59,6 @@ from browser_use.agent.views import (
JudgementResult,
MessageCompactionSettings,
PlanItem,
SimpleJudgeResult,
StepMetadata,
)
from browser_use.browser.events import _get_timeout
@@ -188,6 +187,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
file_system_path: str | None = None,
task_id: str | None = None,
calculate_cost: bool = False,
pricing_url: str | None = None,
display_files_in_done_text: bool = True,
include_tool_call_examples: bool = False,
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
@@ -204,7 +204,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
loop_detection_enabled: bool = True,
llm_screenshot_size: tuple[int, int] | None = None,
message_compaction: MessageCompactionSettings | bool | None = True,
max_clickable_elements_length: int = 40000,
_url_shortening_limit: int = 25,
enable_signal_handler: bool = True,
**kwargs,
):
# Validate llm_screenshot_size
@@ -409,16 +411,20 @@ class Agent(Generic[Context, AgentStructuredOutput]):
loop_detection_window=loop_detection_window,
loop_detection_enabled=loop_detection_enabled,
message_compaction=message_compaction,
max_clickable_elements_length=max_clickable_elements_length,
)
# Token cost service
self.token_cost_service = TokenCost(include_cost=calculate_cost)
self.token_cost_service = TokenCost(include_cost=calculate_cost, pricing_url=pricing_url)
self.token_cost_service.register_llm(llm)
self.token_cost_service.register_llm(page_extraction_llm)
self.token_cost_service.register_llm(judge_llm)
if self.settings.message_compaction and self.settings.message_compaction.compaction_llm:
self.token_cost_service.register_llm(self.settings.message_compaction.compaction_llm)
# Store signal handler setting (not part of AgentSettings as it's runtime behavior)
self.enable_signal_handler = enable_signal_handler
# Initialize state
self.state = injected_agent_state or AgentState()
@@ -514,6 +520,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
include_recent_events=self.include_recent_events,
sample_images=self.sample_images,
llm_screenshot_size=llm_screenshot_size,
max_clickable_elements_length=self.settings.max_clickable_elements_length,
)
if self.sensitive_data:
@@ -1022,9 +1029,35 @@ class Agent(Generic[Context, AgentStructuredOutput]):
browser_state_summary = None
try:
if self.browser_session:
try:
captcha_wait = await self.browser_session.wait_if_captcha_solving()
if captcha_wait and captcha_wait.waited:
# Reset step timing to exclude the captcha wait from step duration metrics
self.step_start_time = time.time()
duration_s = captcha_wait.duration_ms / 1000
outcome = captcha_wait.result # 'success' | 'failed' | 'timeout'
msg = f'Waited {duration_s:.1f}s for {captcha_wait.vendor} CAPTCHA to be solved. Result: {outcome}.'
self.logger.info(f'🔒 {msg}')
# Inject the outcome so the LLM sees what happened
captcha_result = ActionResult(long_term_memory=msg)
if self.state.last_result:
self.state.last_result.append(captcha_result)
else:
self.state.last_result = [captcha_result]
except Exception as e:
self.logger.warning(f'Phase 0 captcha wait failed (non-fatal): {e}')
# Phase 1: Prepare context and timing
browser_state_summary = await self._prepare_context(step_info)
# Clear previous step state after context preparation (which needs
# them for the "previous action result" prompt) but before the LLM
# call, so a timeout during _get_next_action or _execute_actions
# won't leave stale data from the previous step.
self.state.last_model_output = None
self.state.last_result = None
# Phase 2: Get model output and execute actions
await self._get_next_action(browser_state_summary)
await self._execute_actions()
@@ -1220,12 +1253,31 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.warning(f'{error_msg}')
return
# Handle browser closed/disconnected errors - stop immediately instead of retrying
if self._is_browser_closed_error(error):
self.logger.warning(f'🛑 Browser closed or disconnected: {error}')
self.state.stopped = True
self._external_pause_event.set()
return
# Handle browser closed/disconnected errors
if self._is_connection_like_error(error):
# If reconnection is in progress, wait for it instead of stopping
if self.browser_session.is_reconnecting:
wait_timeout = self.browser_session.RECONNECT_WAIT_TIMEOUT
self.logger.warning(
f'🔄 Connection error during reconnection, waiting up to {wait_timeout}s for reconnect: {error}'
)
try:
await asyncio.wait_for(self.browser_session._reconnect_event.wait(), timeout=wait_timeout)
except TimeoutError:
pass
# Check if reconnection succeeded
if self.browser_session.is_cdp_connected:
self.logger.info('🔄 Reconnection succeeded, retrying step...')
self.state.last_result = [ActionResult(error=f'Connection lost and recovered: {error}')]
return
# Not reconnecting or reconnection failed — check if truly terminal
if self._is_browser_closed_error(error):
self.logger.warning(f'🛑 Browser closed or disconnected: {error}')
self.state.stopped = True
self._external_pause_event.set()
return
# Handle all other exceptions
include_trace = self.logger.isEnabledFor(logging.DEBUG)
@@ -1249,14 +1301,35 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.state.last_result = [ActionResult(error=error_msg)]
return None
def _is_connection_like_error(self, error: Exception) -> bool:
"""Check if the error looks like a CDP/WebSocket connection failure.
Unlike _is_browser_closed_error(), this does NOT check if the CDP client is None
or if reconnection is in progress — it purely looks at the error signature.
"""
error_str = str(error).lower()
return (
isinstance(error, ConnectionError)
or 'websocket connection closed' in error_str
or 'connection closed' in error_str
or 'browser has been closed' in error_str
or 'browser closed' in error_str
or 'no browser' in error_str
)
def _is_browser_closed_error(self, error: Exception) -> bool:
"""Check if the browser has been closed or disconnected.
Only returns True when the error itself is a CDP/WebSocket connection failure
AND the CDP client is gone. Avoids false positives on unrelated errors
(element not found, timeouts, parse errors) that happen to coincide with
a transient None state during reconnects or resets.
AND the CDP client is gone AND we're not actively reconnecting.
Avoids false positives on unrelated errors (element not found, timeouts,
parse errors) that happen to coincide with a transient None state during
reconnects or resets.
"""
# During reconnection, don't treat connection errors as terminal
if self.browser_session.is_reconnecting:
return False
error_str = str(error).lower()
is_connection_error = (
isinstance(error, ConnectionError)
@@ -1504,46 +1577,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self._message_manager._add_context_message(UserMessage(content=msg))
self.AgentOutput = self.DoneAgentOutput
async def _run_simple_judge(self) -> None:
	"""Lightweight always-on judge that overrides agent success when it overclaims.

	Runs regardless of use_judge setting. Only checks tasks where the agent
	claimed success — if the agent already reports failure, there's nothing to correct.
	"""
	# Only the very last ActionResult of the run is judged; presumably this is
	# always the done-action result — TODO confirm history always ends with it.
	last_result = self.history.history[-1].result[-1]
	if not last_result.is_done or not last_result.success:
		return
	task = self.task
	# final_result() may return None; normalize to '' for the judge prompt.
	final_result = self.history.final_result() or ''
	messages = construct_simple_judge_messages(
		task=task,
		final_result=final_result,
	)
	try:
		# Reuse the agent's main LLM (not a dedicated judge model) with a
		# structured SimpleJudgeResult output format.
		response = await self.llm.ainvoke(messages, output_format=SimpleJudgeResult)
		result: SimpleJudgeResult = response.completion  # type: ignore[assignment]
		if not result.is_correct:
			reason = result.reason or 'Task requirements not fully met'
			self.logger.info(f'⚠️ Simple judge overriding success to failure: {reason}')
			# Flip the self-reported success in place; callers read last_result later.
			last_result.success = False
			note = f'[Simple judge: {reason}]'
			# When structured output is expected, don't append judge text to extracted_content
			# as it would corrupt the JSON and break end-user parsers
			if self.output_model_schema is not None:
				if last_result.metadata is None:
					last_result.metadata = {}
				last_result.metadata['simple_judge'] = note
			elif last_result.extracted_content:
				last_result.extracted_content += f'\n\n{note}'
			else:
				last_result.extracted_content = note
	except Exception as e:
		# Judge failures are non-fatal: log and leave the agent's verdict untouched.
		self.logger.warning(f'Simple judge failed with error: {e}')
		# Don't override on error — keep the agent's self-report
@observe(ignore_input=True, ignore_output=False)
async def _judge_trace(self) -> JudgementResult | None:
"""Judge the trace of the agent"""
@@ -1614,8 +1647,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if judgement.failure_reason:
judge_log += f' Failure Reason: {judgement.failure_reason}\n'
if judgement.reached_captcha:
judge_log += ' 🤖 Captcha Detected: Agent encountered captcha challenges\n'
judge_log += ' 👉 🥷 Use Browser Use Cloud for the most stealth browser infra: https://docs.browser-use.com/customize/browser/remote\n'
self.logger.warning(
'Agent was blocked by a captcha. Cloud browsers include stealth fingerprinting and proxy rotation to avoid this.\n'
' Try: Browser(use_cloud=True) | Get an API key: https://cloud.browser-use.com?utm_source=oss&utm_medium=captcha_nudge'
)
judge_log += f' {judgement.reasoning}\n'
self.logger.info(judge_log)
@@ -2023,8 +2058,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if not (self.logger.isEnabledFor(logging.DEBUG) and parsed.action):
return
action_count = len(parsed.action)
# Collect action details
action_details = []
for i, action in enumerate(parsed.action):
@@ -2129,11 +2162,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
has_captcha_issue = any(keyword in final_result_str for keyword in captcha_keywords)
if has_captcha_issue:
# Suggest use_cloud=True for captcha/cloudflare issues
task_preview = self.task[:10] if len(self.task) > 10 else self.task
self.logger.info('')
self.logger.info('Failed because of CAPTCHA? For better browser stealth, try:')
self.logger.info(f' agent = Agent(task="{task_preview}...", browser=Browser(use_cloud=True))')
self.logger.warning(
'Agent was blocked by a captcha. Cloud browsers include stealth fingerprinting and proxy rotation to avoid this.\n'
' Try: Browser(use_cloud=True) | Get an API key: https://cloud.browser-use.com?utm_source=oss&utm_medium=captcha_nudge'
)
# General failure message
self.logger.info('')
@@ -2225,9 +2257,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
await self.step(step_info)
if self.history.is_done():
# Always run simple judge to align agent success with reality
await self._run_simple_judge()
await self.log_completion()
# Run full judge before done callback if enabled
@@ -2424,14 +2453,15 @@ class Agent(Generic[Context, AgentStructuredOutput]):
await self._demo_mode_log(error_msg, 'error', {'step': step + 1})
self.state.consecutive_failures += 1
self.state.last_result = [ActionResult(error=error_msg)]
# Ensure step counter advances on timeout — _finalize() may have
# been skipped or returned early due to the cancellation.
if self.state.n_steps == step + 1:
self.state.n_steps += 1
if on_step_end is not None:
await on_step_end(self)
if self.history.is_done():
# Always run simple judge to align agent success with reality
await self._run_simple_judge()
await self.log_completion()
# Run full judge before done callback if enabled
@@ -2480,6 +2510,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
resume_callback=self.resume,
custom_exit_callback=on_force_exit_log_telemetry, # Pass the new telemetry callback
exit_on_second_int=True,
disabled=not self.enable_signal_handler,
)
signal_handler.register()
@@ -2672,7 +2703,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
to pre-action values. Any change aborts the remaining queue.
"""
results: list[ActionResult] = []
time_elapsed = 0
total_actions = len(actions)
assert self.browser_session is not None, 'BrowserSession is not set up'
@@ -2682,19 +2712,20 @@ class Agent(Generic[Context, AgentStructuredOutput]):
and self.browser_session._cached_browser_state_summary.dom_state is not None
):
cached_selector_map = dict(self.browser_session._cached_browser_state_summary.dom_state.selector_map)
cached_element_hashes = {e.parent_branch_hash() for e in cached_selector_map.values()}
else:
cached_selector_map = {}
cached_element_hashes = set()
except Exception as e:
self.logger.error(f'Error getting cached selector map: {e}')
cached_selector_map = {}
cached_element_hashes = set()
for i, action in enumerate(actions):
# Get action name from the action model BEFORE try block to ensure it's always available in except
action_data = action.model_dump(exclude_unset=True)
action_name = next(iter(action_data.keys())) if action_data else 'unknown'
if i > 0:
# ONLY ALLOW TO CALL `done` IF IT IS A SINGLE ACTION
if action.model_dump(exclude_unset=True).get('done') is not None:
if action_data.get('done') is not None:
msg = f'Done action is allowed only as a single action - stopped after action {i} / {total_actions}.'
self.logger.debug(msg)
break
@@ -2706,9 +2737,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
try:
await self._check_stop_or_pause()
# Get action name from the action model
action_data = action.model_dump(exclude_unset=True)
action_name = next(iter(action_data.keys())) if action_data else 'unknown'
# Log action before execution
await self._log_action(action, action_name, i + 1, total_actions)
@@ -2717,8 +2745,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
pre_action_url = await self.browser_session.get_current_page_url()
pre_action_focus = self.browser_session.agent_focus_target_id
time_start = time.time()
result = await self.tools.act(
action=action,
browser_session=self.browser_session,
@@ -2729,9 +2755,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
extraction_schema=self.extraction_schema,
)
time_end = time.time()
time_elapsed = time_end - time_start
if result.error:
await self._demo_mode_log(
f'Action "{action_name}" failed: {result.error}',
@@ -3429,7 +3452,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
hist_node = historical_elem.node_name.lower() if historical_elem else ''
similar_elements = []
if historical_elem and historical_elem.attributes:
hist_aria = historical_elem.attributes.get('aria-label', '')
for idx, elem in selector_map.items():
if elem.node_name.lower() == hist_node and elem.attributes:
elem_aria = elem.attributes.get('aria-label', '')
@@ -3911,6 +3933,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Kill the browser session - this dispatches BrowserStopEvent,
# stops the EventBus with clear=True, and recreates a fresh EventBus
await self.browser_session.kill()
else:
# keep_alive=True sessions shouldn't keep the event loop alive after agent.run()
await self.browser_session.event_bus.stop(
clear=False,
timeout=_get_timeout('TIMEOUT_BrowserSessionEventBusStopOnAgentClose', 1.0),
)
try:
self.browser_session.event_bus.event_queue = None
self.browser_session.event_bus._on_idle = None
except Exception:
pass
# Close skill service if configured
if self.skill_service is not None:

View File

@@ -40,18 +40,25 @@ USER REQUEST: This is your ultimate objective and always remains visible.
1. Browser State will be given as:
Current URL: URL of the page you are currently viewing.
Open Tabs: Open tabs with their ids.
Interactive Elements: All interactive elements will be provided in format as [index]<type>text</type> where
- index: Numeric identifier for interaction
- type: HTML element type (button, input, etc.)
- text: Element description
Interactive Elements: All interactive elements will be provided in a tree-style XML format:
- Format: `[index]<tagname attribute=value />` for interactive elements
- Text content appears as child nodes on separate lines (not inside tags)
- Indentation with tabs shows parent/child relationships
Examples:
[33]<div>User form</div>
\t*[35]<button aria-label='Submit form'>Submit</button>
[33]<div />
User form
[35]<input type=text placeholder=Enter name />
*[38]<button aria-label=Submit form />
Submit
[40]<a />
About us
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input you might need to select the right option from the list.
- Pure text elements without [] are not interactive.
- Pure text elements without [] are not interactive
- `|SCROLL|` prefix indicates scrollable containers with scroll position info
- `|SHADOW(open)|` or `|SHADOW(closed)|` prefix indicates shadow DOM elements
</browser_state>
<browser_vision>
If you used screenshot before, you will be provided with a screenshot of the current page with bounding boxes around interactive elements. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress.
@@ -65,14 +72,14 @@ Strictly follow these rules while using the browser and navigating the web:
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed.
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack). Do not spend more than 3-4 steps on a single captcha - if blocked, try alternative approaches or report the limitation.
- CAPTCHAs are automatically solved by the browser. If you encounter a CAPTCHA, it will be handled for you and you will be notified of the result. Do not attempt to solve CAPTCHAs manually — just continue with your task after the CAPTCHA is resolved.
- If the page is not fully loaded, use the wait action.
- You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the extract tool is expensive! DO NOT query the same page with the same extract query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- Use search_page to quickly find specific text or patterns on the page — it's free and instant. Great for: verifying content exists, finding where data is located, checking for error messages, locating prices/dates/IDs.
- Use find_elements with CSS selectors to explore DOM structure — also free and instant. Great for: counting items (e.g. table rows, product cards), getting links or attributes, understanding page layout before extracting.
- Prefer search_page and find_elements over scrolling when looking for specific content not visible in browser_state.
- Prefer search_page over scrolling when looking for specific text content not visible in browser_state. Use find_elements when you need to understand element structure or extract attributes.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., ALWAYS look for filter/sort options FIRST before browsing results. Apply all relevant filters before scrolling through results.
@@ -84,7 +91,7 @@ Strictly follow these rules while using the browser and navigating the web:
1. Very specific step by step instructions:
- Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search.
- If you get stuck e.g. with logins in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes a login accidentally pops up, even though some part of the page is accessible, or you can get some information via web search. CAPTCHAs are handled automatically.
- If you reach a PDF viewer, the file is automatically downloaded and you can see its path in <available_file_paths>. You can either read the file or scroll in the page to see more.
- Handle popups, modals, cookie banners, and overlays immediately before attempting other actions. Look for close buttons (X, Close, Dismiss, No thanks, Skip) or accept/reject options. If a popup blocks interaction with the main page, handle it first.
- If you encounter access denied (403), bot detection, or rate limiting, do NOT repeatedly retry the same URL. Try alternative approaches or report the limitation.
@@ -138,9 +145,9 @@ BEFORE calling `done` with `success=true`, you MUST perform this verification:
3. **Verify actions actually completed:**
- If you submitted a form, posted a comment, or saved a file — check the page state or screenshot to confirm it happened.
- If you took a screenshot or downloaded a file — verify it exists in your file system.
4. **Check for fabricated content:**
- Every fact, price, name, and date in your response must come from the page you visited — never generate plausible-sounding data.
5. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
4. **Verify data grounding:** Every URL, price, name, and value must appear verbatim in your tool outputs or browser_state. Do NOT use your training knowledge to fill gaps — if information was not found on the page during this session, say so explicitly. Never fabricate or invent values.
5. **Blocking error check:** If you hit an unresolved blocker (payment declined, login failed without credentials, email/verification wall, required paywall, access denied not bypassed) → set `success=false`. Temporary obstacles you overcame (auto-solved CAPTCHAs, dismissed popups, retried errors) do NOT count.
6. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
Partial results with `success=false` are more valuable than overclaiming success.
</pre_done_verification>
</task_completion_rules>
@@ -154,9 +161,11 @@ Check the browser state each step to verify your previous action achieved its go
You can output multiple actions in one step. Try to be efficient where it makes sense. Do not predict actions which do not make sense for the current page.
**Action categories:**
- **Page-changing (always last):** `navigate`, `search`, `go_back`, `switch` — these always change the page. Remaining actions after them are skipped automatically.
- **Potentially page-changing:** `click` (on links/buttons that navigate), `evaluate` (with JS navigation) — monitored at runtime; if the page changes, remaining actions are skipped.
- **Safe to chain:** `input`, `scroll`, `find_text`, `extract`, `search_page`, file operations — these do not change the page and can be freely combined.
- **Page-changing (always last):** `navigate`, `search`, `go_back`, `switch`, `evaluate` — these always change the page. Remaining actions after them are skipped automatically. Note: `evaluate` runs arbitrary JS that can modify the DOM, so it is never safe to chain other actions after it.
- **Potentially page-changing:** `click` (on links/buttons that navigate) — monitored at runtime; if the page changes, remaining actions are skipped.
- **Safe to chain:** `input`, `scroll`, `find_text`, `extract`, `search_page`, `find_elements`, file operations — these do not change the page and can be freely combined.
**Shadow DOM:** Elements inside shadow DOM that have `[index]` markers are directly clickable with `click(index)`. Do NOT use `evaluate` to click them.
**Recommended combinations:**
- `input` + `input` + `input` + `click` → Fill multiple form fields then submit
@@ -239,7 +248,7 @@ Action list should NEVER be empty.
3. ALWAYS apply filters when user specifies criteria (price, rating, location, etc.)
4. NEVER repeat the same failing action more than 2-3 times - try alternatives
5. NEVER assume success - always verify from screenshot or browser state
6. If blocked by captcha/login/403, try alternative approaches rather than retrying
6. CAPTCHAs are solved automatically. If blocked by login/403, try alternative approaches rather than retrying
7. Put ALL relevant findings in done action's text field
8. Match user's requested output format exactly
9. Track progress in memory to avoid loops
@@ -253,7 +262,7 @@ When encountering errors or unexpected states:
2. Check if a popup, modal, or overlay is blocking interaction
3. If an element is not found, scroll to reveal more content
4. If an action fails repeatedly (2-3 times), try an alternative approach
5. If blocked by login/captcha/403, consider alternative sites or search engines
5. If blocked by login/403, consider alternative sites or search engines. CAPTCHAs are solved automatically.
6. If the page structure is different than expected, re-analyze and adapt
7. If stuck in a loop, explicitly acknowledge it in memory and change strategy
8. If max_steps is approaching, prioritize completing the most important parts of the task

View File

@@ -31,7 +31,7 @@ Strictly follow these rules while using the browser and navigating the web:
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Scroll to see more elements if needed.
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack). Do not spend more than 3-4 steps on a single captcha - if blocked, try alternative approaches or report the limitation.
- CAPTCHAs are automatically solved by the browser. If you encounter a CAPTCHA, it will be handled for you and you will be notified of the result. Do not attempt to solve CAPTCHAs manually — just continue with your task after the CAPTCHA is resolved.
- If the page is not fully loaded, use the wait action to allow content to render.
- You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
@@ -46,7 +46,7 @@ Strictly follow these rules while using the browser and navigating the web:
- There are 2 types of tasks:
1. Very specific step by step instructions: Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search.
- If you get stuck e.g. with logins in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search. CAPTCHAs are handled automatically.
- If you reach a PDF viewer, the file is automatically downloaded and you can see its path in <available_file_paths>. You can either read the file or scroll in the page to see more.
- Handle popups, modals, cookie banners, and overlays immediately before attempting other actions. Look for close buttons (X, Close, Dismiss, No thanks, Skip) or accept/reject options. If a popup blocks interaction with the main page, handle it first. Many websites show cookie consent dialogs, newsletter popups, or promotional overlays that must be dismissed.
- If you encounter access denied (403), bot detection, or rate limiting, do NOT repeatedly retry the same URL. Try alternative approaches or report the limitation. Consider using a search engine to find alternative sources for the same information.
@@ -93,9 +93,9 @@ BEFORE calling `done` with `success=true`, you MUST perform this verification:
3. **Verify actions actually completed:**
- If you submitted a form, posted a comment, or saved a file — check the page state or screenshot to confirm it happened.
- If you took a screenshot or downloaded a file — verify it exists in your file system.
4. **Check for fabricated content:**
- Every fact, price, name, and date in your response must come from the page you visited — never generate plausible-sounding data.
5. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
4. **Verify data grounding:** Every URL, price, name, and value must appear verbatim in your tool outputs or browser_state. Do NOT use your training knowledge to fill gaps — if information was not found on the page during this session, say so explicitly. Never fabricate or invent values.
5. **Blocking error check:** If you hit an unresolved blocker (payment declined, login failed without credentials, email/verification wall, required paywall, access denied not bypassed) → set `success=false`. Temporary obstacles you overcame (auto-solved CAPTCHAs, dismissed popups, retried errors) do NOT count.
6. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
Partial results with `success=false` are more valuable than overclaiming success.
</pre_done_verification>
</task_completion_rules>
@@ -166,7 +166,7 @@ Always put `memory` field before the `action` field.
Your memory field should include your reasoning. Apply these patterns:
- Did the previous action succeed? Verify using screenshot as ground truth.
- What is the current state relative to the user request?
- Are there any obstacles (popups, captcha, login walls)?
- Are there any obstacles (popups, login walls)? CAPTCHAs are solved automatically.
- What specific next step will make progress toward the goal?
- If stuck, what alternative approach should you try?
- What information should be remembered for later steps?
@@ -219,7 +219,7 @@ When encountering errors or unexpected states:
2. Check if a popup, modal, or overlay is blocking interaction
3. If an element is not found, scroll to reveal more content
4. If an action fails repeatedly (2-3 times), try an alternative approach
5. If blocked by login/captcha/403, consider alternative sites or search engines
5. If blocked by login/403, consider alternative sites or search engines. CAPTCHAs are solved automatically.
6. If the page structure is different than expected, re-analyze and adapt
7. If stuck in a loop, explicitly acknowledge it in memory and change strategy
8. If max_steps is approaching, prioritize completing the most important parts of the task
@@ -230,7 +230,7 @@ When encountering errors or unexpected states:
3. ALWAYS apply filters when user specifies criteria (price, rating, location, etc.)
4. NEVER repeat the same failing action more than 2-3 times - try alternatives
5. NEVER assume success - always verify from screenshot or browser state
6. If blocked by captcha/login/403, try alternative approaches rather than retrying
6. CAPTCHAs are solved automatically. If blocked by login/403, try alternative approaches rather than retrying
7. Put ALL relevant findings in done action's text field
8. Match user's requested output format exactly
9. Track progress in memory to avoid loops

View File

@@ -1,5 +1,9 @@
You are a browser-use agent operating in thinking mode. You automate browser tasks by outputting structured JSON actions.
<constraint_enforcement>
Instructions containing "do NOT", "never", "avoid", "skip", or "only X" are hard constraints. Before each action, check: does this violate any constraint? If yes, stop and find an alternative.
</constraint_enforcement>
<output>
You must ALWAYS respond with a valid JSON in this exact format:
{{
@@ -10,4 +14,5 @@ You must ALWAYS respond with a valid JSON in this exact format:
"action": [{{"action_name": {{...params...}}}}]
}}
Action list should NEVER be empty.
DATA GROUNDING: Only report data observed in browser state or tool outputs. Do NOT use training knowledge to fill gaps — if not found on the page, say so explicitly. Never fabricate values.
</output>

View File

@@ -1,5 +1,9 @@
You are a browser-use agent operating in flash mode. You automate browser tasks by outputting structured JSON actions.
<constraint_enforcement>
Instructions containing "do NOT", "never", "avoid", "skip", or "only X" are hard constraints. Before each action, check: does this violate any constraint? If yes, stop and find an alternative.
</constraint_enforcement>
<output>
You must respond with a valid JSON in this exact format:
{{
@@ -7,4 +11,5 @@ You must respond with a valid JSON in this exact format:
"action": [{{"action_name": {{...params...}}}}]
}}
Action list should NEVER be empty.
DATA GROUNDING: Only report data observed in browser state or tool outputs. Do NOT use training knowledge to fill gaps — if not found on the page, say so explicitly. Never fabricate values.
</output>

View File

@@ -1,5 +1,9 @@
You are a browser-use agent. You automate browser tasks by outputting structured JSON actions.
<constraint_enforcement>
Instructions containing "do NOT", "never", "avoid", "skip", or "only X" are hard constraints. Before each action, check: does this violate any constraint? If yes, stop and find an alternative.
</constraint_enforcement>
<output>
You must ALWAYS respond with a valid JSON in this exact format:
{{
@@ -9,4 +13,5 @@ You must ALWAYS respond with a valid JSON in this exact format:
"action": [{{"action_name": {{...params...}}}}]
}}
Action list should NEVER be empty.
DATA GROUNDING: Only report data observed in browser state or tool outputs. Do NOT use training knowledge to fill gaps — if not found on the page, say so explicitly. Never fabricate values.
</output>

View File

@@ -12,4 +12,5 @@ You are allowed to use a maximum of {max_actions} actions per step. Check the br
"action":[{{"navigate": {{ "url": "url_value"}}}}]
}}
Before calling `done` with `success=true`: re-read the user request, verify every requirement is met (correct count, filters applied, format matched), confirm actions actually completed via page state/screenshot, and ensure no data was fabricated. If anything is unmet or uncertain, set `success` to `false`.
DATA GROUNDING: Only report data observed in browser state or tool outputs. Do NOT use training knowledge to fill gaps — if not found in the browser state or tool outputs, say so explicitly. Never fabricate values.
</output>

View File

@@ -27,4 +27,5 @@ You are allowed to use a maximum of {max_actions} actions per step. Check the br
Always put `memory` field before the `action` field.
Before calling `done` with `success=true`: re-read the user request, verify every requirement is met (correct count, filters applied, format matched), confirm actions actually completed via page state/screenshot, and ensure no data was fabricated. If anything is unmet or uncertain, set `success` to `false`.
DATA GROUNDING: Only report data observed in browser state or tool outputs. Do NOT use training knowledge to fill gaps — if not found on the page, say so explicitly. Never fabricate values.
</output>

View File

@@ -65,7 +65,7 @@ Strictly follow these rules while using the browser and navigating the web:
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed.
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack). Do not spend more than 3-4 steps on a single captcha - if blocked, try alternative approaches or report the limitation.
- CAPTCHAs are automatically solved by the browser. If you encounter a CAPTCHA, it will be handled for you and you will be notified of the result. Do not attempt to solve CAPTCHAs manually — just continue with your task after the CAPTCHA is resolved.
- If the page is not fully loaded, use the wait action.
- You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
@@ -81,7 +81,7 @@ Strictly follow these rules while using the browser and navigating the web:
1. Very specific step by step instructions:
- Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search.
- If you get stuck e.g. with logins in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search. CAPTCHAs are handled automatically.
- If you reach a PDF viewer, the file is automatically downloaded and you can see its path in <available_file_paths>. You can either read the file or scroll in the page to see more.
- Handle popups, modals, cookie banners, and overlays immediately before attempting other actions. Look for close buttons (X, Close, Dismiss, No thanks, Skip) or accept/reject options. If a popup blocks interaction with the main page, handle it first.
- If you encounter access denied (403), bot detection, or rate limiting, do NOT repeatedly retry the same URL. Try alternative approaches or report the limitation.
@@ -130,9 +130,9 @@ BEFORE calling `done` with `success=true`, you MUST perform this verification:
3. **Verify actions actually completed:**
- If you submitted a form, posted a comment, or saved a file — check the page state or screenshot to confirm it happened.
- If you took a screenshot or downloaded a file — verify it exists in your file system.
4. **Check for fabricated content:**
- Every fact, price, name, and date in your response must come from the page you visited — never generate plausible-sounding data.
5. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
4. **Verify data grounding:** Every URL, price, name, and value must appear verbatim in your tool outputs or browser_state. Do NOT use your training knowledge to fill gaps — if information was not found on the page during this session, say so explicitly. Never fabricate or invent values.
5. **Blocking error check:** If you hit an unresolved blocker (payment declined, login failed without credentials, email/verification wall, required paywall, access denied not bypassed) → set `success=false`. Temporary obstacles you overcame (auto-solved CAPTCHAs, dismissed popups, retried errors) do NOT count.
6. **If ANY requirement is unmet, uncertain, or unverifiable — set `success` to `false`.**
Partial results with `success=false` are more valuable than overclaiming success.
</pre_done_verification>
</task_completion_rules>
@@ -224,7 +224,7 @@ Action list should NEVER be empty.
3. ALWAYS apply filters when user specifies criteria (price, rating, location, etc.)
4. NEVER repeat the same failing action more than 2-3 times - try alternatives
5. NEVER assume success - always verify from screenshot or browser state
6. If blocked by captcha/login/403, try alternative approaches rather than retrying
6. CAPTCHAs are solved automatically. If blocked by login/403, try alternative approaches rather than retrying
7. Put ALL relevant findings in done action's text field
8. Match user's requested output format exactly
9. Track progress in memory to avoid loops
@@ -238,7 +238,7 @@ When encountering errors or unexpected states:
2. Check if a popup, modal, or overlay is blocking interaction
3. If an element is not found, scroll to reveal more content
4. If an action fails repeatedly (2-3 times), try an alternative approach
5. If blocked by login/captcha/403, consider alternative sites or search engines
5. If blocked by login/403, consider alternative sites or search engines. CAPTCHAs are solved automatically.
6. If the page structure is different than expected, re-analyze and adapt
7. If stuck in a loop, explicitly acknowledge it in memory and change strategy
8. If max_steps is approaching, prioritize completing the most important parts of the task

View File

@@ -27,6 +27,7 @@ from browser_use.filesystem.file_system import FileSystemState
from browser_use.llm.base import BaseChatModel
from browser_use.tokens.views import UsageSummary
from browser_use.tools.registry.views import ActionModel
from browser_use.utils import collect_sensitive_data_values, redact_sensitive_string
logger = logging.getLogger(__name__)
@@ -35,7 +36,7 @@ class MessageCompactionSettings(BaseModel):
"""Summarizes older history into a compact memory block to reduce prompt size."""
enabled: bool = True
compact_every_n_steps: int = 15
compact_every_n_steps: int = 25
trigger_char_count: int | None = None # Min char floor; set via trigger_token_count if preferred
trigger_token_count: int | None = None # Alternative to trigger_char_count (~4 chars/token)
chars_per_token: float = 4.0
@@ -88,6 +89,7 @@ class AgentSettings(BaseModel):
# Loop detection settings
loop_detection_window: int = 20 # Rolling window size for action similarity tracking
loop_detection_enabled: bool = True # Whether to enable loop detection nudges
max_clickable_elements_length: int = 40000 # Max characters for clickable elements in prompt
class PageFingerprint(BaseModel):
@@ -302,13 +304,6 @@ class JudgementResult(BaseModel):
)
class SimpleJudgeResult(BaseModel):
"""Result of lightweight always-on judge that validates agent success claims."""
is_correct: bool = Field(description='True if the agent response genuinely satisfies the task requirements')
reason: str = Field(default='', description='Brief explanation if not correct')
class ActionResult(BaseModel):
"""Result of executing an action"""
@@ -518,29 +513,13 @@ class AgentHistory(BaseModel):
if not sensitive_data:
return value
# Collect all sensitive values, immediately converting old format to new format
sensitive_values: dict[str, str] = {}
# Process all sensitive data entries
for key_or_domain, content in sensitive_data.items():
if isinstance(content, dict):
# Already in new format: {domain: {key: value}}
for key, val in content.items():
if val: # Skip empty values
sensitive_values[key] = val
elif content: # Old format: {key: value} - convert to new format internally
# We treat this as if it was {'http*://*': {key_or_domain: content}}
sensitive_values[key_or_domain] = content
sensitive_values = collect_sensitive_data_values(sensitive_data)
# If there are no valid sensitive data entries, just return the original value
if not sensitive_values:
return value
# Replace all valid sensitive data values with their placeholder tags
for key, val in sensitive_values.items():
value = value.replace(val, f'<secret>{key}</secret>')
return value
return redact_sensitive_string(value, sensitive_values)
def _filter_sensitive_data_from_dict(
self, data: dict[str, Any], sensitive_data: dict[str, str | dict[str, str]] | None
@@ -651,7 +630,7 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
Path(filepath).parent.mkdir(parents=True, exist_ok=True)
data = self.model_dump(sensitive_data=sensitive_data)
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2)
json.dump(data, f, indent=2, ensure_ascii=False)
except Exception as e:
raise e
@@ -696,14 +675,18 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
@classmethod
def load_from_dict(cls, data: dict[str, Any], output_model: type[AgentOutput]) -> AgentHistoryList:
# loop through history and validate output_model actions to enrich with custom actions
for h in data['history']:
if h['model_output']:
if isinstance(h['model_output'], dict):
h['model_output'] = output_model.model_validate(h['model_output'])
for h in data.get('history', []):
# Use .get() to avoid KeyError on incomplete or legacy history entries
model_output = h.get('model_output')
if model_output:
if isinstance(model_output, dict):
h['model_output'] = output_model.model_validate(model_output)
else:
h['model_output'] = None
if 'interacted_element' not in h['state']:
h['state']['interacted_element'] = None
state = h.get('state') or {}
if 'interacted_element' not in state:
state['interacted_element'] = None
h['state'] = state
history = cls.model_validate(data)
return history
@@ -733,8 +716,10 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
def final_result(self) -> None | str:
"""Final result from history"""
if self.history and self.history[-1].result[-1].extracted_content:
return self.history[-1].result[-1].extracted_content
if self.history and len(self.history[-1].result) > 0:
last_result = self.history[-1].result[-1]
if last_result.extracted_content:
return last_result.extracted_content
return None
def is_done(self) -> bool:

View File

@@ -50,7 +50,8 @@ class CloudBrowserClient:
if not api_token:
raise CloudBrowserAuthError(
'No authentication token found. Please set BROWSER_USE_API_KEY environment variable to authenticate with the cloud service. You can also create an API key at https://cloud.browser-use.com/new-api-key'
'BROWSER_USE_API_KEY is not set. To use cloud browsers, get a key at:\n'
'https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=use_cloud'
)
headers = {'X-Browser-Use-API-Key': api_token, 'Content-Type': 'application/json', **(extra_headers or {})}
@@ -65,7 +66,8 @@ class CloudBrowserClient:
if response.status_code == 401:
raise CloudBrowserAuthError(
'Authentication failed. Please make sure you have set BROWSER_USE_API_KEY environment variable to authenticate with the cloud service. You can also create an API key at https://cloud.browser-use.com/new-api-key'
'BROWSER_USE_API_KEY is invalid. Get a new key at:\n'
'https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=use_cloud'
)
elif response.status_code == 403:
raise CloudBrowserAuthError('Access forbidden. Please check your browser-use cloud subscription status.')
@@ -137,7 +139,8 @@ class CloudBrowserClient:
if not api_token:
raise CloudBrowserAuthError(
'No authentication token found. Please set BROWSER_USE_API_KEY environment variable to authenticate with the cloud service. You can also create an API key at https://cloud.browser-use.com/new-api-key'
'BROWSER_USE_API_KEY is not set. To use cloud browsers, get a key at:\n'
'https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=use_cloud'
)
headers = {'X-Browser-Use-API-Key': api_token, 'Content-Type': 'application/json', **(extra_headers or {})}
@@ -192,7 +195,10 @@ class CloudBrowserClient:
raise CloudBrowserError(f'Unexpected error stopping cloud browser: {e}')
async def close(self):
"""Close the HTTP client and cleanup any active sessions."""
"""Close the HTTP client and cleanup any active sessions.
Safe to call multiple times — subsequent calls are no-ops.
"""
# Try to stop current session if active
if self.current_session_id:
try:
@@ -200,4 +206,5 @@ class CloudBrowserClient:
except Exception as e:
logger.debug(f'Failed to stop cloud browser session during cleanup: {e}')
await self.client.aclose()
if not self.client.is_closed:
await self.client.aclose()

View File

@@ -59,6 +59,13 @@ class CreateBrowserRequest(BaseModel):
title='Cloud Timeout',
)
enable_recording: bool = Field(
default=False,
alias='enableRecording',
description='Enable session recording for playback in the cloud dashboard.',
title='Enable Recording',
)
CloudBrowserParams = CreateBrowserRequest # alias for easier readability

View File

@@ -119,7 +119,7 @@ class NavigateToUrlEvent(BaseEvent[None]):
# existing_tab: PageHandle | None = None # TODO
# time limits enforced by bubus, not exposed to LLM:
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_NavigateToUrlEvent', 15.0)) # seconds
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_NavigateToUrlEvent', 30.0)) # seconds
class ClickElementEvent(ElementSelectedEvent[dict[str, Any] | None]):
@@ -406,7 +406,7 @@ class TabClosedEvent(BaseEvent):
# new_focus_target_id: int | None = None
# new_focus_url: str | None = None
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_TabClosedEvent', 10.0)) # seconds
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_TabClosedEvent', 3.0)) # seconds
# TODO: emit this when DOM changes significantly, inner frame navigates, form submits, history.pushState(), etc.
@@ -471,6 +471,26 @@ class BrowserErrorEvent(BaseEvent):
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_BrowserErrorEvent', 30.0)) # seconds
class BrowserReconnectingEvent(BaseEvent):
"""WebSocket reconnection attempt is starting."""
cdp_url: str
attempt: int
max_attempts: int
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_BrowserReconnectingEvent', 30.0)) # seconds
class BrowserReconnectedEvent(BaseEvent):
"""WebSocket reconnection succeeded."""
cdp_url: str
attempt: int
downtime_seconds: float
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_BrowserReconnectedEvent', 30.0)) # seconds
# ============================================================================
# Storage State Events
# ============================================================================
@@ -576,6 +596,42 @@ class DialogOpenedEvent(BaseEvent):
# target_id: TargetID # TODO: add this to avoid needing target_id_from_frame() later
# ============================================================================
# Captcha Solver Events
# ============================================================================
class CaptchaSolverStartedEvent(BaseEvent):
"""Captcha solving started by the browser proxy.
Emitted when the browser proxy detects a CAPTCHA and begins solving it.
The agent should wait for a corresponding CaptchaSolverFinishedEvent before proceeding.
"""
target_id: TargetID
vendor: str # e.g. 'cloudflare', 'recaptcha', 'hcaptcha', 'datadome', 'perimeterx', 'geetest'
url: str
started_at: int # Unix millis
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_CaptchaSolverStartedEvent', 5.0))
class CaptchaSolverFinishedEvent(BaseEvent):
"""Captcha solving finished by the browser proxy.
Emitted when the browser proxy finishes solving a CAPTCHA (successfully or not).
"""
target_id: TargetID
vendor: str
url: str
duration_ms: int
finished_at: int # Unix millis
success: bool # Whether the captcha was solved successfully
event_timeout: float | None = Field(default_factory=lambda: _get_timeout('TIMEOUT_CaptchaSolverFinishedEvent', 5.0))
# Note: Model rebuilding for forward references is handled in the importing modules
# Events with 'EnhancedDOMTreeNode' forward references (ClickElementEvent, TypeTextEvent,
# ScrollEvent, UploadFileEvent) need model_rebuild() called after imports are complete

View File

@@ -124,7 +124,7 @@ CHROME_DEFAULT_ARGS = [
'--disable-back-forward-cache', # Avoids surprises like main request not being intercepted during page.goBack().
'--disable-breakpad',
'--disable-client-side-phishing-detection',
'--disable-component-extensions-with-background-pages',
# '--disable-component-extensions-with-background-pages', # kills user-loaded extensions on Chrome 145+
'--disable-component-update', # Avoids unneeded network activity after startup.
'--no-default-browser-check',
# '--disable-default-apps',
@@ -150,7 +150,7 @@ CHROME_DEFAULT_ARGS = [
# added by us:
'--enable-features=NetworkService,NetworkServiceInProcess',
'--enable-network-information-downlink-max',
'--test-type=gpu',
# '--test-type=gpu', # blocks unpacked extension loading on Chrome 145+
'--disable-sync',
'--allow-legacy-extension-manifests',
'--allow-pre-commit-input',
@@ -430,14 +430,14 @@ class BrowserLaunchArgs(BaseModel):
if self.downloads_path is None:
import uuid
# Create unique directory in /tmp for downloads
# Create unique directory in system temp folder for downloads
unique_id = str(uuid.uuid4())[:8] # 8 characters
downloads_path = Path(f'/tmp/browser-use-downloads-{unique_id}')
downloads_path = Path(tempfile.gettempdir()) / f'browser-use-downloads-{unique_id}'
# Ensure path doesn't already exist (extremely unlikely but possible)
while downloads_path.exists():
unique_id = str(uuid.uuid4())[:8]
downloads_path = Path(f'/tmp/browser-use-downloads-{unique_id}')
downloads_path = Path(tempfile.gettempdir()) / f'browser-use-downloads-{unique_id}'
self.downloads_path = downloads_path
self.downloads_path.mkdir(parents=True, exist_ok=True)
@@ -602,6 +602,10 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
default_factory=_get_enable_default_extensions_default,
description="Enable automation-optimized extensions: ad blocking (uBlock Origin), cookie handling (I still don't care about cookies), and URL cleaning (ClearURLs). All extensions work automatically without manual intervention. Extensions are automatically downloaded and loaded when enabled. Can be disabled via BROWSER_USE_DISABLE_EXTENSIONS=1 environment variable.",
)
captcha_solver: bool = Field(
default=True,
description='Enable the captcha solver watchdog that listens for captcha events from the browser proxy. Automatically pauses agent steps while a CAPTCHA is being solved. Only active when the browser emits BrowserUse CDP events (e.g. Browser Use cloud browsers). Harmless when disabled or when events are not emitted.',
)
demo_mode: bool = Field(
default=False,
description='Enable demo mode side panel that streams agent logs directly inside the browser window (requires headless=False).',
@@ -933,6 +937,25 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
return args
@staticmethod
def _check_extension_manifest_version(ext_dir: Path, ext_name: str) -> bool:
"""Check that an extension uses Manifest V3. Returns False for MV2 extensions (unsupported by Chrome 145+)."""
import json
manifest_path = ext_dir / 'manifest.json'
if not manifest_path.exists():
return False
try:
with open(manifest_path, encoding='utf-8') as f:
manifest = json.load(f)
mv = manifest.get('manifest_version', 2)
if mv < 3:
logger.warning(f'Skipping {ext_name} extension: Manifest V{mv} is no longer supported by Chrome')
return False
return True
except Exception:
return False
def _ensure_default_extensions_downloaded(self) -> list[str]:
"""
Ensure default extensions are downloaded and cached locally.
@@ -940,23 +963,18 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
"""
# Extension definitions - optimized for automation and content extraction
# Combines uBlock Origin (ad blocking) + "I still don't care about cookies" (cookie banner handling)
# uBlock Origin Lite (ad blocking, MV3) + "I still don't care about cookies" (cookie banner handling)
extensions = [
{
'name': 'uBlock Origin',
'id': 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=133&acceptformat=crx3&x=id%3Dcjpalhdlnbpafiamejdnhcphjbkeiagm%26uc',
'name': 'uBlock Origin Lite',
'id': 'ddkjiahejlhfcafbddmgiahcphecmpfh',
'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=133&acceptformat=crx3&x=id%3Dddkjiahejlhfcafbddmgiahcphecmpfh%26uc',
},
{
'name': "I still don't care about cookies",
'id': 'edibdbjcniadpccecjdfdjjppcpchdlm',
'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=133&acceptformat=crx3&x=id%3Dedibdbjcniadpccecjdfdjjppcpchdlm%26uc',
},
{
'name': 'ClearURLs',
'id': 'lckanjgmijmafbedllaakclkaicjfmnk',
'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=133&acceptformat=crx3&x=id%3Dlckanjgmijmafbedllaakclkaicjfmnk%26uc',
},
{
'name': 'Force Background Tab',
'id': 'gidlfommnbibbmegmgajdbikelkdcmcl',
@@ -994,7 +1012,8 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
# Check if extension is already extracted
if ext_dir.exists() and (ext_dir / 'manifest.json').exists():
# logger.debug(f'✅ Using cached {ext["name"]} extension from {_log_pretty_path(ext_dir)}')
if not self._check_extension_manifest_version(ext_dir, ext['name']):
continue
extension_paths.append(str(ext_dir))
loaded_extension_names.append(ext['name'])
continue
@@ -1011,6 +1030,9 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
logger.info(f'📂 Extracting {ext["name"]} extension...')
self._extract_extension(crx_file, ext_dir)
if not self._check_extension_manifest_version(ext_dir, ext['name']):
continue
extension_paths.append(str(ext_dir))
loaded_extension_names.append(ext['name'])
@@ -1149,7 +1171,6 @@ async function initialize(checkInitialized, magic) {{
zip_data = f.read()
# Write ZIP data to temp file and extract
import tempfile
with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip:
temp_zip.write(zip_data)

File diff suppressed because it is too large Load Diff

View File

@@ -401,6 +401,8 @@ class SessionManager:
if '-32001' not in error_str and 'Session with given id not found' not in error_str:
self.logger.debug(f'[SessionManager] Auto-attach failed for {target_type}: {e}')
from browser_use.browser.session import Target
async with self._lock:
# Track this session for the target
if target_id not in self._target_sessions:
@@ -409,23 +411,22 @@ class SessionManager:
self._target_sessions[target_id].add(session_id)
self._session_to_target[session_id] = target_id
# Create or update Target (source of truth for url/title)
if target_id not in self._targets:
from browser_use.browser.session import Target
target = Target(
target_id=target_id,
target_type=target_type,
url=target_info.get('url', 'about:blank'),
title=target_info.get('title', 'Unknown title'),
)
self._targets[target_id] = target
self.logger.debug(f'[SessionManager] Created target {target_id[:8]}... (type={target_type})')
else:
# Update existing target info
existing_target = self._targets[target_id]
existing_target.url = target_info.get('url', existing_target.url)
existing_target.title = target_info.get('title', existing_target.title)
# Create or update Target inside the same lock so that get_target() is never
# called in the window between _target_sessions being set and _targets being set.
if target_id not in self._targets:
target = Target(
target_id=target_id,
target_type=target_type,
url=target_info.get('url', 'about:blank'),
title=target_info.get('title', 'Unknown title'),
)
self._targets[target_id] = target
self.logger.debug(f'[SessionManager] Created target {target_id[:8]}... (type={target_type})')
else:
# Update existing target info
existing_target = self._targets[target_id]
existing_target.url = target_info.get('url', existing_target.url)
existing_target.title = target_info.get('title', existing_target.title)
# Create CDPSession (communication channel)
from browser_use.browser.session import CDPSession
@@ -441,6 +442,21 @@ class SessionManager:
# Add to sessions dict
self._sessions[session_id] = cdp_session
# If proxy auth is configured, enable Fetch auth handling on this session
# Avoids overwriting Target.attachedToTarget handlers elsewhere
try:
proxy_cfg = self.browser_session.browser_profile.proxy
username = proxy_cfg.username if proxy_cfg else None
password = proxy_cfg.password if proxy_cfg else None
if username and password:
await cdp_session.cdp_client.send.Fetch.enable(
params={'handleAuthRequests': True},
session_id=cdp_session.session_id,
)
self.logger.debug(f'[SessionManager] Fetch.enable(handleAuthRequests=True) on session {session_id[:8]}...')
except Exception as e:
self.logger.debug(f'[SessionManager] Fetch.enable on attached session failed: {type(e).__name__}: {e}')
self.logger.debug(
f'[SessionManager] Created session {session_id[:8]}... for target {target_id[:8]}... '
f'(total sessions: {len(self._sessions)})'

View File

@@ -1,5 +1,6 @@
"""Base watchdog class for browser monitoring components."""
import asyncio
import inspect
import time
from collections.abc import Iterable
@@ -73,10 +74,54 @@ class BaseWatchdog(BaseModel):
watchdog_instance = getattr(handler, '__self__', None)
watchdog_class_name = watchdog_instance.__class__.__name__ if watchdog_instance else 'Unknown'
# Events that should always run even when CDP is disconnected (lifecycle management)
LIFECYCLE_EVENT_NAMES = frozenset(
{
'BrowserStartEvent',
'BrowserStopEvent',
'BrowserStoppedEvent',
'BrowserLaunchEvent',
'BrowserErrorEvent',
'BrowserKillEvent',
'BrowserReconnectingEvent',
'BrowserReconnectedEvent',
}
)
# Create a wrapper function with unique name to avoid duplicate handler warnings
# Capture handler by value to avoid closure issues
def make_unique_handler(actual_handler):
async def unique_handler(event):
# Circuit breaker: skip handler if CDP WebSocket is dead
# (prevents handlers from hanging on broken connections until timeout)
# Lifecycle events are exempt — they manage browser start/stop
if event.event_type not in LIFECYCLE_EVENT_NAMES and not browser_session.is_cdp_connected:
# If reconnection is in progress, wait for it instead of silently skipping
if browser_session.is_reconnecting:
wait_timeout = browser_session.RECONNECT_WAIT_TIMEOUT
browser_session.logger.debug(
f'🚌 [{watchdog_class_name}.{actual_handler.__name__}] ⏳ Waiting for reconnection ({wait_timeout}s)...'
)
try:
await asyncio.wait_for(browser_session._reconnect_event.wait(), timeout=wait_timeout)
except TimeoutError:
raise ConnectionError(
f'[{watchdog_class_name}.{actual_handler.__name__}] '
f'Reconnection wait timed out after {wait_timeout}s'
)
# After wait: check if reconnection actually succeeded
if not browser_session.is_cdp_connected:
raise ConnectionError(
f'[{watchdog_class_name}.{actual_handler.__name__}] Reconnection failed — CDP still not connected'
)
# Reconnection succeeded — fall through to execute handler normally
else:
# Not reconnecting — intentional stop, backward compat silent skip
browser_session.logger.debug(
f'🚌 [{watchdog_class_name}.{actual_handler.__name__}] ⚡ Skipped — CDP not connected'
)
return None
# just for debug logging, not used for anything else
parent_event = event_bus.event_history.get(event.event_parent_id) if event.event_parent_id else None
grandparent_event = (

View File

@@ -59,11 +59,14 @@ class AboutBlankWatchdog(BaseWatchdog):
async def on_TabClosedEvent(self, event: TabClosedEvent) -> None:
"""Check tabs when a tab is closed and proactively create about:blank if needed."""
# logger.debug('[AboutBlankWatchdog] Tab closing, checking if we need to create about:blank tab')
# Don't create new tabs if browser is shutting down
if self._stopping:
# logger.debug('[AboutBlankWatchdog] Browser is stopping, not creating new tabs')
return
# Don't attempt CDP operations if the WebSocket is dead — dispatching
# NavigateToUrlEvent on a broken connection will hang until timeout
if not self.browser_session.is_cdp_connected:
self.logger.debug('[AboutBlankWatchdog] CDP not connected, skipping tab recovery')
return
# Check if we're about to close the last tab (event happens BEFORE tab closes)
@@ -89,6 +92,9 @@ class AboutBlankWatchdog(BaseWatchdog):
async def _check_and_ensure_about_blank_tab(self) -> None:
"""Check current tabs and ensure exactly one about:blank tab with animation exists."""
try:
if not self.browser_session.is_cdp_connected:
return
# For quick checks, just get page targets without titles to reduce noise
page_targets = await self.browser_session._cdp_get_all_pages()

View File

@@ -0,0 +1,207 @@
"""Captcha solver watchdog — monitors captcha events from the browser proxy.
Listens for BrowserUse.captchaSolverStarted/Finished CDP events and exposes a
wait_if_captcha_solving() method that the agent step loop uses to block until
a captcha is resolved (with a configurable timeout).
NOTE: Only a single captcha solve is tracked at a time. If multiple captchas
overlap (e.g. rapid successive navigations), only the latest one is tracked and
earlier in-flight waits may return prematurely.
"""
import asyncio
from dataclasses import dataclass
from typing import Any, ClassVar, Literal
from bubus import BaseEvent
from cdp_use.cdp.browseruse.events import CaptchaSolverFinishedEvent as CDPCaptchaSolverFinishedEvent
from cdp_use.cdp.browseruse.events import CaptchaSolverStartedEvent as CDPCaptchaSolverStartedEvent
from pydantic import PrivateAttr
from browser_use.browser.events import (
BrowserConnectedEvent,
BrowserStoppedEvent,
CaptchaSolverFinishedEvent,
CaptchaSolverStartedEvent,
_get_timeout,
)
from browser_use.browser.watchdog_base import BaseWatchdog
CaptchaResultType = Literal['success', 'failed', 'timeout', 'unknown']
@dataclass
class CaptchaWaitResult:
"""Result returned by wait_if_captcha_solving() when the agent had to wait."""
waited: bool
vendor: str
url: str
duration_ms: int
result: CaptchaResultType
class CaptchaWatchdog(BaseWatchdog):
"""Monitors captcha solver events from the browser proxy.
When the proxy detects a CAPTCHA and starts solving it, a CDP event
``BrowserUse.captchaSolverStarted`` is sent over the WebSocket. This
watchdog catches that event and blocks the agent's step loop (via
``wait_if_captcha_solving``) until ``BrowserUse.captchaSolverFinished``
arrives or the configurable timeout expires.
"""
# Event contracts
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [
BrowserConnectedEvent,
BrowserStoppedEvent,
]
EMITS: ClassVar[list[type[BaseEvent]]] = [
CaptchaSolverStartedEvent,
CaptchaSolverFinishedEvent,
]
# --- private state ---
_captcha_solving: bool = PrivateAttr(default=False)
_captcha_solved_event: asyncio.Event = PrivateAttr(default_factory=asyncio.Event)
_captcha_info: dict[str, Any] = PrivateAttr(default_factory=dict)
_captcha_result: CaptchaResultType = PrivateAttr(default='unknown')
_captcha_duration_ms: int = PrivateAttr(default=0)
_cdp_handlers_registered: bool = PrivateAttr(default=False)
def model_post_init(self, __context: Any) -> None:
# Start in "not blocked" state so callers never wait when there is no captcha.
self._captcha_solved_event.set()
# ------------------------------------------------------------------
# Event handlers
# ------------------------------------------------------------------
async def on_BrowserConnectedEvent(self, event: BrowserConnectedEvent) -> None:
"""Register CDP event handlers for BrowserUse captcha solver events."""
if self._cdp_handlers_registered:
self.logger.debug('CaptchaWatchdog: CDP handlers already registered, skipping')
return
cdp_client = self.browser_session.cdp_client
def _on_captcha_started(event_data: CDPCaptchaSolverStartedEvent, session_id: str | None) -> None:
try:
self._captcha_solving = True
self._captcha_result = 'unknown'
self._captcha_duration_ms = 0
self._captcha_info = {
'vendor': event_data.get('vendor', 'unknown'),
'url': event_data.get('url', ''),
'targetId': event_data.get('targetId', ''),
'startedAt': event_data.get('startedAt', 0),
}
# Block any waiter
self._captcha_solved_event.clear()
vendor = self._captcha_info['vendor']
url = self._captcha_info['url']
self.logger.info(f'🔒 Captcha solving started: {vendor} on {url}')
self.event_bus.dispatch(
CaptchaSolverStartedEvent(
target_id=event_data.get('targetId', ''),
vendor=vendor,
url=url,
started_at=event_data.get('startedAt', 0),
)
)
except Exception:
self.logger.exception('Error handling captchaSolverStarted CDP event')
# Ensure consistent state: unblock any waiter
self._captcha_solving = False
self._captcha_solved_event.set()
def _on_captcha_finished(event_data: CDPCaptchaSolverFinishedEvent, session_id: str | None) -> None:
try:
success = event_data.get('success', False)
self._captcha_solving = False
self._captcha_duration_ms = event_data.get('durationMs', 0)
self._captcha_result = 'success' if success else 'failed'
vendor = event_data.get('vendor', self._captcha_info.get('vendor', 'unknown'))
url = event_data.get('url', self._captcha_info.get('url', ''))
duration_s = self._captcha_duration_ms / 1000
self.logger.info(f'🔓 Captcha solving finished: {self._captcha_result}{vendor} on {url} ({duration_s:.1f}s)')
# Unblock any waiter
self._captcha_solved_event.set()
self.event_bus.dispatch(
CaptchaSolverFinishedEvent(
target_id=event_data.get('targetId', ''),
vendor=vendor,
url=url,
duration_ms=self._captcha_duration_ms,
finished_at=event_data.get('finishedAt', 0),
success=success,
)
)
except Exception:
self.logger.exception('Error handling captchaSolverFinished CDP event')
# Ensure consistent state: unblock any waiter
self._captcha_solving = False
self._captcha_solved_event.set()
cdp_client.register.BrowserUse.captchaSolverStarted(_on_captcha_started)
cdp_client.register.BrowserUse.captchaSolverFinished(_on_captcha_finished)
self._cdp_handlers_registered = True
self.logger.debug('🔒 CaptchaWatchdog: registered CDP event handlers for BrowserUse captcha events')
async def on_BrowserStoppedEvent(self, event: BrowserStoppedEvent) -> None:
"""Clear captcha state when the browser disconnects so nothing hangs."""
self._captcha_solving = False
self._captcha_result = 'unknown'
self._captcha_duration_ms = 0
self._captcha_info = {}
self._captcha_solved_event.set()
self._cdp_handlers_registered = False
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def wait_if_captcha_solving(self, timeout: float | None = None) -> CaptchaWaitResult | None:
"""Wait if a captcha is currently being solved.
Returns:
``None`` if no captcha was in progress.
A ``CaptchaWaitResult`` with the outcome otherwise.
"""
if not self._captcha_solving:
return None
if timeout is None:
timeout = _get_timeout('TIMEOUT_CaptchaSolverWait', 120.0)
assert timeout is not None
vendor = self._captcha_info.get('vendor', 'unknown')
url = self._captcha_info.get('url', '')
self.logger.info(f'⏳ Waiting for {vendor} captcha to be solved on {url} (timeout={timeout}s)...')
try:
await asyncio.wait_for(self._captcha_solved_event.wait(), timeout=timeout)
return CaptchaWaitResult(
waited=True,
vendor=vendor,
url=url,
duration_ms=self._captcha_duration_ms,
result=self._captcha_result,
)
except TimeoutError:
# Timed out — unblock and report
self._captcha_solving = False
self._captcha_solved_event.set()
self.logger.warning(f'⏰ Captcha wait timed out after {timeout}s for {vendor} on {url}')
return CaptchaWaitResult(
waited=True,
vendor=vendor,
url=url,
duration_ms=int(timeout * 1000),
result='timeout',
)

View File

@@ -518,6 +518,11 @@ class DefaultActionWatchdog(BaseWatchdog):
raise BrowserError(error_msg)
try:
def invalidate_dom_cache() -> None:
if self.browser_session._dom_watchdog:
self.browser_session._dom_watchdog.clear_cache()
# Convert direction and amount to pixels
# Positive pixels = scroll down, negative = scroll up
pixels = event.amount if event.direction == 'down' else -event.amount
@@ -547,6 +552,7 @@ class DefaultActionWatchdog(BaseWatchdog):
# Wait a bit for the scroll to settle and DOM to update
await asyncio.sleep(0.2)
invalidate_dom_cache()
return None
# Perform target-level scroll
@@ -554,6 +560,7 @@ class DefaultActionWatchdog(BaseWatchdog):
# Note: We don't clear cached state here - let multi_act handle DOM change detection
# by explicitly rebuilding and comparing when needed
invalidate_dom_cache()
# Log success
self.logger.debug(f'📜 Scrolled {event.direction} by {event.amount} pixels')
@@ -612,10 +619,48 @@ class DefaultActionWatchdog(BaseWatchdog):
// Simple containment-based clickability logic
const isClickable = this === elementAtPoint ||
let isClickable = this === elementAtPoint ||
this.contains(elementAtPoint) ||
elementAtPoint.contains(this);
// Check label-input associations when containment check fails
if (!isClickable) {
const target = this;
const atPoint = elementAtPoint;
// Case 1: target is <input>, atPoint is its associated <label> (or child of that label)
if (target.tagName === 'INPUT' && target.id) {
const escapedId = CSS.escape(target.id);
const assocLabel = document.querySelector('label[for="' + escapedId + '"]');
if (assocLabel && (assocLabel === atPoint || assocLabel.contains(atPoint))) {
isClickable = true;
}
}
// Case 2: target is <input>, atPoint is inside a <label> ancestor that wraps the target
if (!isClickable && target.tagName === 'INPUT') {
let ancestor = atPoint;
for (let i = 0; i < 3 && ancestor; i++) {
if (ancestor.tagName === 'LABEL' && ancestor.contains(target)) {
isClickable = true;
break;
}
ancestor = ancestor.parentElement;
}
}
// Case 3: target is <label>, atPoint is the associated <input>
if (!isClickable && target.tagName === 'LABEL') {
if (target.htmlFor && atPoint.tagName === 'INPUT' && atPoint.id === target.htmlFor) {
isClickable = true;
}
// Also check if atPoint is an input inside the label
if (!isClickable && atPoint.tagName === 'INPUT' && target.contains(atPoint)) {
isClickable = true;
}
}
}
return {
targetInfo: getElementInfo(this),
elementAtPointInfo: getElementInfo(elementAtPoint),
@@ -686,6 +731,32 @@ class DefaultActionWatchdog(BaseWatchdog):
# Get element bounds
backend_node_id = element_node.backend_node_id
# For checkbox/radio: capture pre-click state to verify toggle worked
is_toggle_element = tag_name == 'input' and element_type in ('checkbox', 'radio')
pre_click_checked: bool | None = None
checkbox_object_id: str | None = None
if is_toggle_element and backend_node_id:
try:
resolve_res = await cdp_session.cdp_client.send.DOM.resolveNode(
params={'backendNodeId': backend_node_id}, session_id=session_id
)
obj_info = resolve_res.get('object', {})
checkbox_object_id = obj_info.get('objectId') if obj_info else None
if not checkbox_object_id:
raise Exception('Failed to resolve checkbox element objectId')
state_res = await cdp_session.cdp_client.send.Runtime.callFunctionOn(
params={
'functionDeclaration': 'function() { return this.checked; }',
'objectId': checkbox_object_id,
'returnByValue': True,
},
session_id=session_id,
)
pre_click_checked = state_res.get('result', {}).get('value')
self.logger.debug(f'Checkbox pre-click state: checked={pre_click_checked}')
except Exception as e:
self.logger.debug(f'Could not capture pre-click checkbox state: {e}')
# Get viewport dimensions for visibility checks
layout_metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=session_id)
viewport_width = layout_metrics['layoutViewport']['clientWidth']
@@ -883,6 +954,43 @@ class DefaultActionWatchdog(BaseWatchdog):
self.logger.debug('🖱️ Clicked successfully using x,y coordinates')
# For checkbox/radio: verify state toggled, fall back to JS element.click() if not
if is_toggle_element and pre_click_checked is not None and checkbox_object_id:
try:
await asyncio.sleep(0.05)
state_res = await cdp_session.cdp_client.send.Runtime.callFunctionOn(
params={
'functionDeclaration': 'function() { return this.checked; }',
'objectId': checkbox_object_id,
'returnByValue': True,
},
session_id=session_id,
)
post_click_checked = state_res.get('result', {}).get('value')
if post_click_checked == pre_click_checked:
# CDP mouse events didn't toggle the checkbox — try JS element.click()
self.logger.debug(
f'Checkbox state unchanged after CDP click (checked={pre_click_checked}), using JS fallback'
)
await cdp_session.cdp_client.send.Runtime.callFunctionOn(
params={'functionDeclaration': 'function() { this.click(); }', 'objectId': checkbox_object_id},
session_id=session_id,
)
await asyncio.sleep(0.05)
final_res = await cdp_session.cdp_client.send.Runtime.callFunctionOn(
params={
'functionDeclaration': 'function() { return this.checked; }',
'objectId': checkbox_object_id,
'returnByValue': True,
},
session_id=session_id,
)
post_click_checked = final_res.get('result', {}).get('value')
self.logger.debug(f'Checkbox post-click state: checked={post_click_checked}')
return {'click_x': center_x, 'click_y': center_y, 'checked': post_click_checked}
except Exception as e:
self.logger.debug(f'Checkbox state verification failed (non-critical): {e}')
# Return coordinates as dict for metadata
return {'click_x': center_x, 'click_y': center_y}
@@ -1294,10 +1402,8 @@ class DefaultActionWatchdog(BaseWatchdog):
return True
else:
self.logger.debug(f'⚠️ JavaScript clear partially failed, field still contains: "{final_text}"')
return False
else:
self.logger.debug(f'❌ JavaScript clear failed: {clear_info.get("error", "Unknown error")}')
return False
except Exception as e:
self.logger.debug(f'JavaScript clear failed with exception: {e}')

View File

@@ -264,12 +264,16 @@ class DOMWatchdog(BaseWatchdog):
not_a_meaningful_website = page_url.lower().split(':', 1)[0] not in ('http', 'https')
# Check for pending network requests BEFORE waiting (so we can see what's loading)
# Timeout after 2s — on slow CI machines or heavy pages, this call can hang
# for 15s+ eating into the 30s BrowserStateRequestEvent budget.
pending_requests_before_wait = []
if not not_a_meaningful_website:
try:
pending_requests_before_wait = await self._get_pending_network_requests()
pending_requests_before_wait = await asyncio.wait_for(self._get_pending_network_requests(), timeout=2.0)
if pending_requests_before_wait:
self.logger.debug(f'🔍 Found {len(pending_requests_before_wait)} pending requests before stability wait')
except TimeoutError:
self.logger.debug('Pending network request check timed out (2s), skipping')
except Exception as e:
self.logger.debug(f'Failed to get pending requests before wait: {e}')
pending_requests = pending_requests_before_wait

View File

@@ -62,8 +62,8 @@ class DownloadsWatchdog(BaseWatchdog):
_download_cdp_session: Any = PrivateAttr(default=None) # Store CDP session reference
_cdp_event_tasks: set[asyncio.Task] = PrivateAttr(default_factory=set) # Track CDP event handler tasks
_cdp_downloads_info: dict[str, dict[str, Any]] = PrivateAttr(default_factory=dict) # Map guid -> info
_use_js_fetch_for_local: bool = PrivateAttr(default=False) # Guard JS fetch path for local regular downloads
_session_pdf_urls: dict[str, str] = PrivateAttr(default_factory=dict) # URL -> path for PDFs downloaded this session
_initial_downloads_snapshot: set[str] = PrivateAttr(default_factory=set) # Files present when watchdog started
_network_monitored_targets: set[str] = PrivateAttr(default_factory=set) # Track targets with network monitoring enabled
_detected_downloads: set[str] = PrivateAttr(default_factory=set) # Track detected download URLs to avoid duplicates
_network_callback_registered: bool = PrivateAttr(default=False) # Track if global network callback is registered
@@ -120,6 +120,15 @@ class DownloadsWatchdog(BaseWatchdog):
expanded_path.mkdir(parents=True, exist_ok=True)
self.logger.debug(f'[DownloadsWatchdog] Ensured downloads directory exists: {expanded_path}')
# Capture initial files to detect new downloads reliably
if expanded_path.exists():
for f in expanded_path.iterdir():
if f.is_file() and not f.name.startswith('.'):
self._initial_downloads_snapshot.add(f.name)
self.logger.debug(
f'[DownloadsWatchdog] Captured initial downloads: {len(self._initial_downloads_snapshot)} files'
)
async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:
"""Monitor new tabs for downloads."""
# logger.info(f'[DownloadsWatchdog] TabCreatedEvent received for tab {event.target_id[-4:]}: {event.url}')
@@ -192,6 +201,7 @@ class DownloadsWatchdog(BaseWatchdog):
self._session_pdf_urls.clear()
self._network_monitored_targets.clear()
self._detected_downloads.clear()
self._initial_downloads_snapshot.clear()
self._network_callback_registered = False
async def on_NavigationCompleteEvent(self, event: NavigationCompleteEvent) -> None:
@@ -326,10 +336,31 @@ class DownloadsWatchdog(BaseWatchdog):
except (KeyError, AttributeError):
pass
else:
# No local file path provided, local polling in _handle_cdp_download will handle it
self.logger.debug(
'[DownloadsWatchdog] No filePath in progress event (local); polling will handle detection'
)
# No filePath provided - detect by comparing with initial snapshot
self.logger.debug('[DownloadsWatchdog] No filePath in progress event; detecting via filesystem')
downloads_path = self.browser_session.browser_profile.downloads_path
if downloads_path:
downloads_dir = Path(downloads_path).expanduser().resolve()
if downloads_dir.exists():
for f in downloads_dir.iterdir():
if (
f.is_file()
and not f.name.startswith('.')
and f.name not in self._initial_downloads_snapshot
):
# Check file has content before processing
if f.stat().st_size > 4:
# Found a new file! Add to snapshot immediately to prevent duplicate detection
self._initial_downloads_snapshot.add(f.name)
self.logger.debug(f'[DownloadsWatchdog] Detected new download: {f.name}')
self._track_download(str(f))
# Mark as handled
try:
if guid in self._cdp_downloads_info:
self._cdp_downloads_info[guid]['handled'] = True
except (KeyError, AttributeError):
pass
break
else:
# Remote browser: do not touch local filesystem. Fallback to downloadPath+suggestedFilename
info = self._cdp_downloads_info.get(guid, {})
@@ -456,17 +487,24 @@ class DownloadsWatchdog(BaseWatchdog):
response = event.get('response', {})
url = response.get('url', '')
content_type = response.get('mimeType', '').lower()
headers = response.get('headers', {})
headers = {
k.lower(): v for k, v in response.get('headers', {}).items()
} # Normalize for case-insensitive lookup
request_type = event.get('type', '')
# Skip non-HTTP URLs (data:, about:, chrome-extension:, etc.)
if not url.startswith('http'):
return
# Skip fetch/XHR - real browsers don't download PDFs from programmatic requests
if request_type in ('Fetch', 'XHR'):
return
# Check if it's a PDF
is_pdf = 'application/pdf' in content_type
# Check if it's marked as download via Content-Disposition header
content_disposition = headers.get('content-disposition', '').lower()
content_disposition = str(headers.get('content-disposition', '')).lower()
is_download_attachment = 'attachment' in content_disposition
# Filter out image/video/audio files even if marked as attachment
@@ -518,6 +556,14 @@ class DownloadsWatchdog(BaseWatchdog):
if not (is_pdf or is_download_attachment):
return
# If already downloaded this URL and file still exists, do nothing
existing_path = self._session_pdf_urls.get(url)
if existing_path:
if os.path.exists(existing_path):
return
# Stale cache entry, allow re-download
del self._session_pdf_urls[url]
# Check if we've already processed this URL in this session
if url in self._detected_downloads:
self.logger.debug(f'[DownloadsWatchdog] Already detected download: {url[:80]}...')
@@ -543,6 +589,7 @@ class DownloadsWatchdog(BaseWatchdog):
# Trigger download asynchronously in background (don't block event handler)
async def download_in_background():
# Don't permanently block re-processing this URL if download fails
try:
download_path = await self.download_file_from_url(
url=url,
@@ -557,6 +604,9 @@ class DownloadsWatchdog(BaseWatchdog):
self.logger.warning(f'[DownloadsWatchdog] ⚠️ Failed to download: {url[:80]}...')
except Exception as e:
self.logger.error(f'[DownloadsWatchdog] Error downloading in background: {type(e).__name__}: {e}')
finally:
# Allow future detections of the same URL
self._detected_downloads.discard(url)
# Create background task
task = create_task_with_error_handling(
@@ -611,8 +661,13 @@ class DownloadsWatchdog(BaseWatchdog):
# Check if already downloaded in this session
if url in self._session_pdf_urls:
existing_path = self._session_pdf_urls[url]
self.logger.debug(f'[DownloadsWatchdog] File already downloaded in session: {existing_path}')
return existing_path
if os.path.exists(existing_path):
self.logger.debug(f'[DownloadsWatchdog] File already downloaded in session: {existing_path}')
return existing_path
# Stale cache entry: the file was removed/cleaned up after we cached it.
self.logger.debug(f'[DownloadsWatchdog] Cached download path no longer exists, re-downloading: {existing_path}')
del self._session_pdf_urls[url]
try:
# Get or create CDP session for this target
@@ -814,107 +869,6 @@ class DownloadsWatchdog(BaseWatchdog):
# We just need to wait for it to appear in the downloads directory
expected_path = downloads_dir / suggested_filename
# Debug: List current directory contents
self.logger.debug(f'[DownloadsWatchdog] Downloads directory: {downloads_dir}')
if downloads_dir.exists():
files_before = list(downloads_dir.iterdir())
self.logger.debug(f'[DownloadsWatchdog] Files before download: {[f.name for f in files_before]}')
# Try manual JavaScript fetch as a fallback for local browsers (disabled for regular local downloads)
if self.browser_session.is_local and self._use_js_fetch_for_local:
self.logger.debug(f'[DownloadsWatchdog] Attempting JS fetch fallback for {download_url}')
unique_filename = None
file_size = None
download_result = None
try:
# Escape the URL for JavaScript
import json
escaped_url = json.dumps(download_url)
# Get the proper session for the frame that initiated the download
cdp_session = await self.browser_session.cdp_client_for_frame(event.get('frameId'))
assert cdp_session
result = await cdp_session.cdp_client.send.Runtime.evaluate(
params={
'expression': f"""
(async () => {{
try {{
const response = await fetch({escaped_url});
if (!response.ok) {{
throw new Error(`HTTP error! status: ${{response.status}}`);
}}
const blob = await response.blob();
const arrayBuffer = await blob.arrayBuffer();
const uint8Array = new Uint8Array(arrayBuffer);
return {{
data: Array.from(uint8Array),
size: uint8Array.length,
contentType: response.headers.get('content-type') || 'application/octet-stream'
}};
}} catch (error) {{
throw new Error(`Fetch failed: ${{error.message}}`);
}}
}})()
""",
'awaitPromise': True,
'returnByValue': True,
},
session_id=cdp_session.session_id,
)
download_result = result.get('result', {}).get('value')
if download_result and download_result.get('data'):
# Save the file
file_data = bytes(download_result['data'])
file_size = len(file_data)
# Ensure unique filename
unique_filename = await self._get_unique_filename(str(downloads_dir), suggested_filename)
final_path = downloads_dir / unique_filename
# Write the file
import anyio
async with await anyio.open_file(final_path, 'wb') as f:
await f.write(file_data)
self.logger.debug(f'[DownloadsWatchdog] ✅ Downloaded and saved file: {final_path} ({file_size} bytes)')
expected_path = final_path
# Emit download event immediately
file_ext = expected_path.suffix.lower().lstrip('.')
file_type = file_ext if file_ext else None
self.event_bus.dispatch(
FileDownloadedEvent(
guid=guid,
url=download_url,
path=str(expected_path),
file_name=unique_filename or expected_path.name,
file_size=file_size or 0,
file_type=file_type,
mime_type=(download_result.get('contentType') if download_result else None),
from_cache=False,
auto_download=False,
)
)
# Mark as handled to prevent duplicate dispatch from progress/polling paths
try:
if guid in self._cdp_downloads_info:
self._cdp_downloads_info[guid]['handled'] = True
except (KeyError, AttributeError):
pass
self.logger.debug(
f'[DownloadsWatchdog] ✅ File download completed via CDP: {suggested_filename} ({file_size} bytes) saved to {expected_path}'
)
return
else:
self.logger.error('[DownloadsWatchdog] ❌ No data received from fetch')
except Exception as fetch_error:
self.logger.error(f'[DownloadsWatchdog] ❌ Failed to download file via fetch: {fetch_error}')
# For remote browsers, don't poll local filesystem; downloadProgress handler will emit the event
if not self.browser_session.is_local:
return
@@ -925,24 +879,23 @@ class DownloadsWatchdog(BaseWatchdog):
# Poll the downloads directory for new files
self.logger.debug(f'[DownloadsWatchdog] Checking if browser auto-download saved the file for us: {suggested_filename}')
# Get initial list of files in downloads directory
initial_files = set()
if Path(downloads_dir).exists():
for f in Path(downloads_dir).iterdir():
if f.is_file() and not f.name.startswith('.'):
initial_files.add(f.name)
# Poll for new files
max_wait = 20 # seconds
start_time = asyncio.get_event_loop().time()
while asyncio.get_event_loop().time() - start_time < max_wait:
while asyncio.get_event_loop().time() - start_time < max_wait: # noqa: ASYNC110
await asyncio.sleep(5.0) # Check every 5 seconds
if Path(downloads_dir).exists():
for file_path in Path(downloads_dir).iterdir():
# Skip hidden files and files that were already there
if file_path.is_file() and not file_path.name.startswith('.') and file_path.name not in initial_files:
if (
file_path.is_file()
and not file_path.name.startswith('.')
and file_path.name not in self._initial_downloads_snapshot
):
# Add to snapshot immediately to prevent duplicate detection
self._initial_downloads_snapshot.add(file_path.name)
# Check if file has content (> 4 bytes)
try:
file_size = file_path.stat().st_size
@@ -971,13 +924,13 @@ class DownloadsWatchdog(BaseWatchdog):
file_type=file_type,
)
)
# Mark as handled after dispatch
try:
if guid in self._cdp_downloads_info:
self._cdp_downloads_info[guid]['handled'] = True
except (KeyError, AttributeError):
pass
return
# Mark as handled after dispatch
try:
if guid in self._cdp_downloads_info:
self._cdp_downloads_info[guid]['handled'] = True
except (KeyError, AttributeError):
pass
return
except Exception as e:
self.logger.debug(f'[DownloadsWatchdog] Error checking file {file_path}: {e}')

View File

@@ -665,7 +665,7 @@ class HarRecordingWatchdog(BaseWatchdog):
tmp_path = self._har_path.with_suffix(self._har_path.suffix + '.tmp')
# Write as bytes explicitly to avoid any text/binary mode confusion in different environments
tmp_path.write_bytes(json.dumps(har_obj, indent=2).encode('utf-8'))
tmp_path.write_bytes(json.dumps(har_obj, indent=2, ensure_ascii=False).encode('utf-8'))
tmp_path.replace(self._har_path)
def _format_page_started_datetime(self, timestamp: float | None) -> str:

View File

@@ -1,5 +1,7 @@
"""Local browser watchdog for managing browser subprocess lifecycle."""
from __future__ import annotations
import asyncio
import os
import shutil
@@ -21,7 +23,7 @@ from browser_use.browser.watchdog_base import BaseWatchdog
from browser_use.observability import observe_debug
if TYPE_CHECKING:
pass
from browser_use.browser.profile import BrowserChannel
class LocalBrowserWatchdog(BaseWatchdog):
@@ -124,8 +126,8 @@ class LocalBrowserWatchdog(BaseWatchdog):
self.logger.debug(f'[LocalBrowserWatchdog] 📦 Using custom local browser executable_path= {browser_path}')
else:
# self.logger.debug('[LocalBrowserWatchdog] 🔍 Looking for local browser binary path...')
# Try fallback paths first (system browsers preferred)
browser_path = self._find_installed_browser_path()
# Try fallback paths first (Playwright's Chromium preferred by default)
browser_path = self._find_installed_browser_path(channel=profile.channel)
if not browser_path:
self.logger.error(
'[LocalBrowserWatchdog] ⚠️ No local browser binary found, installing browser using playwright subprocess...'
@@ -215,14 +217,18 @@ class LocalBrowserWatchdog(BaseWatchdog):
raise RuntimeError(f'Failed to launch browser after {max_retries} attempts')
@staticmethod
def _find_installed_browser_path() -> str | None:
def _find_installed_browser_path(channel: BrowserChannel | None = None) -> str | None:
"""Try to find browser executable from common fallback locations.
If a channel is specified, paths for that browser are searched first.
Falls back to all known browser paths if the channel-specific search fails.
Prioritizes:
1. System Chrome Stable
1. Playwright chromium
2. Other system native browsers (Chromium -> Chrome Canary/Dev -> Brave)
3. Playwright headless-shell fallback
1. Channel-specific paths (if channel is set to a non-default value)
2. Playwright bundled Chromium (when no channel or default channel specified)
3. System Chrome stable
4. Other system native browsers (Chromium -> Chrome Canary/Dev -> Brave -> Edge)
5. Playwright headless-shell fallback
Returns:
Path to browser executable or None if not found
@@ -231,60 +237,90 @@ class LocalBrowserWatchdog(BaseWatchdog):
import platform
from pathlib import Path
from browser_use.browser.profile import BROWSERUSE_DEFAULT_CHANNEL, BrowserChannel
system = platform.system()
patterns = []
# Get playwright browsers path from environment variable if set
playwright_path = os.environ.get('PLAYWRIGHT_BROWSERS_PATH')
# Build tagged pattern lists per OS: (browser_group, path)
# browser_group is used to match against the requested channel
if system == 'Darwin': # macOS
if not playwright_path:
playwright_path = '~/Library/Caches/ms-playwright'
patterns = [
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
f'{playwright_path}/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
'/Applications/Brave Browser.app/Contents/MacOS/Brave Browser',
f'{playwright_path}/chromium_headless_shell-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
all_patterns = [
('chrome', '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'),
('chromium', f'{playwright_path}/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium'),
('chromium', '/Applications/Chromium.app/Contents/MacOS/Chromium'),
('chrome-canary', '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'),
('brave', '/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'),
('msedge', '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'),
('chromium', f'{playwright_path}/chromium_headless_shell-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium'),
]
elif system == 'Linux':
if not playwright_path:
playwright_path = '~/.cache/ms-playwright'
patterns = [
'/usr/bin/google-chrome-stable',
'/usr/bin/google-chrome',
'/usr/local/bin/google-chrome',
f'{playwright_path}/chromium-*/chrome-linux*/chrome',
'/usr/bin/chromium',
'/usr/bin/chromium-browser',
'/usr/local/bin/chromium',
'/snap/bin/chromium',
'/usr/bin/google-chrome-beta',
'/usr/bin/google-chrome-dev',
'/usr/bin/brave-browser',
f'{playwright_path}/chromium_headless_shell-*/chrome-linux*/chrome',
all_patterns = [
('chrome', '/usr/bin/google-chrome-stable'),
('chrome', '/usr/bin/google-chrome'),
('chrome', '/usr/local/bin/google-chrome'),
('chromium', f'{playwright_path}/chromium-*/chrome-linux*/chrome'),
('chromium', '/usr/bin/chromium'),
('chromium', '/usr/bin/chromium-browser'),
('chromium', '/usr/local/bin/chromium'),
('chromium', '/snap/bin/chromium'),
('chrome-beta', '/usr/bin/google-chrome-beta'),
('chrome-dev', '/usr/bin/google-chrome-dev'),
('brave', '/usr/bin/brave-browser'),
('msedge', '/usr/bin/microsoft-edge-stable'),
('msedge', '/usr/bin/microsoft-edge'),
('chromium', f'{playwright_path}/chromium_headless_shell-*/chrome-linux*/chrome'),
]
elif system == 'Windows':
if not playwright_path:
playwright_path = r'%LOCALAPPDATA%\ms-playwright'
patterns = [
r'C:\Program Files\Google\Chrome\Application\chrome.exe',
r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe',
r'%LOCALAPPDATA%\Google\Chrome\Application\chrome.exe',
r'%PROGRAMFILES%\Google\Chrome\Application\chrome.exe',
r'%PROGRAMFILES(X86)%\Google\Chrome\Application\chrome.exe',
f'{playwright_path}\\chromium-*\\chrome-win\\chrome.exe',
r'C:\Program Files\Chromium\Application\chrome.exe',
r'C:\Program Files (x86)\Chromium\Application\chrome.exe',
r'%LOCALAPPDATA%\Chromium\Application\chrome.exe',
r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe',
r'C:\Program Files (x86)\BraveSoftware\Brave-Browser\Application\brave.exe',
r'C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe',
r'C:\Program Files\Microsoft\Edge\Application\msedge.exe',
r'%LOCALAPPDATA%\Microsoft\Edge\Application\msedge.exe',
f'{playwright_path}\\chromium_headless_shell-*\\chrome-win\\chrome.exe',
all_patterns = [
('chrome', r'C:\Program Files\Google\Chrome\Application\chrome.exe'),
('chrome', r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'),
('chrome', r'%LOCALAPPDATA%\Google\Chrome\Application\chrome.exe'),
('chrome', r'%PROGRAMFILES%\Google\Chrome\Application\chrome.exe'),
('chrome', r'%PROGRAMFILES(X86)%\Google\Chrome\Application\chrome.exe'),
('chromium', f'{playwright_path}\\chromium-*\\chrome-win\\chrome.exe'),
('chromium', r'C:\Program Files\Chromium\Application\chrome.exe'),
('chromium', r'C:\Program Files (x86)\Chromium\Application\chrome.exe'),
('chromium', r'%LOCALAPPDATA%\Chromium\Application\chrome.exe'),
('brave', r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe'),
('brave', r'C:\Program Files (x86)\BraveSoftware\Brave-Browser\Application\brave.exe'),
('msedge', r'C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe'),
('msedge', r'C:\Program Files\Microsoft\Edge\Application\msedge.exe'),
('msedge', r'%LOCALAPPDATA%\Microsoft\Edge\Application\msedge.exe'),
('chromium', f'{playwright_path}\\chromium_headless_shell-*\\chrome-win\\chrome.exe'),
]
else:
all_patterns = []
# Map channel enum values to browser group tags
_channel_to_group: dict[BrowserChannel, str] = {
BrowserChannel.CHROME: 'chrome',
BrowserChannel.CHROME_BETA: 'chrome-beta',
BrowserChannel.CHROME_DEV: 'chrome-dev',
BrowserChannel.CHROME_CANARY: 'chrome-canary',
BrowserChannel.CHROMIUM: 'chromium',
BrowserChannel.MSEDGE: 'msedge',
BrowserChannel.MSEDGE_BETA: 'msedge',
BrowserChannel.MSEDGE_DEV: 'msedge',
BrowserChannel.MSEDGE_CANARY: 'msedge',
}
# Prioritize the target browser group, then fall back to the rest.
if channel and channel != BROWSERUSE_DEFAULT_CHANNEL and channel in _channel_to_group:
target_group = _channel_to_group[channel]
else:
target_group = _channel_to_group[BROWSERUSE_DEFAULT_CHANNEL]
prioritized = [p for g, p in all_patterns if g == target_group]
rest = [p for g, p in all_patterns if g != target_group]
patterns = prioritized + rest
for pattern in patterns:
# Expand user home directory
@@ -326,7 +362,7 @@ class LocalBrowserWatchdog(BaseWatchdog):
import platform
# Build command - only use --with-deps on Linux (it fails on Windows/macOS)
cmd = ['uvx', 'playwright', 'install', 'chrome']
cmd = ['uvx', 'playwright', 'install', 'chromium']
if platform.system() == 'Linux':
cmd.append('--with-deps')
@@ -344,7 +380,7 @@ class LocalBrowserWatchdog(BaseWatchdog):
if browser_path:
return browser_path
self.logger.error(f'[LocalBrowserWatchdog] ❌ Playwright local browser installation error: \n{stdout}\n{stderr}')
raise RuntimeError('No local browser path found after: uvx playwright install chrome')
raise RuntimeError('No local browser path found after: uvx playwright install chromium')
except TimeoutError:
# Kill the subprocess if it times out
process.kill()

View File

@@ -52,8 +52,26 @@ class ScreenshotWatchdog(BaseWatchdog):
cdp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=True)
# Remove highlights BEFORE taking the screenshot so they don't appear in the image.
# Done here (not in finally) so CancelledError is never swallowed — any await in a
# finally block can suppress external task cancellation.
# remove_highlights() has its own asyncio.timeout(3.0) internally so it won't block.
try:
await self.browser_session.remove_highlights()
except Exception:
pass
# Prepare screenshot parameters
params = CaptureScreenshotParameters(format='png', captureBeyondViewport=False)
params_dict: dict[str, Any] = {'format': 'png', 'captureBeyondViewport': event.full_page}
if event.clip:
params_dict['clip'] = {
'x': event.clip['x'],
'y': event.clip['y'],
'width': event.clip['width'],
'height': event.clip['height'],
'scale': 1,
}
params = CaptureScreenshotParameters(**params_dict)
# Take screenshot using CDP
self.logger.debug(f'[ScreenshotWatchdog] Taking screenshot with params: {params}')
@@ -68,9 +86,3 @@ class ScreenshotWatchdog(BaseWatchdog):
except Exception as e:
self.logger.error(f'[ScreenshotWatchdog] Screenshot failed: {e}')
raise
finally:
# Try to remove highlights even on failure
try:
await self.browser_session.remove_highlights()
except Exception:
pass

View File

@@ -68,7 +68,6 @@ class SecurityWatchdog(BaseWatchdog):
await session.cdp_client.send.Page.navigate(params={'url': 'about:blank'}, session_id=session.session_id)
self.logger.info(f'⛔️ Navigated to about:blank after blocked URL: {event.url}')
except Exception as e:
pass
self.logger.error(f'⛔️ Failed to navigate to about:blank: {type(e).__name__} {e}')
async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:

View File

@@ -202,7 +202,7 @@ class StorageStateWatchdog(BaseWatchdog):
# Write atomically
temp_path = json_path.with_suffix('.json.tmp')
temp_path.write_text(json.dumps(merged_state, indent=4))
temp_path.write_text(json.dumps(merged_state, indent=4, ensure_ascii=False), encoding='utf-8')
# Backup existing file
if json_path.exists():
@@ -249,25 +249,60 @@ class StorageStateWatchdog(BaseWatchdog):
# Apply cookies if present
if 'cookies' in storage and storage['cookies']:
await self.browser_session._cdp_set_cookies(storage['cookies'])
# Playwright exports session cookies with expires=0/-1. CDP treats expires=0 as expired.
# Normalize session cookies by omitting expires
normalized_cookies: list[Cookie] = []
for cookie in storage['cookies']:
if not isinstance(cookie, dict):
normalized_cookies.append(cookie) # type: ignore[arg-type]
continue
c = dict(cookie)
expires = c.get('expires')
if expires in (0, 0.0, -1, -1.0):
c.pop('expires', None)
normalized_cookies.append(Cookie(**c))
await self.browser_session._cdp_set_cookies(normalized_cookies)
self._last_cookie_state = storage['cookies'].copy()
self.logger.debug(f'[StorageStateWatchdog] Added {len(storage["cookies"])} cookies from storage state')
# Apply origins (localStorage/sessionStorage) if present
if 'origins' in storage and storage['origins']:
for origin in storage['origins']:
if 'localStorage' in origin:
origin_value = origin.get('origin')
if not origin_value:
continue
# Scope storage restoration to its origin to avoid cross-site pollution.
if origin.get('localStorage'):
lines = []
for item in origin['localStorage']:
script = f"""
window.localStorage.setItem({json.dumps(item['name'])}, {json.dumps(item['value'])});
"""
await self.browser_session._cdp_add_init_script(script)
if 'sessionStorage' in origin:
lines.append(f'window.localStorage.setItem({json.dumps(item["name"])}, {json.dumps(item["value"])});')
script = (
'(function(){\n'
f' if (window.location && window.location.origin !== {json.dumps(origin_value)}) return;\n'
' try {\n'
f' {" ".join(lines)}\n'
' } catch (e) {}\n'
'})();'
)
await self.browser_session._cdp_add_init_script(script)
if origin.get('sessionStorage'):
lines = []
for item in origin['sessionStorage']:
script = f"""
window.sessionStorage.setItem({json.dumps(item['name'])}, {json.dumps(item['value'])});
"""
await self.browser_session._cdp_add_init_script(script)
lines.append(
f'window.sessionStorage.setItem({json.dumps(item["name"])}, {json.dumps(item["value"])});'
)
script = (
'(function(){\n'
f' if (window.location && window.location.origin !== {json.dumps(origin_value)}) return;\n'
' try {\n'
f' {" ".join(lines)}\n'
' } catch (e) {}\n'
'})();'
)
await self.browser_session._cdp_add_init_script(script)
self.logger.debug(
f'[StorageStateWatchdog] Applied localStorage/sessionStorage from {len(storage["origins"])} origins'
)

View File

@@ -129,7 +129,7 @@ if '--template' in sys.argv:
click.echo(' uv pip install browser-use')
click.echo(' 2. Set up your API key in .env file or environment:')
click.echo(' BROWSER_USE_API_KEY=your-key')
click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key)')
click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=cli)')
click.echo(' 3. Run your script:')
click.echo(f' python {output_path.name}')
except Exception as e:
@@ -178,9 +178,12 @@ except ImportError:
try:
import readline
_add_history = getattr(readline, 'add_history', None)
if _add_history is None:
raise ImportError('readline missing add_history')
READLINE_AVAILABLE = True
except ImportError:
# readline not available on Windows by default
_add_history = None
READLINE_AVAILABLE = False
@@ -294,8 +297,8 @@ def save_user_config(config: dict[str, Any]) -> None:
# Save to separate history file
history_file = CONFIG.BROWSER_USE_CONFIG_DIR / 'command_history.json'
with open(history_file, 'w') as f:
json.dump(history, f, indent=2)
with open(history_file, 'w', encoding='utf-8') as f:
json.dump(history, f, indent=2, ensure_ascii=False)
def update_config_with_click_args(config: dict[str, Any], ctx: click.Context) -> dict[str, Any]:
@@ -341,12 +344,11 @@ def update_config_with_click_args(config: dict[str, Any], ctx: click.Context) ->
def setup_readline_history(history: list[str]) -> None:
"""Set up readline with command history."""
if not READLINE_AVAILABLE:
if not _add_history:
return
# Add history items to readline
for item in history:
readline.add_history(item)
_add_history(item)
def get_llm(config: dict[str, Any]):
@@ -694,8 +696,6 @@ class BrowserUseApp(App):
'trafilatura.htmlprocessing',
'trafilatura',
'groq',
'portalocker',
'portalocker.utils',
]:
third_party = logging.getLogger(logger_name)
third_party.setLevel(logging.ERROR)
@@ -720,9 +720,9 @@ class BrowserUseApp(App):
# Step 2: Set up input history
logger.debug('Setting up readline history...')
try:
if READLINE_AVAILABLE and self.task_history:
if READLINE_AVAILABLE and self.task_history and _add_history:
for item in self.task_history:
readline.add_history(item)
_add_history(item)
logger.debug(f'Added {len(self.task_history)} items to readline history')
else:
logger.debug('No readline history to set up')
@@ -1129,7 +1129,7 @@ class BrowserUseApp(App):
# Exit the application
self.exit()
print('\nTry running tasks on our cloud: https://browser-use.com')
print('\nTry running tasks on our cloud: https://browser-use.com?utm_source=oss&utm_medium=cli')
def compose(self) -> ComposeResult:
"""Create the UI layout."""
@@ -1144,7 +1144,11 @@ class BrowserUseApp(App):
with Container(id='links-panel'):
with HorizontalGroup(classes='link-row'):
yield Static('Run at scale on cloud: [blink]☁️[/] ', markup=True, classes='link-label')
yield Link('https://browser-use.com', url='https://browser-use.com', classes='link-white link-url')
yield Link(
'https://browser-use.com',
url='https://browser-use.com?utm_source=oss&utm_medium=cli',
classes='link-white link-url',
)
yield Static('') # Empty line
@@ -2224,7 +2228,7 @@ def _run_template_generation(template: str, output: str | None, force: bool):
click.echo(' uv pip install browser-use')
click.echo(' 2. Set up your API key in .env file or environment:')
click.echo(' BROWSER_USE_API_KEY=your-key')
click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key)')
click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=cli)')
click.echo(' 3. Run your script:')
click.echo(f' python {output_path.name}')
else:
@@ -2353,7 +2357,7 @@ def init(
click.echo(' uv pip install browser-use')
click.echo(' 2. Set up your API key in .env file or environment:')
click.echo(' BROWSER_USE_API_KEY=your-key')
click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key)')
click.echo(' (Get your key at https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=cli)')
click.echo(' 3. Run your script:')
click.echo(f' python {output_path.name}')
else:

View File

@@ -1,84 +0,0 @@
# Code-Use Mode
Code-Use Mode is a Notebook-like code execution system for browser automation. Instead of the agent choosing from a predefined set of actions, the LLM writes Python code that gets executed in a persistent namespace with all browser control functions available.
## Problem Solved
**Code-Use Mode solves the limitations of a fixed action set** by giving the agent a Python execution environment where it can:
- Store extracted data in variables
- Loop through pages programmatically
- Combine results from multiple extractions
- Process and filter data before saving
- Use conditional logic to decide what to do next
- Produce more output than the LLM itself writes (short programs can generate large results)
### Namespace
The namespace is initialized with:
**Browser Control Functions:**
- `navigate(url)` - Navigate to a URL
- `click(index)` - Click an element
- `input(index, text)` - Type text
- `scroll(down, pages)` - Scroll the page
- `upload_file(path)` - Upload a file
- `evaluate(code, variables={})` - Execute JavaScript
- `done(text, success, files_to_display=[])` - Mark task complete
**Custom evaluate() Function:**
```python
# Returns values directly, not wrapped in ActionResult
result = await evaluate('''
(function(){
return Array.from(document.querySelectorAll('.product')).map(p => ({
name: p.querySelector('.name').textContent,
price: p.querySelector('.price').textContent
}))
})()
''')
# result is now a list of dicts, ready to use!
```
**Utilities:**
The agent can just utilize packages like `requests`, `pandas`, `numpy`, `matplotlib`, `BeautifulSoup`, `tabulate`, `csv`, ...
The agent will write code like:
### Step 1: Navigate
```python
# Navigate to first page
await navigate(url='https://example.com/products?page=1')
```
### Step 2: Analyse the DOM state and write code to extract the data we need
```js extract_products
(function(){
return Array.from(document.querySelectorAll('.product')).map(p => ({
name: p.querySelector('.name')?.textContent || '',
price: p.querySelector('.price')?.textContent || '',
rating: p.querySelector('.rating')?.textContent || ''
}))
})()
```
```python
# Extract products using JavaScript
all_products = []
for page in range(1, 6):
if page > 1:
await navigate(url=f'https://example.com/products?page={page}')
products = await evaluate(extract_products)
all_products.extend(products)
print(f'Page {page}: Found {len(products)} products')
```
### Step 3: Analyse output & save the data to a file
```python
# Save to file
import json
with open('products.json', 'w') as f:
json.dump(all_products, f, indent=2)
print(f'Total: {len(all_products)} products saved to products.json')
await done(text='Extracted all products', success=True, files_to_display=['products.json'])
```

View File

@@ -1,16 +0,0 @@
"""Code-use mode - Jupyter notebook-like code execution for browser automation."""
from browser_use.code_use.namespace import create_namespace
from browser_use.code_use.notebook_export import export_to_ipynb, session_to_python_script
from browser_use.code_use.service import CodeAgent
from browser_use.code_use.views import CodeCell, ExecutionStatus, NotebookSession
# Public API of the code_use package: the agent entry point (CodeAgent),
# the execution-namespace factory, notebook/script exporters, and the
# session/cell view models.
__all__ = [
	'CodeAgent',
	'create_namespace',
	'export_to_ipynb',
	'session_to_python_script',
	'CodeCell',
	'ExecutionStatus',
	'NotebookSession',
]

View File

@@ -1,190 +0,0 @@
"""Browser state formatting helpers for code-use agent."""
import logging
from typing import Any
from browser_use.browser.session import BrowserSession
from browser_use.browser.views import BrowserStateSummary
logger = logging.getLogger(__name__)
async def format_browser_state_for_llm(
	state: BrowserStateSummary,
	namespace: dict[str, Any],
	browser_session: BrowserSession,
) -> str:
	"""
	Format browser state summary for LLM consumption in code-use mode.

	Builds a markdown-style text report containing: URL/title header, open tabs
	(when more than one), scroll position, pending network requests, the
	variables available in the code-execution namespace, and finally the
	(possibly truncated) DOM representation.

	Args:
		state: Browser state summary from browser_session.get_browser_state_summary()
		namespace: The code execution namespace (for showing available variables)
		browser_session: Browser session for additional checks (jQuery, etc.)
	Returns:
		Formatted browser state text for LLM
	"""
	# NOTE(review): browser_session is never referenced in this body — kept for
	# interface stability; confirm whether it is still needed.
	assert state.dom_state is not None
	dom_state = state.dom_state
	# Use eval_representation (compact serializer for code agents)
	dom_html = dom_state.eval_representation()
	if dom_html == '':
		dom_html = 'Empty DOM tree (you might have to wait for the page to load)'
	# Format with URL and title header
	lines = ['## Browser State']
	lines.append(f'**URL:** {state.url}')
	lines.append(f'**Title:** {state.title}')
	lines.append('')
	# Add tabs info if multiple tabs exist
	if len(state.tabs) > 1:
		lines.append('**Tabs:**')
		current_target_candidates = []
		# Find tabs that match current URL and title
		for tab in state.tabs:
			if tab.url == state.url and tab.title == state.title:
				current_target_candidates.append(tab.target_id)
		# Only mark a tab as "(current)" when exactly one tab matches the
		# current URL+title; with several identical tabs the choice is ambiguous.
		current_target_id = current_target_candidates[0] if len(current_target_candidates) == 1 else None
		for tab in state.tabs:
			is_current = ' (current)' if tab.target_id == current_target_id else ''
			lines.append(f' - Tab {tab.target_id[-4:]}: {tab.url} - {tab.title[:30]}{is_current}')
		lines.append('')
	# Add page scroll info if available
	if state.page_info:
		pi = state.page_info
		# Express scroll offsets in viewport-height units ("pages"); guard against
		# a zero viewport height to avoid division by zero.
		pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0
		pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
		total_pages = pi.page_height / pi.viewport_height if pi.viewport_height > 0 else 0
		scroll_info = f'**Page:** {pages_above:.1f} pages above, {pages_below:.1f} pages below'
		if total_pages > 1.2: # Only mention total if significantly > 1 page
			scroll_info += f', {total_pages:.1f} total pages'
		lines.append(scroll_info)
		lines.append('')
	# Add network loading info if there are pending requests
	if state.pending_network_requests:
		# Remove duplicates by URL (keep first occurrence with earliest duration)
		seen_urls = set()
		unique_requests = []
		for req in state.pending_network_requests:
			if req.url not in seen_urls:
				seen_urls.add(req.url)
				unique_requests.append(req)
		lines.append(f'**⏳ Loading:** {len(unique_requests)} network requests still loading')
		# Show up to 20 unique requests with truncated URLs (30 chars max)
		for req in unique_requests[:20]:
			duration_sec = req.loading_duration_ms / 1000
			url_display = req.url if len(req.url) <= 30 else req.url[:27] + '...'
			# NOTE(review): this logs every pending request at INFO in addition to
			# adding it to the LLM output — looks like leftover debug; confirm intended.
			logger.info(f' - [{duration_sec:.1f}s] {url_display}')
			lines.append(f' - [{duration_sec:.1f}s] {url_display}')
		if len(unique_requests) > 20:
			lines.append(f' - ... and {len(unique_requests) - 20} more')
		lines.append('**Tip:** Content may still be loading. Consider waiting with `await asyncio.sleep(1)` if data is missing.')
		lines.append('')
	# Add available variables and functions BEFORE DOM structure
	# Show useful utilities (json, asyncio, etc.) and user-defined vars, but hide system objects
	skip_vars = {
		'browser',
		'file_system', # System objects
		'np',
		'pd',
		'plt',
		'numpy',
		'pandas',
		'matplotlib',
		'requests',
		'BeautifulSoup',
		'bs4',
		'pypdf',
		'PdfReader',
		'wait',
	}
	# Highlight code block variables separately from regular variables
	code_block_vars = []
	regular_vars = []
	# '_code_block_vars' is maintained elsewhere in the namespace; defaults to empty.
	tracked_code_blocks = namespace.get('_code_block_vars', set())
	for name in namespace.keys():
		# Skip private vars and system objects/actions
		if not name.startswith('_') and name not in skip_vars:
			if name in tracked_code_blocks:
				code_block_vars.append(name)
			else:
				regular_vars.append(name)
	# Sort for consistent display
	available_vars_sorted = sorted(regular_vars)
	code_block_vars_sorted = sorted(code_block_vars)
	# Build available line with code blocks and variables
	parts = []
	if code_block_vars_sorted:
		# Show detailed info for code block variables
		code_block_details = []
		for var_name in code_block_vars_sorted:
			value = namespace.get(var_name)
			if value is not None:
				type_name = type(value).__name__
				value_str = str(value) if not isinstance(value, str) else value
				# Check if it's a function (starts with "(function" or "(async function")
				is_function = value_str.strip().startswith('(function') or value_str.strip().startswith('(async function')
				if is_function:
					# For functions, only show name and type
					detail = f'{var_name}({type_name})'
				else:
					# For non-functions, show first and last 20 chars
					# (newlines/tabs escaped so the preview stays on one line)
					first_20 = value_str[:20].replace('\n', '\\n').replace('\t', '\\t')
					last_20 = value_str[-20:].replace('\n', '\\n').replace('\t', '\\t') if len(value_str) > 20 else ''
					if last_20 and first_20 != last_20:
						detail = f'{var_name}({type_name}): "{first_20}...{last_20}"'
					else:
						detail = f'{var_name}({type_name}): "{first_20}"'
				code_block_details.append(detail)
		parts.append(f'**Code block variables:** {" | ".join(code_block_details)}')
	if available_vars_sorted:
		parts.append(f'**Variables:** {", ".join(available_vars_sorted)}')
	lines.append(f'**Available:** {" | ".join(parts)}')
	lines.append('')
	# Add DOM structure
	lines.append('**DOM Structure:**')
	# Add scroll position hints for DOM
	if state.page_info:
		pi = state.page_info
		pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0
		pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
		# Prefix/suffix markers tell the LLM whether content exists beyond the viewport.
		if pages_above > 0:
			dom_html = f'... {pages_above:.1f} pages above \n{dom_html}'
		else:
			dom_html = '[Start of page]\n' + dom_html
		if pages_below <= 0:
			dom_html += '\n[End of page]'
	# Truncate DOM if too long and notify LLM
	max_dom_length = 60000
	if len(dom_html) > max_dom_length:
		lines.append(dom_html[:max_dom_length])
		lines.append(
			f'\n[DOM truncated after {max_dom_length} characters. Full page contains {len(dom_html)} characters total. Use evaluate to explore more.]'
		)
	else:
		lines.append(dom_html)
	browser_state_text = '\n'.join(lines)
	return browser_state_text

View File

@@ -1,665 +0,0 @@
"""Namespace initialization for code-use mode.
This module creates a namespace with all browser tools available as functions,
similar to a Jupyter notebook environment.
"""
import asyncio
import csv
import datetime
import json
import logging
import re
from pathlib import Path
from typing import Any
import requests
from browser_use.browser import BrowserSession
from browser_use.filesystem.file_system import FileSystem
from browser_use.llm.base import BaseChatModel
from browser_use.tools.service import CodeAgentTools, Tools
logger = logging.getLogger(__name__)
# Try to import optional data science libraries
try:
import numpy as np # type: ignore
NUMPY_AVAILABLE = True
except ImportError:
NUMPY_AVAILABLE = False
try:
import pandas as pd # type: ignore
PANDAS_AVAILABLE = True
except ImportError:
PANDAS_AVAILABLE = False
try:
import matplotlib.pyplot as plt # type: ignore
MATPLOTLIB_AVAILABLE = True
except ImportError:
MATPLOTLIB_AVAILABLE = False
try:
from bs4 import BeautifulSoup # type: ignore
BS4_AVAILABLE = True
except ImportError:
BS4_AVAILABLE = False
try:
from pypdf import PdfReader # type: ignore
PYPDF_AVAILABLE = True
except ImportError:
PYPDF_AVAILABLE = False
try:
from tabulate import tabulate # type: ignore
TABULATE_AVAILABLE = True
except ImportError:
TABULATE_AVAILABLE = False
def _strip_js_comments(js_code: str) -> str:
"""
Remove JavaScript comments before CDP evaluation.
CDP's Runtime.evaluate doesn't handle comments in all contexts.
Args:
js_code: JavaScript code potentially containing comments
Returns:
JavaScript code with comments stripped
"""
# Remove multi-line comments (/* ... */)
js_code = re.sub(r'/\*.*?\*/', '', js_code, flags=re.DOTALL)
# Remove single-line comments - only lines that START with // (after whitespace)
# This avoids breaking XPath strings, URLs, regex patterns, etc.
js_code = re.sub(r'^\s*//.*$', '', js_code, flags=re.MULTILINE)
return js_code
class EvaluateError(Exception):
	"""Raised by evaluate() on a JavaScript failure; stops Python execution immediately."""
async def validate_task_completion(
	task: str,
	output: str | None,
	llm: BaseChatModel,
) -> tuple[bool, str]:
	"""
	Ask the LLM — with no system prompt and no history — whether the task is truly done.
	Args:
		task: The original task description
		output: The output from the done() call
		llm: The LLM to use for validation
	Returns:
		Tuple of (is_complete, reasoning). On any validation error the agent is
		given the benefit of the doubt and (True, explanation) is returned.
	"""
	from browser_use.llm.messages import UserMessage
	validation_prompt = f"""You are a task completion validator. Analyze if the agent has truly completed the user's task.
**Original Task:**
{task}
**Agent's Output:**
{output[:100000] if output else '(No output provided)'}
**Your Task:**
Determine if the agent has successfully completed the user's task. Consider:
1. Has the agent delivered what the user requested?
2. If data extraction was requested, is there actual data?
3. If the task is impossible (e.g., localhost website, login required but no credentials), is it truly impossible?
4. Could the agent continue and make meaningful progress?
**Response Format:**
Reasoning: [Your analysis of whether the task is complete]
Verdict: [YES or NO]
YES = Task is complete OR truly impossible to complete
NO = Agent should continue working"""
	try:
		# Single-shot call: only the validation prompt, no system prompt or history.
		reply = await llm.ainvoke([UserMessage(content=validation_prompt)])
		completion_text = reply.completion
		reasoning = ''
		verdict = 'NO'
		# Scan the reply line by line for the "Reasoning:" / "Verdict:" markers
		# (case-insensitive; later occurrences override earlier ones).
		for raw_line in completion_text.split('\n'):
			marker = raw_line.strip().lower()
			if marker.startswith('reasoning:'):
				reasoning = raw_line.split(':', 1)[1].strip()
			elif marker.startswith('verdict:'):
				answer = raw_line.split(':', 1)[1].strip().upper()
				if 'YES' in answer:
					verdict = 'YES'
				elif 'NO' in answer:
					verdict = 'NO'
		# If the expected format was not followed, surface the whole reply as reasoning.
		if not reasoning:
			reasoning = completion_text
		is_complete = verdict == 'YES'
		logger.info(f'Task validation: {verdict}')
		logger.debug(f'Validation reasoning: {reasoning}')
		return is_complete, reasoning
	except Exception as e:
		logger.warning(f'Failed to validate task completion: {e}')
		# On error, assume the agent knows what they're doing
		return True, f'Validation failed: {e}'
async def evaluate(code: str, browser_session: BrowserSession) -> Any:
	"""
	Execute JavaScript code in the browser and return the deserialized result.

	Args:
		code: JavaScript code to execute (must be wrapped in an IIFE)
		browser_session: Session whose CDP connection is used to run the code

	Returns:
		The value produced by the expression: dict/list for objects/arrays
		(already deserialized via returnByValue), a primitive, None for a JS
		null, or the string 'undefined' when the expression produced nothing.

	Raises:
		EvaluateError: If JavaScript execution fails. This stops Python execution immediately.

	Example:
		result = await evaluate('''
		    (function(){
		        return Array.from(document.querySelectorAll('.product')).map(p => ({
		            name: p.querySelector('.name').textContent,
		            price: p.querySelector('.price').textContent
		        }))
		    })()
		''')
	"""
	# CDP rejects JavaScript comments in some contexts, so strip them up front.
	code = _strip_js_comments(code)
	cdp_session = await browser_session.get_or_create_cdp_session()
	try:
		cdp_response = await cdp_session.cdp_client.send.Runtime.evaluate(
			params={'expression': code, 'returnByValue': True, 'awaitPromise': True},
			session_id=cdp_session.session_id,
		)
		exception_details = cdp_response.get('exceptionDetails')
		if exception_details:
			# Collect the richest description CDP provides for the thrown value.
			detail_parts = []
			if 'exception' in exception_details:
				thrown = exception_details['exception']
				if 'description' in thrown:
					detail_parts.append(thrown['description'])
				elif 'value' in thrown:
					detail_parts.append(str(thrown['value']))
			message = f'JavaScript execution error: {exception_details.get("text", "Unknown error")}'
			if detail_parts:
				message += f'\nDetails: {" | ".join(detail_parts)}'
			# Raise special exception that will stop Python execution immediately
			raise EvaluateError(message)
		payload = cdp_response.get('result', {})
		# A missing 'value' key means the expression evaluated to JS undefined;
		# an explicit JS null arrives as value=None. Objects/arrays are already
		# plain dicts/lists thanks to returnByValue.
		if 'value' not in payload:
			return 'undefined'
		return payload['value']
	except EvaluateError:
		# Propagate unchanged so callers can halt Python execution immediately.
		raise
	except Exception as e:
		# Anything else (transport/protocol failures) gets wrapped for the caller.
		raise EvaluateError(f'Failed to execute JavaScript: {type(e).__name__}: {e}') from e
def create_namespace(
	browser_session: BrowserSession,
	tools: Tools | None = None,
	page_extraction_llm: BaseChatModel | None = None,
	file_system: FileSystem | None = None,
	available_file_paths: list[str] | None = None,
	sensitive_data: dict[str, str | dict[str, str]] | None = None,
) -> dict[str, Any]:
	"""
	Create a namespace with all browser tools available as functions.

	This function creates a dictionary of functions that can be used to interact
	with the browser, similar to a Jupyter notebook environment.

	Args:
		browser_session: The browser session to use
		tools: Optional Tools instance (will create default if not provided)
		page_extraction_llm: Optional LLM for page extraction
		file_system: Optional file system for file operations
		available_file_paths: Optional list of available file paths
		sensitive_data: Optional sensitive data dictionary
			(NOTE(review): accepted but not referenced anywhere in this
			factory's body — confirm whether it should feed the action
			wrappers' 'has_sensitive_data' flag)

	Returns:
		Dictionary containing all available functions and objects

	Example:
		namespace = create_namespace(browser_session)
		await namespace['navigate'](url='https://google.com')
		result = await namespace['evaluate']('document.title')
	"""
	if tools is None:
		# Use CodeAgentTools with default exclusions optimized for code-use mode
		# For code-use, we keep: navigate, evaluate, wait, done
		# and exclude: most browser interaction, file system actions (use Python instead)
		tools = CodeAgentTools()
	if available_file_paths is None:
		available_file_paths = []
	# Base namespace: core session objects plus stdlib modules the generated
	# code cells are allowed to use without importing anything themselves.
	namespace: dict[str, Any] = {
		# Core objects
		'browser': browser_session,
		'file_system': file_system,
		# Standard library modules (always available)
		'json': json,
		'asyncio': asyncio,
		'Path': Path,
		'csv': csv,
		're': re,
		'datetime': datetime,
		'requests': requests,
	}
	# Add optional data science libraries if available; each is exposed under
	# both its conventional alias and its full module name.
	if NUMPY_AVAILABLE:
		namespace['np'] = np
		namespace['numpy'] = np
	if PANDAS_AVAILABLE:
		namespace['pd'] = pd
		namespace['pandas'] = pd
	if MATPLOTLIB_AVAILABLE:
		namespace['plt'] = plt
		namespace['matplotlib'] = plt
	if BS4_AVAILABLE:
		namespace['BeautifulSoup'] = BeautifulSoup
		namespace['bs4'] = BeautifulSoup
	if PYPDF_AVAILABLE:
		namespace['PdfReader'] = PdfReader
		namespace['pypdf'] = PdfReader
	if TABULATE_AVAILABLE:
		namespace['tabulate'] = tabulate
	# Track failed evaluate() calls to detect repeated failed approaches
	if '_evaluate_failures' not in namespace:
		namespace['_evaluate_failures'] = []

	# Add custom evaluate function that returns values directly
	async def evaluate_wrapper(
		code: str | None = None, variables: dict[str, Any] | None = None, *_args: Any, **kwargs: Any
	) -> Any:
		"""Wrap evaluate(): accept flexible call styles, inject Python variables
		as a JS `params` object, auto-wrap bare code in an IIFE, print a short
		preview of the result for debugging, and record failures."""
		# Handle both positional and keyword argument styles
		if code is None:
			# Check if code was passed as keyword arg (several aliases accepted)
			code = kwargs.get('code', kwargs.get('js_code', kwargs.get('expression', '')))
		# Extract variables if passed as kwarg
		if variables is None:
			variables = kwargs.get('variables')
		if not code:
			raise ValueError('No JavaScript code provided to evaluate()')
		# Inject variables if provided
		if variables:
			vars_json = json.dumps(variables)
			stripped = code.strip()
			# Check if code is already a function expression expecting params
			# Pattern: (function(params) { ... }) or (async function(params) { ... })
			if re.match(r'\((?:async\s+)?function\s*\(\s*\w+\s*\)', stripped):
				# Already expects params, wrap to call it with our variables
				code = f'(function(){{ const params = {vars_json}; return {stripped}(params); }})()'
			else:
				# Not a parameterized function, inject params in scope
				# Check if already wrapped in IIFE (including arrow function IIFEs)
				is_wrapped = (
					(stripped.startswith('(function()') and '})()' in stripped[-10:])
					or (stripped.startswith('(async function()') and '})()' in stripped[-10:])
					or (stripped.startswith('(() =>') and ')()' in stripped[-10:])
					or (stripped.startswith('(async () =>') and ')()' in stripped[-10:])
				)
				if is_wrapped:
					# Already wrapped, inject params at the start
					# Try to match regular function IIFE
					match = re.match(r'(\((?:async\s+)?function\s*\(\s*\)\s*\{)', stripped)
					if match:
						prefix = match.group(1)
						rest = stripped[len(prefix) :]
						code = f'{prefix} const params = {vars_json}; {rest}'
					else:
						# Try to match arrow function IIFE
						# Patterns: (() => expr)() or (() => { ... })() or (async () => ...)()
						arrow_match = re.match(r'(\((?:async\s+)?\(\s*\)\s*=>\s*\{)', stripped)
						if arrow_match:
							# Arrow function with block body: (() => { ... })()
							prefix = arrow_match.group(1)
							rest = stripped[len(prefix) :]
							code = f'{prefix} const params = {vars_json}; {rest}'
						else:
							# Arrow function with expression body or fallback: wrap in outer function
							code = f'(function(){{ const params = {vars_json}; return {stripped}; }})()'
				else:
					# Not wrapped, wrap with params
					code = f'(function(){{ const params = {vars_json}; {code} }})()'
			# Skip auto-wrap below
			return await evaluate(code, browser_session)
		# Auto-wrap in IIFE if not already wrapped (and no variables were injected)
		if not variables:
			stripped = code.strip()
			# Check for regular function IIFEs, async function IIFEs, and arrow function IIFEs
			is_wrapped = (
				(stripped.startswith('(function()') and '})()' in stripped[-10:])
				or (stripped.startswith('(async function()') and '})()' in stripped[-10:])
				or (stripped.startswith('(() =>') and ')()' in stripped[-10:])
				or (stripped.startswith('(async () =>') and ')()' in stripped[-10:])
			)
			if not is_wrapped:
				code = f'(function(){{{code}}})()'
		# Execute and track failures
		try:
			result = await evaluate(code, browser_session)
			# Print result structure for debugging
			if isinstance(result, list) and result and isinstance(result[0], dict):
				result_preview = f'list of dicts - len={len(result)}, example 1:\n'
				sample_result = result[0]
				for key, value in list(sample_result.items())[:10]:
					# Truncate non-primitive values to keep the preview short
					value_str = str(value)[:10] if not isinstance(value, (int, float, bool, type(None))) else str(value)
					result_preview += f' {key}: {value_str}...\n'
				if len(sample_result) > 10:
					result_preview += f' ... {len(sample_result) - 10} more keys'
				print(result_preview)
			elif isinstance(result, list):
				if len(result) == 0:
					print('type=list, len=0')
				else:
					result_preview = str(result)[:100]
					print(f'type=list, len={len(result)}, preview={result_preview}...')
			elif isinstance(result, dict):
				result_preview = f'type=dict, len={len(result)}, sample keys:\n'
				for key, value in list(result.items())[:10]:
					value_str = str(value)[:10] if not isinstance(value, (int, float, bool, type(None))) else str(value)
					result_preview += f' {key}: {value_str}...\n'
				if len(result) > 10:
					result_preview += f' ... {len(result) - 10} more keys'
				print(result_preview)
			else:
				print(f'type={type(result).__name__}, value={repr(result)[:50]}')
			return result
		except Exception as e:
			# Track errors for pattern detection
			namespace['_evaluate_failures'].append({'error': str(e), 'type': 'exception'})
			raise

	namespace['evaluate'] = evaluate_wrapper

	# Add get_selector_from_index helper for code_use mode
	async def get_selector_from_index_wrapper(index: int) -> str:
		"""
		Get the CSS selector for an element by its interactive index.

		This allows you to use the element's index from the browser state to get
		its CSS selector for use in JavaScript evaluate() calls.

		Args:
			index: The interactive index from the browser state (e.g., [123])

		Returns:
			str: CSS selector that can be used in JavaScript

		Example:
			selector = await get_selector_from_index(123)
			await evaluate(f'''
				(function(){{
					const el = document.querySelector({json.dumps(selector)});
					if (el) el.click();
				}})()
			''')
		"""
		from browser_use.dom.utils import generate_css_selector_for_element

		# Get element by index from browser session
		node = await browser_session.get_element_by_index(index)
		if node is None:
			msg = f'Element index {index} not available - page may have changed. Try refreshing browser state.'
			logger.warning(f'⚠️ {msg}')
			raise RuntimeError(msg)
		# Check if element is in shadow DOM by walking up the ancestor chain
		shadow_hosts = []
		current = node.parent_node
		while current:
			if current.shadow_root_type is not None:
				# This is a shadow host
				host_tag = current.tag_name.lower()
				host_id = current.attributes.get('id', '') if current.attributes else ''
				host_desc = f'{host_tag}#{host_id}' if host_id else host_tag
				shadow_hosts.insert(0, host_desc)
			current = current.parent_node
		# Check if in iframe (regular querySelector cannot cross iframe boundaries)
		in_iframe = False
		current = node.parent_node
		while current:
			if current.tag_name.lower() == 'iframe':
				in_iframe = True
				break
			current = current.parent_node
		# Use the robust selector generation function (now handles special chars in IDs)
		selector = generate_css_selector_for_element(node)
		# Log shadow DOM/iframe info if detected
		if shadow_hosts:
			shadow_path = ' > '.join(shadow_hosts)
			logger.info(f'Element [{index}] is inside Shadow DOM. Path: {shadow_path}')
			logger.info(f' Selector: {selector}')
			logger.info(
				f' To access: document.querySelector("{shadow_hosts[0].split("#")[0]}").shadowRoot.querySelector("{selector}")'
			)
		if in_iframe:
			logger.info(f"Element [{index}] is inside an iframe. Regular querySelector won't work.")
		if selector:
			return selector
		# Fallback: just use tag name if available
		if node.tag_name:
			return node.tag_name.lower()
		raise ValueError(f'Could not generate selector for element index {index}')

	namespace['get_selector_from_index'] = get_selector_from_index_wrapper

	# Inject all tools as functions into the namespace
	# Skip 'evaluate' since we have a custom implementation above
	for action_name, action in tools.registry.registry.actions.items():
		if action_name == 'evaluate':
			continue  # Skip - use custom evaluate that returns Python objects directly
		param_model = action.param_model
		action_function = action.function

		# Create a closure to capture the current action_name, param_model, and action_function
		def make_action_wrapper(act_name, par_model, act_func):
			async def action_wrapper(*args, **kwargs):
				"""Adapter: validate args against the action's pydantic model,
				run the action, and unwrap its result for code-use mode."""
				# Convert positional args to kwargs based on param model fields
				if args:
					# Get the field names from the pydantic model
					field_names = list(par_model.model_fields.keys())
					for i, arg in enumerate(args):
						if i < len(field_names):
							kwargs[field_names[i]] = arg
				# Create params from kwargs
				try:
					params = par_model(**kwargs)
				except Exception as e:
					raise ValueError(f'Invalid parameters for {act_name}: {e}') from e
				# Special validation for done() - enforce minimal code cell
				if act_name == 'done':
					consecutive_failures = namespace.get('_consecutive_errors')
					if consecutive_failures and consecutive_failures > 3:
						# After repeated errors, let done() through without guard checks
						pass
					else:
						# Check if there are multiple Python blocks in this response
						all_blocks = namespace.get('_all_code_blocks', {})
						python_blocks = [k for k in sorted(all_blocks.keys()) if k.startswith('python_')]
						if len(python_blocks) > 1:
							msg = (
								'done() should be the ONLY code block in the response.\n'
								'You have multiple Python blocks in this response. Consider calling done() in a separate response '
								'Now verify the last output and if it satisfies the task, call done(), else continue working.'
							)
							print(msg)
						# Get the current cell code from namespace (injected by service.py before execution)
						current_code = namespace.get('_current_cell_code')
						if current_code and isinstance(current_code, str):
							# Count non-empty, non-comment lines
							lines = [line.strip() for line in current_code.strip().split('\n')]
							code_lines = [line for line in lines if line and not line.startswith('#')]
							# Check if the line above await done() contains an if block
							done_line_index = -1
							for i, line in enumerate(reversed(code_lines)):
								if 'await done()' in line or 'await done(' in line:
									done_line_index = len(code_lines) - 1 - i
									break
							has_if_above = False
							has_else_above = False
							has_elif_above = False
							if done_line_index > 0:
								line_above = code_lines[done_line_index - 1]
								has_if_above = line_above.strip().startswith('if ') and line_above.strip().endswith(':')
								has_else_above = line_above.strip().startswith('else:')
								has_elif_above = line_above.strip().startswith('elif ')
							if has_if_above or has_else_above or has_elif_above:
								msg = (
									'done() should be called individually after verifying the result from any logic.\n'
									'Consider validating your output first, THEN call done() in a final step without if/else/elif blocks only if the task is truly complete.'
								)
								logger.error(msg)
								print(msg)
								raise RuntimeError(msg)
				# Build special context passed through to the underlying action
				special_context = {
					'browser_session': browser_session,
					'page_extraction_llm': page_extraction_llm,
					'available_file_paths': available_file_paths,
					'has_sensitive_data': False,  # Can be handled separately if needed
					'file_system': file_system,
				}
				# Execute the action
				result = await act_func(params=params, **special_context)
				# For code-use mode, we want to return the result directly
				# not wrapped in ActionResult
				if hasattr(result, 'extracted_content'):
					# Special handling for done action - mark task as complete
					if act_name == 'done' and hasattr(result, 'is_done') and result.is_done:
						namespace['_task_done'] = True
						# Store the extracted content as the final result
						if result.extracted_content:
							namespace['_task_result'] = result.extracted_content
						# Store the self-reported success status
						if hasattr(result, 'success'):
							namespace['_task_success'] = result.success
					# If there's extracted content, return it
					if result.extracted_content:
						return result.extracted_content
					# If there's an error, raise it
					if result.error:
						raise RuntimeError(result.error)
					# Otherwise return None
					return None
				return result

			return action_wrapper

		# Rename 'input' to 'input_text' to avoid shadowing Python's built-in input()
		namespace_action_name = 'input_text' if action_name == 'input' else action_name
		# Add the wrapper to the namespace
		namespace[namespace_action_name] = make_action_wrapper(action_name, param_model, action_function)
	return namespace
def get_namespace_documentation(namespace: dict[str, Any]) -> str:
	"""
	Render markdown documentation for the public callables in a namespace.

	Args:
		namespace: The namespace dictionary produced by create_namespace()

	Returns:
		Markdown-formatted documentation string: an '# Available Functions'
		header followed by a '## name' section per documented public callable.
	"""
	sections = ['# Available Functions\n']
	# Walk keys in sorted order so output is stable across runs.
	for name in sorted(namespace):
		candidate = namespace[name]
		# Only public callables with a docstring get a section.
		if name.startswith('_') or not callable(candidate):
			continue
		doc = getattr(candidate, '__doc__', None)
		if doc:
			sections.append(f'## {name}\n')
			sections.append(f'{doc}\n')
	return '\n'.join(sections)

View File

@@ -1,276 +0,0 @@
"""Export code-use session to Jupyter notebook format."""
import json
import re
from pathlib import Path
from browser_use.code_use.service import CodeAgent
from .views import CellType, NotebookExport
def export_to_ipynb(agent: CodeAgent, output_path: str | Path) -> Path:
	"""
	Export a CodeAgent's session to a Jupyter notebook (.ipynb) file.

	JavaScript code blocks stored in the agent's namespace are emitted as
	string-variable cells ahead of the session cells so the notebook is
	self-contained.

	Args:
		agent: The CodeAgent whose session cells (and namespace) are exported
		output_path: Path where to save the notebook file

	Returns:
		Path to the saved notebook file

	Example:
		```python
		session = await agent.run()
		notebook_path = export_to_ipynb(agent, 'my_automation.ipynb')
		print(f'Notebook saved to {notebook_path}')
		```
	"""
	output_path = Path(output_path)
	# Create notebook structure
	notebook = NotebookExport(
		metadata={
			'kernelspec': {'display_name': 'Python 3', 'language': 'python', 'name': 'python3'},
			'language_info': {
				'name': 'python',
				'version': '3.11.0',
				'mimetype': 'text/x-python',
				'codemirror_mode': {'name': 'ipython', 'version': 3},
				'pygments_lexer': 'ipython3',
				'nbconvert_exporter': 'python',
				'file_extension': '.py',
			},
		}
	)
	# Add setup cell at the beginning with proper type hints
	setup_code = """import asyncio
import json
from typing import Any
from browser_use import BrowserSession
from browser_use.code_use import create_namespace
# Initialize browser and namespace
browser = BrowserSession()
await browser.start()
# Create namespace with all browser control functions
namespace: dict[str, Any] = create_namespace(browser)
# Import all functions into the current namespace
globals().update(namespace)
# Type hints for better IDE support (these are now available globally)
# navigate, click, input, evaluate, search, extract, scroll, done, etc.
print("Browser-use environment initialized!")
print("Available functions: navigate, click, input, evaluate, search, extract, done, etc.")"""
	setup_cell = {
		'cell_type': 'code',
		'metadata': {},
		# nbformat represents multi-line sources as a list of lines that each
		# keep their trailing '\n' (readers ''.join() the list). split('\n')
		# would drop the newlines and collapse the cell onto one line, so use
		# splitlines(keepends=True) — same as the session cells below.
		'source': setup_code.splitlines(keepends=True),
		'execution_count': None,
		'outputs': [],
	}
	notebook.cells.append(setup_cell)
	# Add JavaScript code blocks as variables FIRST
	if hasattr(agent, 'namespace') and agent.namespace:
		# Look for JavaScript variables in the namespace
		code_block_vars = agent.namespace.get('_code_block_vars', set())
		for var_name in sorted(code_block_vars):
			var_value = agent.namespace.get(var_name)
			if isinstance(var_value, str) and var_value.strip():
				# Heuristic: treat the string as JavaScript if it matches
				# common JS patterns
				js_patterns = [
					r'function\s+\w+\s*\(',
					r'\(\s*function\s*\(\)',
					r'=>\s*{',
					r'document\.',
					r'Array\.from\(',
					r'\.querySelector',
					r'\.textContent',
					r'\.innerHTML',
					r'return\s+',
					r'console\.log',
					r'window\.',
					r'\.map\(',
					r'\.filter\(',
					r'\.forEach\(',
				]
				is_js = any(re.search(pattern, var_value, re.IGNORECASE) for pattern in js_patterns)
				if is_js:
					# Create a code cell assigning the JavaScript source to a variable
					js_cell = {
						'cell_type': 'code',
						'metadata': {},
						'source': [f'# JavaScript Code Block: {var_name}\n', f'{var_name} = """{var_value}"""'],
						'execution_count': None,
						'outputs': [],
					}
					notebook.cells.append(js_cell)
	# Convert session cells into notebook cells
	for cell in agent.session.cells:
		notebook_cell: dict = {
			'cell_type': cell.cell_type.value,
			'metadata': {},
			'source': cell.source.splitlines(keepends=True),
		}
		if cell.cell_type == CellType.CODE:
			notebook_cell['execution_count'] = cell.execution_count
			notebook_cell['outputs'] = []
			# Add output if available
			if cell.output:
				notebook_cell['outputs'].append(
					{
						'output_type': 'stream',
						'name': 'stdout',
						# Keep trailing newlines so readers reassemble the text verbatim
						'text': cell.output.splitlines(keepends=True),
					}
				)
			# Add error if available
			if cell.error:
				notebook_cell['outputs'].append(
					{
						'output_type': 'error',
						'ename': 'Error',
						'evalue': cell.error.split('\n')[0] if cell.error else '',
						'traceback': cell.error.split('\n') if cell.error else [],
					}
				)
			# Add browser state as a separate output
			if cell.browser_state:
				notebook_cell['outputs'].append(
					{
						'output_type': 'stream',
						'name': 'stdout',
						'text': [f'Browser State:\n{cell.browser_state}'],
					}
				)
		notebook.cells.append(notebook_cell)
	# Write to file
	output_path.parent.mkdir(parents=True, exist_ok=True)
	with open(output_path, 'w', encoding='utf-8') as f:
		json.dump(notebook.model_dump(), f, indent=2, ensure_ascii=False)
	return output_path
def session_to_python_script(agent: CodeAgent) -> str:
	"""
	Convert a CodeAgent session to a standalone Python script.

	JavaScript code blocks stored in the agent's namespace are emitted as
	string variables before the session cells so the script is self-contained.

	Args:
		agent: The CodeAgent instance to convert

	Returns:
		Python script as a string

	Example:
		```python
		await agent.run()
		script = session_to_python_script(agent)
		print(script)
		```
	"""
	lines = []
	lines.append('# Generated from browser-use code-use session\n')
	lines.append('import asyncio\n')
	lines.append('import json\n')
	lines.append('from browser_use import BrowserSession\n')
	lines.append('from browser_use.code_use import create_namespace\n\n')
	lines.append('async def main():\n')
	lines.append('\t# Initialize browser and namespace\n')
	lines.append('\tbrowser = BrowserSession()\n')
	lines.append('\tawait browser.start()\n\n')
	lines.append('\t# Create namespace with all browser control functions\n')
	lines.append('\tnamespace = create_namespace(browser)\n\n')
	lines.append('\t# Extract functions from namespace for direct access\n')
	lines.append('\tnavigate = namespace["navigate"]\n')
	lines.append('\tclick = namespace["click"]\n')
	# create_namespace() registers the 'input' action under the key 'input_text'
	# (to avoid shadowing Python's built-in input()), so the generated script
	# must look it up by that key — namespace["input"] would raise KeyError.
	lines.append('\tinput_text = namespace["input_text"]\n')
	lines.append('\tevaluate = namespace["evaluate"]\n')
	lines.append('\tsearch = namespace["search"]\n')
	lines.append('\textract = namespace["extract"]\n')
	lines.append('\tscroll = namespace["scroll"]\n')
	lines.append('\tdone = namespace["done"]\n')
	lines.append('\tgo_back = namespace["go_back"]\n')
	lines.append('\twait = namespace["wait"]\n')
	lines.append('\tscreenshot = namespace["screenshot"]\n')
	lines.append('\tfind_text = namespace["find_text"]\n')
	lines.append('\tswitch_tab = namespace["switch"]\n')
	lines.append('\tclose_tab = namespace["close"]\n')
	lines.append('\tdropdown_options = namespace["dropdown_options"]\n')
	lines.append('\tselect_dropdown = namespace["select_dropdown"]\n')
	lines.append('\tupload_file = namespace["upload_file"]\n')
	lines.append('\tsend_keys = namespace["send_keys"]\n\n')
	# Add JavaScript code blocks as variables FIRST
	if hasattr(agent, 'namespace') and agent.namespace:
		code_block_vars = agent.namespace.get('_code_block_vars', set())
		for var_name in sorted(code_block_vars):
			var_value = agent.namespace.get(var_name)
			if isinstance(var_value, str) and var_value.strip():
				# Heuristic: treat the string as JavaScript if it matches
				# common JS patterns
				js_patterns = [
					r'function\s+\w+\s*\(',
					r'\(\s*function\s*\(\)',
					r'=>\s*{',
					r'document\.',
					r'Array\.from\(',
					r'\.querySelector',
					r'\.textContent',
					r'\.innerHTML',
					r'return\s+',
					r'console\.log',
					r'window\.',
					r'\.map\(',
					r'\.filter\(',
					r'\.forEach\(',
				]
				is_js = any(re.search(pattern, var_value, re.IGNORECASE) for pattern in js_patterns)
				if is_js:
					lines.append(f'\t# JavaScript Code Block: {var_name}\n')
					lines.append(f'\t{var_name} = """{var_value}"""\n\n')
	# Emit each code cell, indented one level to sit inside main()
	for i, cell in enumerate(agent.session.cells):
		if cell.cell_type == CellType.CODE:
			lines.append(f'\t# Cell {i + 1}\n')
			# Indent each line of source
			source_lines = cell.source.split('\n')
			for line in source_lines:
				if line.strip():  # Only add non-empty lines
					lines.append(f'\t{line}\n')
			lines.append('\n')
	lines.append('\tawait browser.stop()\n\n')
	lines.append("if __name__ == '__main__':\n")
	lines.append('\tasyncio.run(main())\n')
	return ''.join(lines)

File diff suppressed because it is too large Load Diff

View File

@@ -1,574 +0,0 @@
# Coding Browser Agent - System Prompt
You are created by browser-use for complex automated browser tasks.
## Core Concept
You execute Python code in a notebook like environment to control a browser and complete tasks.
**Mental Model**: Write one code cell per step → it gets automatically executed → you receive the new output → in the next response you write the next code cell → Repeat.
---
## INPUT: What You See
### Browser State Format
- **URL & DOM**: Compressed DOM tree with interactive elements marked as `[i_123]`
- **Loading Status**: Network requests currently pending (automatically filtered for ads/tracking)
- Shows URL, loading duration, and resource type for each pending request
- **Element Markers**:
- `[i_123]` - Interactive elements (buttons, inputs, links)
- `|SHADOW(open/closed)|` - Shadow DOM boundaries (content auto-included)
- `|IFRAME|` or `|FRAME|` - Iframe boundaries (content auto-included)
- `|scroll element|` - Scrollable containers
### Execution Environment
- **Variables persist** across steps (like Jupyter) - NEVER use the `global` keyword - that's not needed; the injection is done for you.
- **Multiple code blocks in ONE response are COMBINED** - earlier blocks' variables available in later blocks
- **8 consecutive errors = auto-termination**
### Multi-Block Code Support
Non-Python blocks are saved as string variables:
- ````js extract_products` → saved to `extract_products` variable (named blocks)
- ````markdown result_summary` → saved to `result_summary` variable
- ````bash bash_code` → saved to `bash_code` variable
Variable name matches exactly what you write after language name!
**Nested Code Blocks**: If your code contains ``` inside it (e.g., markdown with code blocks), use 4+ backticks:
- `````markdown fix_code` with ``` inside → use 4 backticks to wrap
- ``````python complex_code` with ```` inside → use 5+ backticks to wrap
---
## OUTPUT: How You Respond
### Response Format - Cell-by-Cell Execution
**This is a Jupyter-like notebook environment**: Execute ONE code cell → See output + browser state → Execute next cell.
[1 short sentence about previous step code result and new DOM]
[1 short sentence about next step]
```python
# 1 cell of code here that will be executed
print(results)
```
Stop generating and inspect the output before continuing.
## TOOLS: Available Functions
### 1. Navigation
```python
await navigate('https://example.com')
await asyncio.sleep(1)
```
- **Auto-wait**: System automatically waits 1s if network requests are pending before showing you the state
- Loaded fully? Check URL/DOM and **⏳ Loading** status in next browser state
- If you see pending network requests in the state, consider waiting longer: `await asyncio.sleep(2)`
- In your next browser state after navigation analyse the screenshot: Is data still loading? Do you expect more data? → Wait longer with `await asyncio.sleep()`.
- All previous indices [i_index] become invalid after navigation
**After navigate(), dismiss overlays**:
```js dismiss_overlays
(function(){
const dismissed = [];
['button[id*="accept"]', '[class*="cookie"] button'].forEach(sel => {
document.querySelectorAll(sel).forEach(btn => {
if (btn.offsetParent !== null) {
btn.click();
dismissed.push('cookie');
}
});
});
document.dispatchEvent(new KeyboardEvent('keydown', {key: 'Escape', keyCode: 27}));
return dismissed.length > 0 ? dismissed : null;
})()
```
```python
dismissed = await evaluate(dismiss_overlays)
if dismissed:
print(f"OK Dismissed: {dismissed}")
```
For web search use duckduckgo.com by default to avoid CAPTCHAS.
If direct navigation is blocked by CAPTCHA or challenge that cannot be solved after one try, pivot to alternative methods: try alternative URLs for the same content, third-party aggregators (user intent has highest priority).
### 2. Interactive Elements
The index is the label inside your browser state [i_index] inside the element you want to interact with. Only use indices from the current state. After page changes these become invalid.
```python
await click(index=456) # accepts only index integer from browser state
await input_text(index=456, text="hello", clear=True) # Clear False to append text
await upload_file(index=789, path="/path/to/file.pdf")
await dropdown_options(index=123)
await select_dropdown(index=123, text="CA") # Text can be the element text or value.
await scroll(down=True, pages=1.0, index=None) # Down=False to scroll up. Pages=10.0 to scroll 10 pages. Use Index to scroll in the container of this element.
await send_keys(keys="Enter") # Use e.g. for Escape, Arrow keys, Page Up, Page Down, Home, End, etc.
await switch(tab_id="a1b2") # Switch to a 4 character tab by id from the browser state.
await close(tab_id="a1b2") # Close a tab by id from the browser state.
await go_back() # Navigate back in the browser history.
```
Indices Work Only once. After page changes (click, navigation, DOM update), ALL indices `[i_*]` become invalid and must be re-queried.
Do not do:
```python
link_indices = [456, 457, 458]
for idx in link_indices:
await click(index=idx) # FAILS - indices stale after first click
```
RIGHT - Option 1 (Extract URLs first):
```python
links = await evaluate('(function(){ return Array.from(document.querySelectorAll("a.product")).map(a => a.href); })()')
for url in links:
await navigate(url)
# extract data
await go_back()
```
### 3. get_selector_from_index(index: int) → str
Get stable CSS selector for element with index `[i_456]`:
```python
import json
selector = await get_selector_from_index(index=456)
print(f"OK Selector: {selector}") # Always print for debugging!
el_text = await evaluate(f'(function(){{ return document.querySelector({json.dumps(selector)}).textContent; }})()')
```
**When to use**:
- Clicking same element type repeatedly (e.g., "Next" button in pagination)
- Loops where DOM changes between iterations
### 4. evaluate(js: str, variables: dict = None) → Python data
Execute JavaScript, returns dict/list/str/number/bool/None.
**ALWAYS use ```js blocks for anything beyond one-liners**:
```js extract_products
(function(){
return Array.from(document.querySelectorAll('.product')).map(p => ({
name: p.querySelector('.name')?.textContent,
price: p.querySelector('.price')?.textContent
}));
})()
```
```python
products = await evaluate(extract_products)
print(f"Found {len(products)} products")
```
**Passing Python variables to JavaScript**:
```js extract_data
(function(params) {
const maxItems = params.max_items || 100;
return Array.from(document.querySelectorAll('.item'))
.slice(0, maxItems)
.map(item => ({name: item.textContent}));
})
```
```python
result = await evaluate(extract_data, variables={'max_items': 50})
```
**Key rules**:
- Wrap in IIFE: `(function(){ ... })()`
- For variables: use `(function(params){ ... })` without final `()`
- NO JavaScript comments (`//` or `/* */`)
- NO backticks (\`) inside code blocks
- Use standard JS (NO jQuery)
- Do optional checks - and print the results to help you debug.
- Avoid complex queries where possible. Do all data processing in python.
- Avoid syntax errors. For more complex data use json.dumps(data).
### 5. done() - MANDATORY FINAL STEP
Final Output with done(text:str, success:bool, files_to_display:list[str] = [])
```python
summary = "Successfully extracted 600 items on 40 pages and saved them to the results.json file."
await done(
text=summary,
success=True,
files_to_display=['results.json', 'data.csv']
)
```
**Rules**:
1. `done()` must be the ONLY statement in this cell/response. In the steps before you must verify the final result.
2. For structured data/code: write to files, use `files_to_display`
3. For short tasks (<5 lines output): print directly in `done(text=...)`, skip file creation
4. NEVER embed JSON/code blocks in markdown templates (breaks `.format()`). Instead use json.dumps(data) or + to concatenate strings.
5. Set `success=False` if task impossible after many different attempts
---
## HINTS: Common Patterns & Pitfalls
### JavaScript Search > Scrolling
Before scrolling 2+ times, use JS to search entire document:
```js search_document
(function(){
const fullText = document.body.innerText;
return {
found: fullText.includes('Balance Sheet'),
sampleText: fullText.substring(0, 200)
};
})()
```
### Verify Search Results Loaded
After search submission, ALWAYS verify results exist:
```js verify_search_results
(function(){
return document.querySelectorAll("[class*=\\"result\\"]").length;
})()
```
```python
await input_text(index=SEARCH_INPUT, text="query", clear=True)
await send_keys(keys="Enter")
await asyncio.sleep(1)
result_count = await evaluate(verify_search_results)
if result_count == 0:
print("Search failed, trying alternative")
await navigate(f"https://site.com/search?q={query.replace(' ', '+')}")
else:
print(f"Search returned {result_count} results")
```
### Handle Dynamic/Obfuscated Classes
Modern sites use hashed classes (`_30jeq3`). After 2 failures, switch strategy:
In the exploration phase you can combine multiple strategies in parallel with error handling to find the best approach quickly.
**Strategy 1**: Extract by structure/position
```js extract_products_by_structure
(function(){
return Array.from(document.querySelectorAll('.product')).map(p => {
const link = p.querySelector('a[href*="/product/"]');
const priceContainer = p.querySelector('div:nth-child(3)');
return {
name: link?.textContent,
priceText: priceContainer?.textContent
};
});
})()
```
**Strategy 2**: Extract all text, parse in Python with regex
```python
items = await evaluate(extract_products_by_structure)
import re
for item in items:
prices = re.findall(r'[$₹€][\d,]+', item['priceText'])
item['price'] = prices[0] if prices else None
```
**Strategy 3**: Debug by printing structure
```js print_structure
(function(){
const el = document.querySelector('.product');
return {
html: el?.outerHTML.substring(0, 500),
classes: Array.from(el?.querySelectorAll('*') || [])
.map(e => e.className)
.filter(c => c.includes('price'))
};
})()
```
### Pagination: Try URL First
**Priority order**:
1. **Try URL parameters** (1 attempt): `?page=2`, `?p=2`, `?offset=20`, `/page/2/`
2. **If URL fails, search & click the next page button**
### Pre-Extraction Checklist
First verify page is loaded and you set the filters/settings correctly:
```js product_count
(function(){
return document.querySelectorAll(".product").length;
})()
```
```python
print("=== Applying filters ===")
await select_dropdown(index=789, text="Under $100")
await click(index=567) # Apply button
print("OK Filters applied")
filtered_count = await evaluate(product_count)
print(f"OK Page loaded with {filtered_count} products")
```
---
## STRATEGY: Execution Flow
### Phase 1: Exploration
- Navigate to target URL
- Dismiss overlays (cookies, modals)
- Apply all filters/settings BEFORE extraction
- Use JavaScript to search entire document for target content
- Explore DOM structure with various small test extractions in parallel with error handling
- Use try/except and null checks
- Print sub-information to validate approach
### Phase 2: Validation (Execute Cell-by-Cell!)
- Write general extraction function
- Test on small subset (1-5 items) with error handling
- Verify data structure in Python
- Check for missing/null fields
- Print sample data
- If extraction fails 2x, switch strategy
### Phase 3: Batch Processing
- Once strategy validated, increase batch size
- Loop with explicit counters
- Save incrementally to avoid data loss
- Handle pagination (URL first, then buttons)
- Track progress: `print(f"Page {i}: {len(items)} items. Total: {len(all_data)}")`
- Check if it works and then increase the batch size.
### Phase 4: Cleanup & Verification
- Verify all required data collected
- Filter duplicates
- Missing fields / Data? -> change strategy and keep going.
- Format/clean data in Python (NOT JavaScript)
- Write to files (JSON/CSV)
- Print final stats, but not all the data to avoid overwhelming the context.
- Inspect the output and reason if this is exactly the user intent or if the user wants more.
### Phase 5: Done
- Verify task completion
- Call `done()` with summary + `files_to_display`
---
## EXAMPLE: Complete Flow
**Task**: Extract products from paginated e-commerce site, save to JSON
### Step 1: Navigate + Dismiss Overlays
```js page_loaded
(function(){
return document.readyState === 'complete';
})()
```
```python
await navigate('https://example.com/products')
await asyncio.sleep(2)
loaded = await evaluate(page_loaded)
if not loaded:
print("Page not loaded, trying again")
await asyncio.sleep(1)
```
### Receive current browser state after cell execution - analyse it.
### Step 2: Dismiss Modals
```js dismiss_overlays
(function(){
document.querySelectorAll('button[id*="accept"]').forEach(b => b.click());
document.dispatchEvent(new KeyboardEvent('keydown', {key: 'Escape'}));
return 'dismissed';
})()
```
```python
await evaluate(dismiss_overlays)
```
### Step 3: Apply Filters
```python
await select_dropdown(index=123, text="Under $50")
await click(index=456) # Apply filters button
```
### Step 4: Explore - Test Single Element
```js test_single_element
(function(){
const first = document.querySelector('.product');
return {
html: first?.outerHTML.substring(0, 300),
name: first?.querySelector('.name')?.textContent,
price: first?.querySelector('.price')?.textContent
};
})()
```
```js find_heading_by_text
(function(){
const headings = Array.from(document.querySelectorAll('h2, h3'));
const target = headings.find(h => h.textContent.includes('Full Year 2024'));
return target ? target.textContent : null;
})()
```
```js find_element_by_text_content
(function(){
const elements = Array.from(document.querySelectorAll('dt'));
const locationLabel = elements.find(el => el.textContent.includes('Location'));
const nextSibling = locationLabel?.nextElementSibling;
return nextSibling ? nextSibling.textContent : null;
})()
```
```js get_product_urls
(function(){
  return Array.from(document.querySelectorAll('a[href*="product"]')).slice(0, 10).map(a => a.href);
})()
```
```python
# load more
await scroll(down=True, pages=3.0)
await asyncio.sleep(0.5)
await scroll(down=False, pages=2.5)
try:
list_of_urls = await evaluate(get_product_urls)
print(f"found {len(list_of_urls)} product urls, sample {list_of_urls[0] if list_of_urls else 'no urls found'}")
except Exception as e:
# different strategies
print("Error: No elements found")
try:
test = await evaluate(test_single_element)
print(f"Sample product: {test}")
except Exception as e:
# different strategies
print(f"Error: {e}")
```
### Step 5: Write General Extraction Function
```js extract_products
(function(){
return Array.from(document.querySelectorAll('.product')).map(p => ({
name: p.querySelector('.name')?.textContent?.trim(),
price: p.querySelector('.price')?.textContent?.trim(),
url: p.querySelector('a')?.href
})).filter(p => p.name && p.price);
})()
```
```python
products_page1 = await evaluate(extract_products)
print(f"Extracted {len(products_page1)} products from page 1: {products_page1[0] if products_page1 else 'no products found'}")
```
### Step 6: Test Pagination with URL
```python
await navigate('https://example.com/products?page=2')
await asyncio.sleep(2)
products_page2 = await evaluate(extract_products)
if len(products_page2) > 0:
print("OK URL pagination works!")
```
### Step 7: Loop and Collect All Pages
```python
all_products = []
page_num = 1
while page_num <= 50:
url = f"https://example.com/products?page={page_num}"
await navigate(url)
await asyncio.sleep(3)
items = await evaluate(extract_products)
if len(items) == 0:
print(f"Page {page_num} empty - reached end")
break
all_products.extend(items)
print(f"Page {page_num}: {len(items)} items. Total: {len(all_products)}")
page_num += 1
# if you have to click in the loop use selector and not the interactive index, because they invalidate after navigation.
```
### Step 8: Clean Data & Deduplicate
```python
import re
for product in all_products:
price_str = product['price']
price_clean = re.sub(r'[^0-9.]', '', price_str)
product['price_numeric'] = float(price_clean) if price_clean else None
# deduplicate by URL (dicts are unhashable, so set() would raise TypeError)
all_products = list({p['url']: p for p in all_products}.values())
# number of prices
valid_products = [p for p in all_products if p.get('price_numeric')]
print(f"OK {len(valid_products)} valid products with prices")
print(f"OK Cleaned {len(all_products)} products")
print(f"Sample cleaned: {json.dumps(valid_products[0], indent=2) if valid_products else 'no products found'}")
```
### Step 9: Prepare output, write File & verify result
```markdown summary
# Product Extraction Complete
Successfully extracted 100 products from 20 pages.
Full data saved to: products.json.
```
```python
with open('products.json', 'w', encoding='utf-8') as f:
json.dump(valid_products, f, indent=2, ensure_ascii=False)
print(f"OK Wrote products.json ({len(valid_products)} products)")
sample = json.dumps(valid_products[0], indent=2)
# Be careful with escaping and always print before using done.
final_summary = summary + "\nSample:\n" + sample
print(final_summary)
```
### Stop and inspect the output before continuing.
### If data is missing go back and change the strategy until all data is collected or you reach max steps.
### Step 10: Done in single response (After verifying the previous output)
```python
await done(text=final_summary, success=True, files_to_display=['products.json'])
```
---
## CRITICAL RULES
1. **NO `global` keyword** - Variables persist automatically
2. **No comments** in Python or JavaScript code, write concise code.
3. **Verify results after search** - Check result count > 0
4. **Call done(text, success) in separate step** - After verifying results - else continue
5. **Write structured data to files** - Never embed in markdown
6. Do not use jQuery.
7. Reason about the browser state and what you need to keep in mind on this page. E.g. popups, dynamic content, closed shadow DOM, iframes, scroll to load more...
8. If selectors fail, simply try different ones. Print many and then try different strategies.
---
## Available Libraries
**Pre-imported**: `json`, `asyncio`, `csv`, `re`, `datetime`, `Path`, `requests`
## User Task
Analyze user intent and complete the task successfully. Do not stop until completed.
Respond in the format the user requested.

View File

@@ -1,150 +0,0 @@
"""Utility functions for code-use agent."""
import re
def truncate_message_content(content: str, max_length: int = 10000) -> str:
    """Shorten message content for history storage.

    Content at or below max_length characters is returned unchanged. Longer
    content is cut at max_length and suffixed with a marker recording how many
    characters were dropped.
    """
    overflow = len(content) - max_length
    if overflow <= 0:
        return content
    marker = f'\n\n[... truncated {overflow} characters for history]'
    return content[:max_length] + marker
def detect_token_limit_issue(
completion: str,
completion_tokens: int | None,
max_tokens: int | None,
stop_reason: str | None,
) -> tuple[bool, str | None]:
"""
Detect if the LLM response hit token limits or is repetitive garbage.
Returns: (is_problematic, error_message)
"""
# Check 1: Stop reason indicates max_tokens
if stop_reason == 'max_tokens':
return True, f'Response terminated due to max_tokens limit (stop_reason: {stop_reason})'
# Check 2: Used 90%+ of max_tokens (if we have both values)
if completion_tokens is not None and max_tokens is not None and max_tokens > 0:
usage_ratio = completion_tokens / max_tokens
if usage_ratio >= 0.9:
return True, f'Response used {usage_ratio:.1%} of max_tokens ({completion_tokens}/{max_tokens})'
# Check 3: Last 6 characters repeat 40+ times (repetitive garbage)
if len(completion) >= 6:
last_6 = completion[-6:]
repetition_count = completion.count(last_6)
if repetition_count >= 40:
return True, f'Repetitive output detected: last 6 chars "{last_6}" appears {repetition_count} times'
return False, None
def extract_url_from_task(task: str) -> str | None:
"""Extract URL from task string using naive pattern matching."""
# Remove email addresses from task before looking for URLs
task_without_emails = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', task)
# Look for common URL patterns
patterns = [
r'https?://[^\s<>"\']+', # Full URLs with http/https
r'(?:www\.)?[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?', # Domain names with subdomains and optional paths
]
found_urls = []
for pattern in patterns:
matches = re.finditer(pattern, task_without_emails)
for match in matches:
url = match.group(0)
# Remove trailing punctuation that's not part of URLs
url = re.sub(r'[.,;:!?()\[\]]+$', '', url)
# Add https:// if missing
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
found_urls.append(url)
unique_urls = list(set(found_urls))
# If multiple URLs found, skip auto-navigation to avoid ambiguity
if len(unique_urls) > 1:
return None
# If exactly one URL found, return it
if len(unique_urls) == 1:
return unique_urls[0]
return None
def extract_code_blocks(text: str) -> dict[str, str]:
    """Extract all code blocks from markdown response.

    Supports:
    - ```python, ```js, ```javascript, ```bash, ```markdown, ```md
    - Named blocks: ```js variable_name → saved as 'variable_name' in namespace
    - Nested blocks: Use 4+ backticks for outer block when inner content has 3 backticks

    Returns dict mapping block_name -> content

    Note: Python blocks are NO LONGER COMBINED. Each python block executes separately
    to allow sequential execution with JS/bash blocks in between.
    """
    # Pattern groups: (1) opening backtick run of 3+, (2) language word,
    # (3) optional variable name, (4) body. The \1 backreference forces the
    # closing fence to use the same number of backticks as the opener, so
    # 3-backtick blocks nested inside a 4-backtick block stay intact.
    pattern = r'(`{3,})(\w+)(?:\s+(\w+))?\n(.*?)\1(?:\n|$)'
    matches = re.findall(pattern, text, re.DOTALL)
    blocks: dict[str, str] = {}
    python_block_counter = 0
    for backticks, lang, var_name, content in matches:
        lang = lang.lower()
        # Normalize language aliases to a canonical name
        if lang in ('javascript', 'js'):
            lang_normalized = 'js'
        elif lang in ('markdown', 'md'):
            lang_normalized = 'markdown'
        elif lang in ('sh', 'shell'):
            lang_normalized = 'bash'
        elif lang == 'python':
            lang_normalized = 'python'
        else:
            # Unknown language, skip
            continue
        # Only process supported types
        if lang_normalized in ('python', 'js', 'bash', 'markdown'):
            content = content.rstrip()  # Only strip trailing whitespace, preserve leading for indentation
            if content:
                # Determine the key to use
                if var_name:
                    # Named block - use the variable name (later blocks with the same name overwrite)
                    block_key = var_name
                    blocks[block_key] = content
                elif lang_normalized == 'python':
                    # Unnamed Python blocks - give each a unique key to preserve order
                    block_key = f'python_{python_block_counter}'
                    blocks[block_key] = content
                    python_block_counter += 1
                else:
                    # Other unnamed blocks (js, bash, markdown) - keep last one only
                    blocks[lang_normalized] = content
    # If we have multiple python blocks, mark the first one as 'python' for backward compat
    if python_block_counter > 0:
        blocks['python'] = blocks['python_0']
    # Fallback: if no python block but there's a generic ``` block (no language
    # tag), treat all such blocks joined together as python
    if python_block_counter == 0 and 'python' not in blocks:
        generic_pattern = r'```\n(.*?)```'
        generic_matches = re.findall(generic_pattern, text, re.DOTALL)
        if generic_matches:
            combined = '\n\n'.join(m.strip() for m in generic_matches if m.strip())
            if combined:
                blocks['python'] = combined
    return blocks

View File

@@ -1,403 +0,0 @@
"""Data models for code-use mode."""
from __future__ import annotations
import json
from enum import Enum
from pathlib import Path
from typing import Any
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
from uuid_extensions import uuid7str
from browser_use.tokens.views import UsageSummary
class CellType(str, Enum):
    """Type of notebook cell."""

    CODE = 'code'  # executable code cell
    MARKDOWN = 'markdown'  # markdown/text cell
class ExecutionStatus(str, Enum):
    """Execution status of a cell."""

    PENDING = 'pending'  # not yet executed
    RUNNING = 'running'  # currently executing
    SUCCESS = 'success'  # executed without error
    ERROR = 'error'  # execution raised/failed
class CodeCell(BaseModel):
    """Represents a code cell in the notebook-like execution."""

    model_config = ConfigDict(extra='forbid')

    # Unique identifier, generated via uuid7str
    id: str = Field(default_factory=uuid7str)
    # Cell kind; defaults to an executable code cell
    cell_type: CellType = CellType.CODE
    source: str = Field(description='The code to execute')
    output: str | None = Field(default=None, description='The output of the code execution')
    execution_count: int | None = Field(default=None, description='The execution count')
    # Current lifecycle state (see ExecutionStatus values)
    status: ExecutionStatus = Field(default=ExecutionStatus.PENDING)
    error: str | None = Field(default=None, description='Error message if execution failed')
    browser_state: str | None = Field(default=None, description='Browser state after execution')
class NotebookSession(BaseModel):
    """Represents a notebook-like session: an ordered list of cells plus
    the shared namespace and execution counter they operate against."""

    model_config = ConfigDict(extra='forbid')

    id: str = Field(default_factory=uuid7str)
    cells: list[CodeCell] = Field(default_factory=list)
    current_execution_count: int = Field(default=0)
    namespace: dict[str, Any] = Field(default_factory=dict, description='Current namespace state')
    _complete_history: list[CodeAgentHistory] = PrivateAttr(default_factory=list)
    _usage_summary: UsageSummary | None = PrivateAttr(default=None)

    def add_cell(self, source: str) -> CodeCell:
        """Append a new code cell holding the given source and return it."""
        new_cell = CodeCell(source=source)
        self.cells.append(new_cell)
        return new_cell

    def get_cell(self, cell_id: str) -> CodeCell | None:
        """Look up a cell by its id; returns None when no cell matches."""
        return next((candidate for candidate in self.cells if candidate.id == cell_id), None)

    def get_latest_cell(self) -> CodeCell | None:
        """Return the most recently added cell, or None for an empty session."""
        return self.cells[-1] if self.cells else None

    def increment_execution_count(self) -> int:
        """Advance the execution counter by one and return the new value."""
        self.current_execution_count += 1
        return self.current_execution_count

    @property
    def history(self) -> CodeAgentHistoryList:
        """Expose accumulated history via the AgentHistoryList-compatible wrapper."""
        return CodeAgentHistoryList(self._complete_history, self._usage_summary)
class NotebookExport(BaseModel):
    """Export format for Jupyter notebook."""

    model_config = ConfigDict(extra='forbid')

    # nbformat 4 / minor 5 — presumably the current Jupyter notebook schema
    # version; TODO confirm against the exporter that fills this model
    nbformat: int = Field(default=4)
    nbformat_minor: int = Field(default=5)
    # Notebook-level metadata, serialized as-is
    metadata: dict[str, Any] = Field(default_factory=dict)
    # Cells kept as raw dicts rather than typed models
    cells: list[dict[str, Any]] = Field(default_factory=list)
class CodeAgentModelOutput(BaseModel):
    """Model output for CodeAgent - contains the code and full LLM response."""

    model_config = ConfigDict(extra='forbid')

    model_output: str = Field(description='The extracted code from the LLM response')
    full_response: str = Field(description='The complete LLM response including any text/reasoning')
class CodeAgentResult(BaseModel):
    """Result of executing a code cell in CodeAgent."""

    model_config = ConfigDict(extra='forbid')

    extracted_content: str | None = Field(default=None, description='Output from code execution')
    error: str | None = Field(default=None, description='Error message if execution failed')
    is_done: bool = Field(default=False, description='Whether task is marked as done')
    # Only meaningful when is_done is True; set by the agent's done() call
    success: bool | None = Field(default=None, description='Self-reported success from done() call')
class CodeAgentState(BaseModel):
    """State information for a CodeAgent step.

    Screenshots are stored on disk and referenced by path; use get_screenshot()
    to load one lazily as base64.
    """

    model_config = ConfigDict(extra='forbid', arbitrary_types_allowed=True)

    url: str | None = Field(default=None, description='Current page URL')
    title: str | None = Field(default=None, description='Current page title')
    screenshot_path: str | None = Field(default=None, description='Path to screenshot file')

    def get_screenshot(self) -> str | None:
        """Load the screenshot from disk and return it as a base64 string.

        Returns None when no path is recorded, the file does not exist, or
        reading fails for any reason (best-effort semantics).
        """
        if not self.screenshot_path:
            return None
        import base64

        # Path is imported at module level; no need to re-import it here
        path_obj = Path(self.screenshot_path)
        if not path_obj.exists():
            return None
        try:
            return base64.b64encode(path_obj.read_bytes()).decode('utf-8')
        except Exception:
            # Deliberate best-effort: a missing/unreadable screenshot is not fatal
            return None
class CodeAgentStepMetadata(BaseModel):
    """Metadata for a single CodeAgent step including timing and token information."""

    model_config = ConfigDict(extra='forbid')

    input_tokens: int | None = Field(default=None, description='Number of input tokens used')
    output_tokens: int | None = Field(default=None, description='Number of output tokens used')
    step_start_time: float = Field(description='Step start timestamp (Unix time)')
    step_end_time: float = Field(description='Step end timestamp (Unix time)')

    @property
    def duration_seconds(self) -> float:
        """Calculate step duration in seconds.

        Simple end-minus-start difference; not validated, so a negative value
        is possible if the timestamps were recorded out of order.
        """
        return self.step_end_time - self.step_start_time
class CodeAgentHistory(BaseModel):
    """History item for CodeAgent actions."""

    model_config = ConfigDict(extra='forbid', arbitrary_types_allowed=True)

    model_output: CodeAgentModelOutput | None = Field(default=None, description='LLM output for this step')
    result: list[CodeAgentResult] = Field(default_factory=list, description='Results from code execution')
    state: CodeAgentState = Field(description='Browser state at this step')
    metadata: CodeAgentStepMetadata | None = Field(default=None, description='Step timing and token metadata')
    screenshot_path: str | None = Field(default=None, description='Legacy field for screenshot path')

    def model_dump(self, **kwargs) -> dict[str, Any]:
        """Custom serialization for CodeAgentHistory.

        Serializes each nested model explicitly, emitting None for unset
        optional members.
        NOTE(review): **kwargs is accepted but not forwarded to the nested
        model_dump calls — confirm this is intentional.
        """
        return {
            'model_output': self.model_output.model_dump() if self.model_output else None,
            'result': [r.model_dump() for r in self.result],
            'state': self.state.model_dump(),
            'metadata': self.metadata.model_dump() if self.metadata else None,
            'screenshot_path': self.screenshot_path,
        }
class CodeAgentHistoryList:
    """Compatibility wrapper for CodeAgentHistory that provides an
    AgentHistoryList-like API.

    Wraps the raw list of CodeAgentHistory steps plus an optional usage
    summary and exposes the read-only accessors that callers of
    AgentHistoryList expect.
    """

    def __init__(self, complete_history: list[CodeAgentHistory], usage_summary: UsageSummary | None) -> None:
        """Initialize with CodeAgent history data."""
        self._complete_history = complete_history
        self._usage_summary = usage_summary

    @property
    def history(self) -> list[CodeAgentHistory]:
        """Get the raw history list."""
        return self._complete_history

    @property
    def usage(self) -> UsageSummary | None:
        """Get the usage summary."""
        return self._usage_summary

    def __len__(self) -> int:
        """Return the number of history items."""
        return len(self._complete_history)

    def __str__(self) -> str:
        """Representation of the CodeAgentHistoryList object."""
        return f'CodeAgentHistoryList(steps={len(self._complete_history)}, action_results={len(self.action_results())})'

    def __repr__(self) -> str:
        """Representation of the CodeAgentHistoryList object."""
        return self.__str__()

    def final_result(self) -> None | str:
        """Extracted content of the last result of the last step, or None."""
        if self._complete_history and self._complete_history[-1].result:
            return self._complete_history[-1].result[-1].extracted_content
        return None

    def is_done(self) -> bool:
        """Check if the agent marked the task as done in its final result."""
        if self._complete_history and len(self._complete_history[-1].result) > 0:
            last_result = self._complete_history[-1].result[-1]
            return last_result.is_done is True
        return False

    def is_successful(self) -> bool | None:
        """Self-reported success of the final done() call; None if not done yet."""
        if self._complete_history and len(self._complete_history[-1].result) > 0:
            last_result = self._complete_history[-1].result[-1]
            if last_result.is_done is True:
                return last_result.success
        return None

    def errors(self) -> list[str | None]:
        """Get one error (or None) per step.

        Each step can carry at most one error, so only the first error of a
        step's results is reported.
        """
        errors = []
        for h in self._complete_history:
            step_errors = [r.error for r in h.result if r.error]
            errors.append(step_errors[0] if step_errors else None)
        return errors

    def has_errors(self) -> bool:
        """Check if the agent has any non-None errors."""
        return any(error is not None for error in self.errors())

    def urls(self) -> list[str | None]:
        """Get the page URL recorded at each step (None when missing)."""
        # state.url is already str | None — no conditional needed
        return [h.state.url for h in self._complete_history]

    def screenshot_paths(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
        """Get screenshot paths from history.

        Args:
            n_last: Only consider the last n steps (0 → empty list; None → all).
            return_none_if_not_screenshot: When True, keep a None placeholder
                for steps without a screenshot; when False, drop those steps.
        """
        if n_last == 0:
            return []
        items = self._complete_history if n_last is None else self._complete_history[-n_last:]
        if return_none_if_not_screenshot:
            # path is already str | None — pass it through as-is
            return [h.state.screenshot_path for h in items]
        return [h.state.screenshot_path for h in items if h.state.screenshot_path is not None]

    def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
        """Get screenshots from history as base64 strings, loading each from disk."""
        if n_last == 0:
            return []
        history_items = self._complete_history if n_last is None else self._complete_history[-n_last:]
        screenshots = []
        for item in history_items:
            screenshot_b64 = item.state.get_screenshot()
            if screenshot_b64:
                screenshots.append(screenshot_b64)
            elif return_none_if_not_screenshot:
                screenshots.append(None)
        return screenshots

    def action_results(self) -> list[CodeAgentResult]:
        """Get all results from history, flattened across steps."""
        results = []
        for h in self._complete_history:
            results.extend([r for r in h.result if r])
        return results

    def extracted_content(self) -> list[str]:
        """Get all non-empty extracted content from history."""
        content = []
        for h in self._complete_history:
            content.extend([r.extracted_content for r in h.result if r.extracted_content])
        return content

    def number_of_steps(self) -> int:
        """Get the number of steps in the history."""
        return len(self._complete_history)

    def total_duration_seconds(self) -> float:
        """Get total duration of all steps in seconds (steps without metadata count as 0)."""
        total = 0.0
        for h in self._complete_history:
            if h.metadata:
                total += h.metadata.duration_seconds
        return total

    def last_action(self) -> None | dict:
        """Last action in history - returns the last code execution, or None."""
        if self._complete_history and self._complete_history[-1].model_output:
            return {
                'execute_code': {
                    'code': self._complete_history[-1].model_output.model_output,
                    'full_response': self._complete_history[-1].model_output.full_response,
                }
            }
        return None

    def action_names(self) -> list[str]:
        """Get all action names from history - returns 'execute_code' for each code execution."""
        action_names = []
        for action in self.model_actions():
            actions = list(action.keys())
            if actions:
                action_names.append(actions[0])
        return action_names

    def model_thoughts(self) -> list[Any]:
        """Get all thoughts from history - returns model_output for CodeAgent."""
        return [h.model_output for h in self._complete_history if h.model_output]

    def model_outputs(self) -> list[CodeAgentModelOutput]:
        """Get all model outputs from history."""
        return [h.model_output for h in self._complete_history if h.model_output]

    def model_actions(self) -> list[dict]:
        """Get all actions from history - one 'execute_code' dict per result."""
        actions = []
        for h in self._complete_history:
            if h.model_output:
                # Create one action dict per result (code execution)
                for _ in h.result:
                    action_dict = {
                        'execute_code': {
                            'code': h.model_output.model_output,
                            'full_response': h.model_output.full_response,
                        }
                    }
                    actions.append(action_dict)
        return actions

    def action_history(self) -> list[list[dict]]:
        """Get truncated action history grouped by step."""
        step_outputs = []
        for h in self._complete_history:
            step_actions = []
            if h.model_output:
                for result in h.result:
                    action_dict = {
                        'execute_code': {
                            'code': h.model_output.model_output,
                        },
                        'result': {
                            'extracted_content': result.extracted_content,
                            'is_done': result.is_done,
                            'success': result.success,
                            'error': result.error,
                        },
                    }
                    step_actions.append(action_dict)
            step_outputs.append(step_actions)
        return step_outputs

    def model_actions_filtered(self, include: list[str] | None = None) -> list[dict]:
        """Get all model actions from history filtered - always empty for CodeAgent."""
        return []

    def add_item(self, history_item: CodeAgentHistory) -> None:
        """Add a history item to the list."""
        self._complete_history.append(history_item)

    def model_dump(self, **kwargs) -> dict[str, Any]:
        """Custom serialization for CodeAgentHistoryList."""
        return {
            'history': [h.model_dump(**kwargs) for h in self._complete_history],
            'usage': self._usage_summary.model_dump() if self._usage_summary else None,
        }

    def save_to_file(self, filepath: str | Path, sensitive_data: dict[str, str | dict[str, str]] | None = None) -> None:
        """Save history to a JSON file, creating parent directories as needed.

        Note: sensitive_data is accepted for API compatibility but is not used.
        """
        # The previous `try: ... except Exception as e: raise e` added nothing;
        # let exceptions propagate naturally.
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        data = self.model_dump()
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

View File

@@ -76,6 +76,13 @@ class OldConfig:
raise AssertionError('BROWSER_USE_CLOUD_UI_URL must be a valid URL if set')
return url
@property
def BROWSER_USE_MODEL_PRICING_URL(self) -> str:
url = os.getenv('BROWSER_USE_MODEL_PRICING_URL', '')
if url and '://' not in url:
raise AssertionError('BROWSER_USE_MODEL_PRICING_URL must be a valid URL if set')
return url
# Path configuration
@property
def XDG_CACHE_HOME(self) -> Path:
@@ -195,6 +202,7 @@ class FlatEnvConfig(BaseSettings):
BROWSER_USE_CLOUD_SYNC: bool | None = Field(default=None)
BROWSER_USE_CLOUD_API_URL: str = Field(default='https://api.browser-use.com')
BROWSER_USE_CLOUD_UI_URL: str = Field(default='')
BROWSER_USE_MODEL_PRICING_URL: str = Field(default='')
# Path configuration
XDG_CACHE_HOME: str = Field(default='~/.cache')

View File

@@ -9,7 +9,6 @@ from cdp_use.cdp.domsnapshot.commands import CaptureSnapshotReturns
from cdp_use.cdp.domsnapshot.types import (
LayoutTreeSnapshot,
NodeTreeSnapshot,
RareBooleanData,
)
from browser_use.dom.views import DOMRect, EnhancedSnapshotNode
@@ -30,9 +29,9 @@ REQUIRED_COMPUTED_STYLES = [
]
def _parse_rare_boolean_data(rare_data: RareBooleanData, index: int) -> bool | None:
"""Parse rare boolean data from snapshot - returns True if index is in the rare data."""
return index in rare_data['index']
def _parse_rare_boolean_data(rare_data_set: set[int], index: int) -> bool | None:
"""Parse rare boolean data from snapshot - returns True if index is in the rare data set."""
return index in rare_data_set
def _parse_computed_styles(strings: list[str], style_indices: list[int]) -> dict[str, str]:
@@ -85,11 +84,18 @@ def build_snapshot_lookup(
if node_index not in layout_index_map: # Only store first occurrence
layout_index_map[node_index] = layout_idx
# Pre-convert rare boolean data from list to set for O(1) lookups.
# The raw CDP data uses List[int] which makes `index in list` O(n).
# Called once per node, this was O(n²) total — the #1 bottleneck.
# At 20k elements: 5,925ms (list) → 2ms (set) = 3,000x speedup.
has_clickable_data = 'isClickable' in nodes
is_clickable_set: set[int] = set(nodes['isClickable']['index']) if has_clickable_data else set()
# Build snapshot lookup for each backend node id
for backend_node_id, snapshot_index in backend_node_to_snapshot_index.items():
is_clickable = None
if 'isClickable' in nodes:
is_clickable = _parse_rare_boolean_data(nodes['isClickable'], snapshot_index)
if has_clickable_data:
is_clickable = _parse_rare_boolean_data(is_clickable_set, snapshot_index)
# Find corresponding layout node
cursor_style = None

View File

@@ -24,6 +24,7 @@ async def extract_clean_markdown(
dom_service: DomService | None = None,
target_id: str | None = None,
extract_links: bool = False,
extract_images: bool = False,
) -> tuple[str, dict[str, Any]]:
"""Extract clean markdown from browser content using enhanced DOM tree.
@@ -35,6 +36,7 @@ async def extract_clean_markdown(
dom_service: DOM service instance (page actor path)
target_id: Target ID for the page (required when using dom_service)
extract_links: Whether to preserve links in markdown
extract_images: Whether to preserve inline image src URLs in markdown
Returns:
tuple: (clean_markdown_content, content_statistics)
@@ -68,6 +70,9 @@ async def extract_clean_markdown(
# Use markdownify for clean markdown conversion
from markdownify import markdownify as md
# 'td', 'th', and headings are the only elements where markdownify sets the _inline context,
# which causes img elements to be stripped to just alt text when keep_inline_images_in=[]
_keep_inline_images_in = ['td', 'th', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] if extract_images else []
content = md(
page_html,
heading_style='ATX', # Use # style headings
@@ -79,7 +84,7 @@ async def extract_clean_markdown(
escape_misc=False, # Don't escape other characters (cleaner output)
autolinks=False, # Don't convert URLs to <> format
default_title=False, # Don't add default title attributes
keep_inline_images_in=[], # Don't keep inline images in any tags (we already filter base64 in HTML)
keep_inline_images_in=_keep_inline_images_in, # Include image src URLs when extract_images=True
)
initial_markdown_length = len(content)

View File

@@ -1,287 +0,0 @@
# @file purpose: Ultra-compact serializer optimized for code-use agents
# Focuses on minimal token usage while preserving essential interactive context
from browser_use.dom.utils import cap_text_length
from browser_use.dom.views import (
EnhancedDOMTreeNode,
NodeType,
SimplifiedNode,
)
# Minimal but sufficient attribute list for code agents
CODE_USE_KEY_ATTRIBUTES = [
'id', # Essential for element selection
'name', # For form inputs
'type', # For input types
'placeholder', # For empty inputs
'aria-label', # For buttons without text
'value', # Current values
'alt', # For images
'class', # Keep top 2 classes for common selectors
]
# Interactive elements agent can use
INTERACTIVE_ELEMENTS = {
'a',
'button',
'input',
'textarea',
'select',
'form',
}
# Semantic structure elements - expanded to include more content containers
SEMANTIC_STRUCTURE = {
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'nav',
'main',
'header',
'footer',
'article',
'section',
'p', # Paragraphs often contain prices and product info
'span', # Spans often contain prices and labels
'div', # Divs with useful attributes (id/class) should be shown
'ul',
'ol',
'li',
'label',
'img',
}
class DOMCodeAgentSerializer:
"""Optimized DOM serializer for code-use agents - balances token efficiency with context."""
@staticmethod
def serialize_tree(node: SimplifiedNode | None, include_attributes: list[str], depth: int = 0) -> str:
"""
Serialize DOM tree with smart token optimization.
Strategy:
- Keep top 2 CSS classes for querySelector compatibility
- Show div/span/p elements with useful attributes or text
- Show all interactive + semantic elements
- Inline text up to 80 chars for better context
"""
if not node:
return ''
# Skip excluded/hidden nodes
if hasattr(node, 'excluded_by_parent') and node.excluded_by_parent:
return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)
if not node.should_display:
return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)
formatted_text = []
depth_str = ' ' * depth # Use 2 spaces instead of tabs for compactness
if node.original_node.node_type == NodeType.ELEMENT_NODE:
tag = node.original_node.tag_name.lower()
is_visible = node.original_node.snapshot_node and node.original_node.is_visible
# Skip invisible (except iframes)
if not is_visible and tag not in ['iframe', 'frame']:
return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)
# Special handling for iframes
if tag in ['iframe', 'frame']:
return DOMCodeAgentSerializer._serialize_iframe(node, include_attributes, depth)
# Build minimal attributes
attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(node.original_node)
# Decide if element should be shown
is_interactive = tag in INTERACTIVE_ELEMENTS
is_semantic = tag in SEMANTIC_STRUCTURE
has_useful_attrs = bool(attributes_str)
has_text = DOMCodeAgentSerializer._has_direct_text(node)
# Skip non-semantic, non-interactive containers without attributes
if not is_interactive and not is_semantic and not has_useful_attrs and not has_text:
return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)
# Collapse pointless wrappers
if tag in {'div', 'span'} and not has_useful_attrs and not has_text and len(node.children) == 1:
return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)
# Build element
line = f'{depth_str}<{tag}'
if attributes_str:
line += f' {attributes_str}'
# Inline text
inline_text = DOMCodeAgentSerializer._get_inline_text(node)
if inline_text:
line += f'>{inline_text}'
else:
line += '>'
formatted_text.append(line)
# Children (only if no inline text)
if node.children and not inline_text:
children_text = DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth + 1)
if children_text:
formatted_text.append(children_text)
elif node.original_node.node_type == NodeType.TEXT_NODE:
# Handled inline with parent
pass
elif node.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
# Shadow DOM - minimal marker
if node.children:
formatted_text.append(f'{depth_str}#shadow')
children_text = DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth + 1)
if children_text:
formatted_text.append(children_text)
return '\n'.join(formatted_text)
@staticmethod
def _serialize_children(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
"""Serialize children."""
children_output = []
for child in node.children:
child_text = DOMCodeAgentSerializer.serialize_tree(child, include_attributes, depth)
if child_text:
children_output.append(child_text)
return '\n'.join(children_output)
@staticmethod
def _build_minimal_attributes(node: EnhancedDOMTreeNode) -> str:
"""Build minimal but useful attributes - keep top 2 classes for selectors."""
attrs = []
if node.attributes:
for attr in CODE_USE_KEY_ATTRIBUTES:
if attr in node.attributes:
value = str(node.attributes[attr]).strip()
if value:
# Special handling for class - keep only first 2 classes
if attr == 'class':
classes = value.split()[:2]
value = ' '.join(classes)
# Cap at 25 chars
value = cap_text_length(value, 25)
attrs.append(f'{attr}="{value}"')
return ' '.join(attrs)
@staticmethod
def _has_direct_text(node: SimplifiedNode) -> bool:
"""Check if node has direct text children."""
for child in node.children:
if child.original_node.node_type == NodeType.TEXT_NODE:
text = child.original_node.node_value.strip() if child.original_node.node_value else ''
if len(text) > 1:
return True
return False
@staticmethod
def _get_inline_text(node: SimplifiedNode) -> str:
"""Get inline text (max 80 chars for better context)."""
text_parts = []
for child in node.children:
if child.original_node.node_type == NodeType.TEXT_NODE:
text = child.original_node.node_value.strip() if child.original_node.node_value else ''
if text and len(text) > 1:
text_parts.append(text)
if not text_parts:
return ''
combined = ' '.join(text_parts)
return cap_text_length(combined, 40)
@staticmethod
def _serialize_iframe(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
"""Handle iframe minimally."""
formatted_text = []
depth_str = ' ' * depth
tag = node.original_node.tag_name.lower()
# Minimal iframe marker
attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(node.original_node)
line = f'{depth_str}<{tag}'
if attributes_str:
line += f' {attributes_str}'
line += '>'
formatted_text.append(line)
# Iframe content
if node.original_node.content_document:
formatted_text.append(f'{depth_str} #iframe-content')
# Find and serialize body content only
for child_node in node.original_node.content_document.children_nodes or []:
if child_node.tag_name.lower() == 'html':
for html_child in child_node.children:
if html_child.tag_name.lower() == 'body':
for body_child in html_child.children:
DOMCodeAgentSerializer._serialize_document_node(
body_child, formatted_text, include_attributes, depth + 2
)
break
return '\n'.join(formatted_text)
@staticmethod
def _serialize_document_node(
dom_node: EnhancedDOMTreeNode, output: list[str], include_attributes: list[str], depth: int
) -> None:
"""Serialize document node without SimplifiedNode wrapper."""
depth_str = ' ' * depth
if dom_node.node_type == NodeType.ELEMENT_NODE:
tag = dom_node.tag_name.lower()
# Skip invisible
is_visible = dom_node.snapshot_node and dom_node.is_visible
if not is_visible:
return
# Check if worth showing
is_interactive = tag in INTERACTIVE_ELEMENTS
is_semantic = tag in SEMANTIC_STRUCTURE
attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(dom_node)
if not is_interactive and not is_semantic and not attributes_str:
# Skip but process children
for child in dom_node.children:
DOMCodeAgentSerializer._serialize_document_node(child, output, include_attributes, depth)
return
# Build element
line = f'{depth_str}<{tag}'
if attributes_str:
line += f' {attributes_str}'
# Get text
text_parts = []
for child in dom_node.children:
if child.node_type == NodeType.TEXT_NODE and child.node_value:
text = child.node_value.strip()
if text and len(text) > 1:
text_parts.append(text)
if text_parts:
combined = ' '.join(text_parts)
line += f'>{cap_text_length(combined, 25)}'
else:
line += '>'
output.append(line)
# Process non-text children
for child in dom_node.children:
if child.node_type != NodeType.TEXT_NODE:
DOMCodeAgentSerializer._serialize_document_node(child, output, include_attributes, depth + 1)

View File

@@ -36,10 +36,21 @@ class RectUnionPure:
"""
Maintains a *disjoint* set of rectangles.
No external dependencies - fine for a few thousand rectangles.
A safety cap (_MAX_RECTS) prevents exponential explosion on pages with
many overlapping translucent layers. Once the cap is hit, contains()
conservatively returns False (i.e. nothing is hidden), preserving
correctness at the cost of less aggressive paint-order filtering.
"""
__slots__ = ('_rects',)
# Safety cap: with complex overlapping layers, each add() can fragment
# existing rects into up to 4 pieces each. On heavy pages (20k+ elements)
# this can cause exponential growth. 5000 is generous enough for normal
# pages but prevents runaway memory/CPU.
_MAX_RECTS = 5000
def __init__(self):
self._rects: list[Rect] = []
@@ -101,6 +112,10 @@ class RectUnionPure:
Insert r unless it is already covered.
Returns True if the union grew.
"""
# Safety cap: stop accepting new rects to prevent exponential explosion
if len(self._rects) >= self._MAX_RECTS:
return False
if self.contains(r):
return False

View File

@@ -1175,11 +1175,24 @@ class DOMTreeSerializer:
attributes_to_include['placeholder'] = 'mm/dd/yyyy'
attributes_to_include['format'] = 'mm/dd/yyyy'
# Never include values from password fields - they contain secrets that must not
# leak into DOM snapshots sent to the LLM, where prompt injection could exfiltrate them.
is_password_field = (
node.tag_name
and node.tag_name.lower() == 'input'
and node.attributes
and node.attributes.get('type', '').lower() == 'password'
)
# Include accessibility properties
if node.ax_node and node.ax_node.properties:
# Properties that carry field values - must be excluded for password fields
value_properties = {'value', 'valuetext'}
for prop in node.ax_node.properties:
try:
if prop.name in include_attributes and prop.value is not None:
if is_password_field and prop.name in value_properties:
continue
# Convert boolean to lowercase string, keep others as-is
if isinstance(prop.value, bool):
attributes_to_include[prop.name] = str(prop.value).lower()
@@ -1193,8 +1206,10 @@ class DOMTreeSerializer:
# Special handling for form elements - ensure current value is shown
# For text inputs, textareas, and selects, prioritize showing the current value from AX tree
if node.tag_name and node.tag_name.lower() in ['input', 'textarea', 'select']:
if is_password_field:
attributes_to_include.pop('value', None)
# ALWAYS check AX tree - it reflects actual typed value, DOM attribute may not update
if node.ax_node and node.ax_node.properties:
elif node.ax_node and node.ax_node.properties:
for prop in node.ax_node.properties:
# Try valuetext first (human-readable display value)
if prop.name == 'valuetext' and prop.value:

View File

@@ -427,6 +427,10 @@ class DomService:
iframe_scroll_ms = (time.time() - start_iframe_scroll) * 1000
# Detect elements with JavaScript click event listeners (without mutating DOM)
# On heavy pages (>10k elements) the querySelectorAll('*') + getEventListeners()
# loop plus per-element DOM.describeNode CDP calls can take 10s+.
# The JS expression below bails out early if the page is too heavy.
# Elements are still detected via the accessibility tree and ClickableElementDetector.
start_js_listener_detection = time.time()
js_click_listener_backend_ids: set[int] = set()
try:
@@ -440,9 +444,15 @@ class DomService:
return null;
}
const elementsWithListeners = [];
const allElements = document.querySelectorAll('*');
// Skip on heavy pages — listener detection is too expensive
if (allElements.length > 10000) {
return null;
}
const elementsWithListeners = [];
for (const el of allElements) {
try {
const listeners = getEventListeners(el);
@@ -936,38 +946,57 @@ class DomService:
# Use pre-fetched all_frames to find the iframe's target (no redundant CDP call)
frame_id = node.get('frameId', None)
# Fallback: if frameId is missing or not in all_frames, try URL matching via
# the src attribute. This handles dynamically-injected iframes (e.g. HubSpot
# popups, chat widgets) where Chrome hasn't yet registered the frameId in the
# frame tree at DOM-snapshot time.
if (not frame_id or frame_id not in all_frames) and attributes:
src = attributes.get('src', '')
if src:
src_base = src.split('?')[0].rstrip('/')
for fid, finfo in all_frames.items():
frame_url = finfo.get('url', '').split('?')[0].rstrip('/')
if frame_url and frame_url == src_base:
frame_id = fid
self.logger.debug(f'Matched cross-origin iframe by src URL: {src!r} -> frameId={fid}')
break
iframe_document_target = None
if frame_id:
frame_info = all_frames.get(frame_id)
iframe_document_target = None
if frame_info and frame_info.get('frameTargetId'):
iframe_target_id = frame_info['frameTargetId']
# Use frameTargetId directly from all_frames — get_all_frames() already
# validated connectivity. Do NOT gate on session_manager.get_target():
# there is a race where _target_sessions is set (inside the lock in
# _handle_target_attached) before _targets is populated (outside the
# lock), so get_target() can transiently return None for a live target.
iframe_target = self.browser_session.session_manager.get_target(iframe_target_id)
if iframe_target:
iframe_document_target = {
'targetId': iframe_target.target_id,
'url': iframe_target.url,
'title': iframe_target.title,
'type': iframe_target.target_type,
}
else:
iframe_document_target = None
iframe_document_target = {
'targetId': iframe_target_id,
'url': iframe_target.url if iframe_target else frame_info.get('url', ''),
'title': iframe_target.title if iframe_target else frame_info.get('title', ''),
'type': iframe_target.target_type if iframe_target else 'iframe',
}
# if target actually exists in one of the frames, just recursively build the dom tree for it
if iframe_document_target:
self.logger.debug(
f'Getting content document for iframe {node.get("frameId", None)} at depth {iframe_depth + 1}'
)
content_document, _ = await self.get_dom_tree(
target_id=iframe_document_target['targetId'],
all_frames=all_frames,
# TODO: experiment with this values -> not sure whether the whole cross origin iframe should be ALWAYS included as soon as some part of it is visible or not.
# Current config: if the cross origin iframe is AT ALL visible, then just include everything inside of it!
# initial_html_frames=updated_html_frames,
initial_total_frame_offset=total_frame_offset,
iframe_depth=iframe_depth + 1,
)
dom_tree_node.content_document = content_document
dom_tree_node.content_document.parent_node = dom_tree_node
try:
content_document, _ = await self.get_dom_tree(
target_id=iframe_document_target['targetId'],
all_frames=all_frames,
# Current config: if the cross origin iframe is AT ALL visible, include everything inside it
initial_total_frame_offset=total_frame_offset,
iframe_depth=iframe_depth + 1,
)
dom_tree_node.content_document = content_document
dom_tree_node.content_document.parent_node = dom_tree_node
except Exception as e:
self.logger.debug(f'Failed to get DOM tree for cross-origin iframe {frame_id}: {e}')
return dom_tree_node
@@ -1075,10 +1104,12 @@ class DomService:
pagination_buttons: list[dict[str, str | int | bool]] = []
# Common pagination patterns to look for
# `«` and `»` are ambiguous across sites, so treat them only as prev/next
# fallback symbols and let word-based first/last signals win
next_patterns = ['next', '>', '»', '', 'siguiente', 'suivant', 'weiter', 'volgende']
prev_patterns = ['prev', 'previous', '<', '«', '', 'anterior', 'précédent', 'zurück', 'vorige']
first_patterns = ['first', '', '«', 'primera', 'première', 'erste', 'eerste']
last_patterns = ['last', '', '»', 'última', 'dernier', 'letzte', 'laatste']
first_patterns = ['first', '', 'primera', 'première', 'erste', 'eerste']
last_patterns = ['last', '', 'última', 'dernier', 'letzte', 'laatste']
for index, node in selector_map.items():
# Skip non-clickable elements
@@ -1104,18 +1135,18 @@ class DomService:
button_type: str | None = None
# Check for next button
if any(pattern in all_text for pattern in next_patterns):
button_type = 'next'
# Check for previous button
elif any(pattern in all_text for pattern in prev_patterns):
button_type = 'prev'
# Check for first button
elif any(pattern in all_text for pattern in first_patterns):
# Match specific first/last semantics before generic prev/next fallbacks.
if any(pattern in all_text for pattern in first_patterns):
button_type = 'first'
# Check for last button
elif any(pattern in all_text for pattern in last_patterns):
button_type = 'last'
# Check for next button
elif any(pattern in all_text for pattern in next_patterns):
button_type = 'next'
# Check for previous button
elif any(pattern in all_text for pattern in prev_patterns):
button_type = 'prev'
# Check for numeric page buttons (single or double digit)
elif text.isdigit() and len(text) <= 2 and role in ['button', 'link', '']:
button_type = 'page_number'

View File

@@ -1,5 +1,7 @@
import asyncio
import base64
import csv
import io
import os
import re
import shutil
@@ -164,12 +166,68 @@ class JsonFile(BaseFile):
class CsvFile(BaseFile):
"""CSV file implementation"""
"""CSV file implementation with automatic RFC 4180 normalization.
LLMs frequently produce malformed CSV (missing quotes around fields with commas,
inconsistent empty fields, unescaped internal quotes). This class parses the raw
content through Python's csv module on every write to guarantee well-formed output.
"""
@property
def extension(self) -> str:
return 'csv'
@staticmethod
def _normalize_csv(raw: str) -> str:
"""Parse and re-serialize CSV content to fix quoting, empty fields, and escaping.
Handles common LLM mistakes: unquoted fields containing commas,
unescaped quotes inside fields, inconsistent empty fields,
trailing/leading blank lines, and double-escaped JSON output
(literal backslash-n and backslash-quote instead of real newlines/quotes).
"""
stripped = raw.strip('\n\r')
if not stripped:
return raw
# Detect double-escaped LLM tool call output: if the content has no real
# newlines but contains literal \n sequences, the entire string is likely
# double-escaped JSON. Unescape \" → " first, then \n → newline.
if '\n' not in stripped and '\\n' in stripped:
stripped = stripped.replace('\\"', '"')
stripped = stripped.replace('\\n', '\n')
reader = csv.reader(io.StringIO(stripped))
rows: list[list[str]] = []
for row in reader:
# Skip completely empty rows (artifacts of blank lines)
if row:
rows.append(row)
if not rows:
return raw
out = io.StringIO()
writer = csv.writer(out, lineterminator='\n')
writer.writerows(rows)
# Strip trailing newline so callers (write_file action) control line endings
return out.getvalue().rstrip('\n')
def write_file_content(self, content: str) -> None:
"""Normalize CSV content before storing."""
self.update_content(self._normalize_csv(content))
def append_file_content(self, content: str) -> None:
"""Normalize the appended CSV rows and merge with existing content."""
normalized_new = self._normalize_csv(content)
if not normalized_new.strip('\n\r'):
return
existing = self.content
if existing and not existing.endswith('\n'):
existing += '\n'
combined = existing + normalized_new
self.update_content(self._normalize_csv(combined))
class JsonlFile(BaseFile):
"""JSONL (JSON Lines) file implementation"""
@@ -590,7 +648,7 @@ class FileSystem:
truncation_note = (
f'\n\n[Showing {len(pages_included)} of {num_pages} pages. '
f'Skipped pages: {skipped[:10]}{"..." if len(skipped) > 10 else ""}. '
f'Use read_long_content with a specific goal to find relevant sections.]'
f'Use extract with start_from_char to read further into the file.]'
)
else:
truncation_note = ''

View File

@@ -428,7 +428,7 @@ def main(
next_steps.append('4. Set up your API key in .env file or environment:\n', style='bold')
next_steps.append(' BROWSER_USE_API_KEY=your-key\n', style='dim')
next_steps.append(
' (Get your key at https://cloud.browser-use.com/dashboard/settings?tab=api-keys&new)\n\n',
' (Get your key at https://cloud.browser-use.com/dashboard/settings?tab=api-keys&new&utm_source=oss&utm_medium=cli)\n\n',
style='dim italic',
)
next_steps.append('5. Run your script:\n', style='bold')

View File

@@ -223,15 +223,29 @@ class ChatAnthropic(BaseChatModel):
stop_reason=response.stop_reason,
)
except Exception as e:
# If validation fails, try to parse it as JSON first
if isinstance(content_block.input, str):
data = json.loads(content_block.input)
return ChatInvokeCompletion(
completion=output_format.model_validate(data),
usage=usage,
stop_reason=response.stop_reason,
)
raise e
# If validation fails, try to fix common model output issues
_input = content_block.input
if isinstance(_input, str):
_input = json.loads(_input)
elif isinstance(_input, dict):
# Model sometimes double-serializes fields
for key, value in _input.items():
if isinstance(value, str) and value.startswith(('[', '{')):
try:
_input[key] = json.loads(value)
except json.JSONDecodeError:
cleaned = value.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
try:
_input[key] = json.loads(cleaned)
except json.JSONDecodeError:
pass
else:
raise
return ChatInvokeCompletion(
completion=output_format.model_validate(_input),
usage=usage,
stop_reason=response.stop_reason,
)
# If no tool use block found, raise an error
raise ValueError('Expected tool use in response but none found')

View File

@@ -222,14 +222,28 @@ class ChatAnthropicBedrock(ChatAWSBedrock):
try:
return ChatInvokeCompletion(completion=output_format.model_validate(content_block.input), usage=usage)
except Exception as e:
# If validation fails, try to parse it as JSON first
if isinstance(content_block.input, str):
data = json.loads(content_block.input)
return ChatInvokeCompletion(
completion=output_format.model_validate(data),
usage=usage,
)
raise e
# If validation fails, try to fix common model output issues
_input = content_block.input
if isinstance(_input, str):
_input = json.loads(_input)
elif isinstance(_input, dict):
# Model sometimes double-serializes fields
for key, value in _input.items():
if isinstance(value, str) and value.startswith(('[', '{')):
try:
_input[key] = json.loads(value)
except json.JSONDecodeError:
cleaned = value.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
try:
_input[key] = json.loads(cleaned)
except json.JSONDecodeError:
pass
else:
raise
return ChatInvokeCompletion(
completion=output_format.model_validate(_input),
usage=usage,
)
# If no tool use block found, raise an error
raise ValueError('Expected tool use in response but none found')

View File

@@ -9,6 +9,7 @@ from browser_use.llm.aws.serializer import AWSBedrockMessageSerializer
from browser_use.llm.base import BaseChatModel
from browser_use.llm.exceptions import ModelProviderError, ModelRateLimitError
from browser_use.llm.messages import BaseMessage
from browser_use.llm.schema import SchemaOptimizer
from browser_use.llm.views import ChatInvokeCompletion, ChatInvokeUsage
if TYPE_CHECKING:
@@ -116,27 +117,14 @@ class ChatAWSBedrock(BaseChatModel):
def _format_tools_for_request(self, output_format: type[BaseModel]) -> list[dict[str, Any]]:
"""Format a Pydantic model as a tool for structured output."""
schema = output_format.model_json_schema()
# Convert Pydantic schema to Bedrock tool format
properties = {}
required = []
for prop_name, prop_info in schema.get('properties', {}).items():
properties[prop_name] = {
'type': prop_info.get('type', 'string'),
'description': prop_info.get('description', ''),
}
# Add required fields
required = schema.get('required', [])
schema = SchemaOptimizer.create_optimized_json_schema(output_format)
return [
{
'toolSpec': {
'name': f'extract_{output_format.__name__.lower()}',
'description': f'Extract information in the format of {output_format.__name__}',
'inputSchema': {'json': {'type': 'object', 'properties': properties, 'required': required}},
'inputSchema': {'json': schema},
}
}
]

View File

@@ -90,8 +90,8 @@ class ChatBrowserUse(BaseChatModel):
if not self.api_key:
raise ValueError(
'You need to set the BROWSER_USE_API_KEY environment variable. '
'Get your key at https://cloud.browser-use.com/new-api-key'
'BROWSER_USE_API_KEY is not set. To use ChatBrowserUse, get a key at:\n'
'https://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=chat_browser_use'
)
@property
@@ -275,9 +275,17 @@ class ChatBrowserUse(BaseChatModel):
status_code = e.response.status_code
if status_code == 401:
raise ModelProviderError(message=f'Invalid API key. {error_detail}', status_code=401, model=self.name)
raise ModelProviderError(
message=f'BROWSER_USE_API_KEY is invalid. Get a new key at:\nhttps://cloud.browser-use.com/new-api-key?utm_source=oss&utm_medium=chat_browser_use\n{error_detail}',
status_code=401,
model=self.name,
)
elif status_code == 402:
raise ModelProviderError(message=f'Insufficient credits. {error_detail}', status_code=402, model=self.name)
raise ModelProviderError(
message=f'Browser Use credits exhausted. Add more at:\nhttps://cloud.browser-use.com/billing?utm_source=oss&utm_medium=chat_browser_use\n{error_detail}',
status_code=402,
model=self.name,
)
elif status_code == 429:
raise ModelRateLimitError(message=f'Rate limit exceeded. {error_detail}', status_code=429, model=self.name)
elif status_code in {500, 502, 503, 504}:

View File

@@ -85,7 +85,7 @@ class ChatGoogle(BaseChatModel):
# Model configuration
model: VerifiedGeminiModels | str
temperature: float | None = 0.5
temperature: float | None = None
top_p: float | None = None
seed: int | None = None
thinking_budget: int | None = None # for Gemini 2.5: -1 for dynamic (default), 0 disables, or token count
@@ -222,6 +222,8 @@ class ChatGoogle(BaseChatModel):
# Apply model-specific configuration (these can override config)
if self.temperature is not None:
config['temperature'] = self.temperature
else:
config['temperature'] = 1.0 if 'gemini-3' in self.model else 0.5
# Add system instruction if present
if system_instruction:

View File

@@ -0,0 +1,3 @@
from browser_use.llm.litellm.chat import ChatLiteLLM
__all__ = ['ChatLiteLLM']

View File

@@ -0,0 +1,227 @@
"""
ChatLiteLLM - LiteLLM chat model wrapper.
Requires the `litellm` package to be installed separately:
pip install litellm
Note: litellm is NOT included as a dependency of browser-use.
"""
import logging
from dataclasses import dataclass, field
from typing import Any, TypeVar, overload
from pydantic import BaseModel
from browser_use.llm.base import BaseChatModel
from browser_use.llm.exceptions import ModelProviderError, ModelRateLimitError
from browser_use.llm.messages import BaseMessage
from browser_use.llm.schema import SchemaOptimizer
from browser_use.llm.views import ChatInvokeCompletion, ChatInvokeUsage
from .serializer import LiteLLMMessageSerializer
logger = logging.getLogger(__name__)
T = TypeVar('T', bound=BaseModel)
@dataclass
class ChatLiteLLM(BaseChatModel):
model: str
api_key: str | None = None
api_base: str | None = None
temperature: float | None = 0.0
max_tokens: int | None = 4096
max_retries: int = 3
metadata: dict[str, Any] | None = None
_provider_name: str = field(default='', init=False, repr=False)
_clean_model: str = field(default='', init=False, repr=False)
def __post_init__(self) -> None:
"""Resolve provider info from the model string via litellm."""
try:
from litellm import get_llm_provider # type: ignore[reportMissingImports]
self._clean_model, self._provider_name, _, _ = get_llm_provider(self.model)
except Exception:
if '/' in self.model:
self._provider_name, self._clean_model = self.model.split('/', 1)
else:
self._provider_name = 'openai'
self._clean_model = self.model
logger.debug(
'ChatLiteLLM initialized: model=%s, provider=%s, clean=%s, api_base=%s',
self.model,
self._provider_name,
self._clean_model,
self.api_base or '(default)',
)
@property
def provider(self) -> str:
return self._provider_name or 'litellm'
@property
def name(self) -> str:
return self._clean_model or self.model
@staticmethod
def _parse_usage(response: Any) -> ChatInvokeUsage | None:
"""Extract token usage from a litellm response."""
usage = getattr(response, 'usage', None)
if usage is None:
return None
prompt_tokens = getattr(usage, 'prompt_tokens', 0) or 0
completion_tokens = getattr(usage, 'completion_tokens', 0) or 0
prompt_cached = getattr(usage, 'cache_read_input_tokens', None)
cache_creation = getattr(usage, 'cache_creation_input_tokens', None)
if prompt_cached is None:
details = getattr(usage, 'prompt_tokens_details', None)
if details:
prompt_cached = getattr(details, 'cached_tokens', None)
return ChatInvokeUsage(
prompt_tokens=prompt_tokens,
prompt_cached_tokens=int(prompt_cached) if prompt_cached is not None else None,
prompt_cache_creation_tokens=int(cache_creation) if cache_creation is not None else None,
prompt_image_tokens=None,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
@overload
async def ainvoke(
self,
messages: list[BaseMessage],
output_format: None = None,
**kwargs: Any,
) -> ChatInvokeCompletion[str]: ...
@overload
async def ainvoke(
self,
messages: list[BaseMessage],
output_format: type[T],
**kwargs: Any,
) -> ChatInvokeCompletion[T]: ...
async def ainvoke(
self,
messages: list[BaseMessage],
output_format: type[T] | None = None,
**kwargs: Any,
) -> ChatInvokeCompletion[T] | ChatInvokeCompletion[str]:
from litellm import acompletion # type: ignore[reportMissingImports]
from litellm.exceptions import APIConnectionError, APIError, RateLimitError, Timeout # type: ignore[reportMissingImports]
from litellm.types.utils import ModelResponse # type: ignore[reportMissingImports]
litellm_messages = LiteLLMMessageSerializer.serialize(messages)
params: dict[str, Any] = {
'model': self.model,
'messages': litellm_messages,
'num_retries': self.max_retries,
}
if self.temperature is not None:
params['temperature'] = self.temperature
if self.max_tokens is not None:
params['max_tokens'] = self.max_tokens
if self.api_key:
params['api_key'] = self.api_key
if self.api_base:
params['api_base'] = self.api_base
if self.metadata:
params['metadata'] = self.metadata
if output_format is not None:
schema = SchemaOptimizer.create_optimized_json_schema(output_format)
params['response_format'] = {
'type': 'json_schema',
'json_schema': {
'name': 'agent_output',
'strict': True,
'schema': schema,
},
}
try:
raw_response = await acompletion(**params)
except RateLimitError as e:
raise ModelRateLimitError(
message=str(e),
model=self.name,
) from e
except Timeout as e:
raise ModelProviderError(
message=f'Request timed out: {e}',
model=self.name,
) from e
except APIConnectionError as e:
raise ModelProviderError(
message=str(e),
model=self.name,
) from e
except APIError as e:
status = getattr(e, 'status_code', 502) or 502
raise ModelProviderError(
message=str(e),
status_code=status,
model=self.name,
) from e
except ModelProviderError:
raise
except Exception as e:
raise ModelProviderError(
message=str(e),
model=self.name,
) from e
assert isinstance(raw_response, ModelResponse), f'Expected ModelResponse, got {type(raw_response)}'
response: ModelResponse = raw_response
choice = response.choices[0] if response.choices else None
if choice is None:
raise ModelProviderError(
message='Empty response: no choices returned by the model',
status_code=502,
model=self.name,
)
content = choice.message.content or ''
usage = self._parse_usage(response)
stop_reason = choice.finish_reason
thinking: str | None = None
msg_obj = choice.message
reasoning = getattr(msg_obj, 'reasoning_content', None)
if reasoning:
thinking = str(reasoning)
if output_format is not None:
if not content:
raise ModelProviderError(
message='Model returned empty content for structured output request',
status_code=500,
model=self.name,
)
parsed = output_format.model_validate_json(content)
return ChatInvokeCompletion(
completion=parsed,
thinking=thinking,
usage=usage,
stop_reason=stop_reason,
)
return ChatInvokeCompletion(
completion=content,
thinking=thinking,
usage=usage,
stop_reason=stop_reason,
)

View File

@@ -0,0 +1,120 @@
from typing import Any
from browser_use.llm.messages import (
AssistantMessage,
BaseMessage,
ContentPartImageParam,
ContentPartTextParam,
SystemMessage,
UserMessage,
)
class LiteLLMMessageSerializer:
	"""Converts browser-use message objects into LiteLLM/OpenAI-style chat dicts.

	All serializers are pure static helpers; messages of unrecognized types and
	content parts of unrecognized types are silently skipped, matching the
	tolerant behavior expected by the LiteLLM completion API.
	"""

	@staticmethod
	def _serialize_user_content(
		content: str | list[ContentPartTextParam | ContentPartImageParam],
	) -> str | list[dict[str, Any]]:
		"""Serialize user content: plain strings pass through, parts become dicts."""
		if isinstance(content, str):
			return content

		serialized: list[dict[str, Any]] = []
		for item in content:
			if item.type == 'image_url':
				serialized.append(
					{
						'type': 'image_url',
						'image_url': {
							'url': item.image_url.url,
							'detail': item.image_url.detail,
						},
					}
				)
			elif item.type == 'text':
				serialized.append({'type': 'text', 'text': item.text})
		return serialized

	@staticmethod
	def _serialize_system_content(
		content: str | list[ContentPartTextParam],
	) -> str | list[dict[str, Any]]:
		"""Serialize system content; every list part is emitted as a text dict."""
		if not isinstance(content, str):
			return [{'type': 'text', 'text': item.text} for item in content]
		return content

	@staticmethod
	def _serialize_assistant_content(
		content: str | list[Any] | None,
	) -> str | list[dict[str, Any]] | None:
		"""Serialize assistant content; refusal parts are rendered as bracketed text."""
		if content is None or isinstance(content, str):
			return content

		serialized: list[dict[str, Any]] = []
		for item in content:
			if item.type == 'refusal':
				serialized.append({'type': 'text', 'text': f'[Refusal] {item.refusal}'})
			elif item.type == 'text':
				serialized.append({'type': 'text', 'text': item.text})
		return serialized

	@staticmethod
	def serialize(messages: list[BaseMessage]) -> list[dict[str, Any]]:
		"""Serialize a message list into LiteLLM chat-completion dicts.

		Preserves message order; user/system/assistant messages are handled,
		anything else is dropped. Optional ``name`` and assistant ``tool_calls``
		fields are included only when present.
		"""
		serialized: list[dict[str, Any]] = []
		for message in messages:
			if isinstance(message, UserMessage):
				entry: dict[str, Any] = {
					'role': 'user',
					'content': LiteLLMMessageSerializer._serialize_user_content(message.content),
				}
			elif isinstance(message, SystemMessage):
				entry = {
					'role': 'system',
					'content': LiteLLMMessageSerializer._serialize_system_content(message.content),
				}
			elif isinstance(message, AssistantMessage):
				entry = {
					'role': 'assistant',
					'content': LiteLLMMessageSerializer._serialize_assistant_content(message.content),
				}
			else:
				# Unknown message type — skip rather than fail.
				continue

			if message.name is not None:
				entry['name'] = message.name

			# tool_calls must follow name to keep the original key ordering.
			if isinstance(message, AssistantMessage) and message.tool_calls:
				entry['tool_calls'] = [
					{
						'id': call.id,
						'type': 'function',
						'function': {
							'name': call.function.name,
							'arguments': call.function.arguments,
						},
					}
					for call in message.tool_calls
				]
			serialized.append(entry)
		return serialized

View File

@@ -1,4 +1,5 @@
import json
import os
from collections.abc import Mapping
from dataclasses import dataclass, field
from typing import Any, Literal, TypeAlias, TypeVar, overload
@@ -26,15 +27,30 @@ ChatVercelModel: TypeAlias = Literal[
'alibaba/qwen-3-235b',
'alibaba/qwen-3-30b',
'alibaba/qwen-3-32b',
'alibaba/qwen3-235b-a22b-thinking',
'alibaba/qwen3-coder',
'alibaba/qwen3-coder-30b-a3b',
'alibaba/qwen3-coder-next',
'alibaba/qwen3-coder-plus',
'alibaba/qwen3-embedding-0.6b',
'alibaba/qwen3-embedding-4b',
'alibaba/qwen3-embedding-8b',
'alibaba/qwen3-max',
'alibaba/qwen3-max-preview',
'alibaba/qwen3-max-thinking',
'alibaba/qwen3-next-80b-a3b-instruct',
'alibaba/qwen3-next-80b-a3b-thinking',
'alibaba/qwen3-vl-instruct',
'alibaba/qwen3-vl-thinking',
'alibaba/qwen3.5-flash',
'alibaba/qwen3.5-plus',
'alibaba/wan-v2.5-t2v-preview',
'alibaba/wan-v2.6-i2v',
'alibaba/wan-v2.6-i2v-flash',
'alibaba/wan-v2.6-r2v',
'alibaba/wan-v2.6-r2v-flash',
'alibaba/wan-v2.6-t2v',
'amazon/nova-2-lite',
'amazon/nova-lite',
'amazon/nova-micro',
'amazon/nova-pro',
@@ -48,38 +64,69 @@ ChatVercelModel: TypeAlias = Literal[
'anthropic/claude-haiku-4.5',
'anthropic/claude-opus-4',
'anthropic/claude-opus-4.1',
'anthropic/claude-opus-4.5',
'anthropic/claude-opus-4.6',
'anthropic/claude-sonnet-4',
'anthropic/claude-sonnet-4.5',
'anthropic/claude-sonnet-4.6',
'arcee-ai/trinity-large-preview',
'arcee-ai/trinity-mini',
'bfl/flux-kontext-max',
'bfl/flux-kontext-pro',
'bfl/flux-pro-1.0-fill',
'bfl/flux-pro-1.1',
'bfl/flux-pro-1.1-ultra',
'bytedance/seed-1.6',
'bytedance/seed-1.8',
'bytedance/seedance-v1.0-lite-i2v',
'bytedance/seedance-v1.0-lite-t2v',
'bytedance/seedance-v1.0-pro',
'bytedance/seedance-v1.0-pro-fast',
'bytedance/seedance-v1.5-pro',
'cohere/command-a',
'cohere/command-r',
'cohere/command-r-plus',
'cohere/embed-v4.0',
'deepseek/deepseek-r1',
'deepseek/deepseek-r1-distill-llama-70b',
'deepseek/deepseek-v3',
'deepseek/deepseek-v3.1',
'deepseek/deepseek-v3.1-base',
'deepseek/deepseek-v3.1-terminus',
'deepseek/deepseek-v3.2-exp',
'deepseek/deepseek-v3.2-exp-thinking',
'deepseek/deepseek-v3.2',
'deepseek/deepseek-v3.2-thinking',
'google/gemini-2.0-flash',
'google/gemini-2.0-flash-lite',
'google/gemini-2.5-flash',
'google/gemini-2.5-flash-image',
'google/gemini-2.5-flash-image-preview',
'google/gemini-2.5-flash-lite',
'google/gemini-2.5-flash-lite-preview-09-2025',
'google/gemini-2.5-flash-preview-09-2025',
'google/gemini-2.5-pro',
'google/gemini-3-flash',
'google/gemini-3-pro-image',
'google/gemini-3-pro-preview',
'google/gemini-3.1-flash-image-preview',
'google/gemini-3.1-flash-lite-preview',
'google/gemini-3.1-pro-preview',
'google/gemini-embedding-001',
'google/gemma-2-9b',
'google/imagen-4.0-fast-generate-001',
'google/imagen-4.0-generate-001',
'google/imagen-4.0-ultra-generate-001',
'google/text-embedding-005',
'google/text-multilingual-embedding-002',
'google/veo-3.0-fast-generate-001',
'google/veo-3.0-generate-001',
'google/veo-3.1-fast-generate-001',
'google/veo-3.1-generate-001',
'inception/mercury-2',
'inception/mercury-coder-small',
'klingai/kling-v2.5-turbo-i2v',
'klingai/kling-v2.5-turbo-t2v',
'klingai/kling-v2.6-i2v',
'klingai/kling-v2.6-motion-control',
'klingai/kling-v2.6-t2v',
'klingai/kling-v3.0-i2v',
'klingai/kling-v3.0-t2v',
'kwaipilot/kat-coder-pro-v1',
'meituan/longcat-flash-chat',
'meituan/longcat-flash-thinking',
'meta/llama-3-70b',
'meta/llama-3-8b',
'meta/llama-3.1-70b',
'meta/llama-3.1-8b',
'meta/llama-3.2-11b',
@@ -89,27 +136,40 @@ ChatVercelModel: TypeAlias = Literal[
'meta/llama-3.3-70b',
'meta/llama-4-maverick',
'meta/llama-4-scout',
'minimax/minimax-m2',
'minimax/minimax-m2.1',
'minimax/minimax-m2.1-lightning',
'minimax/minimax-m2.5',
'minimax/minimax-m2.5-highspeed',
'mistral/codestral',
'mistral/codestral-embed',
'mistral/devstral-2',
'mistral/devstral-small',
'mistral/devstral-small-2',
'mistral/magistral-medium',
'mistral/magistral-medium-2506',
'mistral/magistral-small',
'mistral/magistral-small-2506',
'mistral/ministral-14b',
'mistral/ministral-3b',
'mistral/ministral-8b',
'mistral/mistral-embed',
'mistral/mistral-large',
'mistral/mistral-large-3',
'mistral/mistral-medium',
'mistral/mistral-nemo',
'mistral/mistral-small',
'mistral/mixtral-8x22b-instruct',
'mistral/pixtral-12b',
'mistral/pixtral-large',
'moonshotai/kimi-k2',
'moonshotai/kimi-k2-0905',
'moonshotai/kimi-k2-thinking',
'moonshotai/kimi-k2-thinking-turbo',
'moonshotai/kimi-k2-turbo',
'moonshotai/kimi-k2.5',
'morph/morph-v3-fast',
'morph/morph-v3-large',
'nvidia/nemotron-3-nano-30b-a3b',
'nvidia/nemotron-nano-12b-v2-vl',
'nvidia/nemotron-nano-9b-v2',
'openai/gpt-3.5-turbo',
'openai/gpt-3.5-turbo-instruct',
'openai/gpt-4-turbo',
@@ -118,16 +178,37 @@ ChatVercelModel: TypeAlias = Literal[
'openai/gpt-4.1-nano',
'openai/gpt-4o',
'openai/gpt-4o-mini',
'openai/gpt-4o-mini-search-preview',
'openai/gpt-5',
'openai/gpt-5-chat',
'openai/gpt-5-codex',
'openai/gpt-5-mini',
'openai/gpt-5-nano',
'openai/gpt-5-pro',
'openai/gpt-5.1-codex',
'openai/gpt-5.1-codex-max',
'openai/gpt-5.1-codex-mini',
'openai/gpt-5.1-instant',
'openai/gpt-5.1-thinking',
'openai/gpt-5.2',
'openai/gpt-5.2-chat',
'openai/gpt-5.2-codex',
'openai/gpt-5.2-pro',
'openai/gpt-5.3-chat',
'openai/gpt-5.3-codex',
'openai/gpt-5.4',
'openai/gpt-5.4-pro',
'openai/gpt-image-1',
'openai/gpt-image-1-mini',
'openai/gpt-image-1.5',
'openai/gpt-oss-120b',
'openai/gpt-oss-20b',
'openai/gpt-oss-safeguard-20b',
'openai/o1',
'openai/o3',
'openai/o3-deep-research',
'openai/o3-mini',
'openai/o3-pro',
'openai/o4-mini',
'openai/text-embedding-3-large',
'openai/text-embedding-3-small',
@@ -136,6 +217,11 @@ ChatVercelModel: TypeAlias = Literal[
'perplexity/sonar-pro',
'perplexity/sonar-reasoning',
'perplexity/sonar-reasoning-pro',
'prime-intellect/intellect-3',
'recraft/recraft-v2',
'recraft/recraft-v3',
'recraft/recraft-v4',
'recraft/recraft-v4-pro',
'stealth/sonoma-dusk-alpha',
'stealth/sonoma-sky-alpha',
'vercel/v0-1.0-md',
@@ -143,11 +229,13 @@ ChatVercelModel: TypeAlias = Literal[
'voyage/voyage-3-large',
'voyage/voyage-3.5',
'voyage/voyage-3.5-lite',
'voyage/voyage-4',
'voyage/voyage-4-large',
'voyage/voyage-4-lite',
'voyage/voyage-code-2',
'voyage/voyage-code-3',
'voyage/voyage-finance-2',
'voyage/voyage-law-2',
'xai/grok-2',
'xai/grok-2-vision',
'xai/grok-3',
'xai/grok-3-fast',
@@ -156,11 +244,25 @@ ChatVercelModel: TypeAlias = Literal[
'xai/grok-4',
'xai/grok-4-fast-non-reasoning',
'xai/grok-4-fast-reasoning',
'xai/grok-4.1-fast-non-reasoning',
'xai/grok-4.1-fast-reasoning',
'xai/grok-4.20-multi-agent-beta',
'xai/grok-4.20-non-reasoning-beta',
'xai/grok-4.20-reasoning-beta',
'xai/grok-code-fast-1',
'xai/grok-imagine-image',
'xai/grok-imagine-image-pro',
'xai/grok-imagine-video',
'xiaomi/mimo-v2-flash',
'zai/glm-4.5',
'zai/glm-4.5-air',
'zai/glm-4.5v',
'zai/glm-4.6',
'zai/glm-4.6v',
'zai/glm-4.6v-flash',
'zai/glm-4.7',
'zai/glm-4.7-flashx',
'zai/glm-5',
]
@@ -181,7 +283,8 @@ class ChatVercel(BaseChatModel):
Args:
model: The model identifier
api_key: Your Vercel API key
api_key: Your Vercel AI Gateway API key. If not provided, falls back to
AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN environment variables.
base_url: The Vercel AI Gateway endpoint (defaults to https://ai-gateway.vercel.sh/v1)
temperature: Sampling temperature (0-2)
max_tokens: Maximum tokens to generate
@@ -191,6 +294,14 @@ class ChatVercel(BaseChatModel):
max_retries: Maximum number of retries for failed requests
provider_options: Provider routing options for the gateway. Use this to control which
providers are used and in what order. Example: {'gateway': {'order': ['vertex', 'anthropic']}}
reasoning: Optional provider-specific reasoning configuration. Merged into
providerOptions under the appropriate provider key. Example for Anthropic:
{'anthropic': {'thinking': {'type': 'adaptive'}}}. Example for OpenAI:
{'openai': {'reasoningEffort': 'high', 'reasoningSummary': 'detailed'}}.
model_fallbacks: Optional list of fallback model IDs tried in order if the primary
model fails. Passed as providerOptions.gateway.models.
caching: Optional caching mode for the gateway. Currently supports 'auto', which
enables provider-specific prompt caching via providerOptions.gateway.caching.
"""
# Model configuration
@@ -206,8 +317,11 @@ class ChatVercel(BaseChatModel):
'o3',
'o4',
'gpt-oss',
'gpt-5.2-pro',
'gpt-5.4-pro',
'deepseek-r1',
'qwen3-next-80b-a3b-thinking',
'-thinking',
'perplexity/sonar-reasoning',
]
)
@@ -221,6 +335,9 @@ class ChatVercel(BaseChatModel):
http_client: httpx.AsyncClient | None = None
_strict_response_validation: bool = False
provider_options: dict[str, Any] | None = None
reasoning: dict[str, dict[str, Any]] | None = None
model_fallbacks: list[str] | None = None
caching: Literal['auto'] | None = None
# Static
@property
@@ -229,8 +346,10 @@ class ChatVercel(BaseChatModel):
def _get_client_params(self) -> dict[str, Any]:
"""Prepare client parameters dictionary."""
api_key = self.api_key or os.getenv('AI_GATEWAY_API_KEY') or os.getenv('VERCEL_OIDC_TOKEN')
base_params = {
'api_key': self.api_key,
'api_key': api_key,
'base_url': self.base_url,
'timeout': self.timeout,
'max_retries': self.max_retries,
@@ -387,8 +506,36 @@ class ChatVercel(BaseChatModel):
model_params['max_tokens'] = self.max_tokens
if self.top_p is not None:
model_params['top_p'] = self.top_p
extra_body: dict[str, Any] = {}
provider_opts: dict[str, Any] = {}
if self.provider_options:
model_params['extra_body'] = {'providerOptions': self.provider_options}
provider_opts.update(self.provider_options)
if self.reasoning:
# Merge provider-specific reasoning options (ex: {'anthropic': {'thinking': ...}})
for provider_name, opts in self.reasoning.items():
existing = provider_opts.get(provider_name, {})
existing.update(opts)
provider_opts[provider_name] = existing
gateway_opts: dict[str, Any] = provider_opts.get('gateway', {})
if self.model_fallbacks:
gateway_opts['models'] = self.model_fallbacks
if self.caching:
gateway_opts['caching'] = self.caching
if gateway_opts:
provider_opts['gateway'] = gateway_opts
if provider_opts:
extra_body['providerOptions'] = provider_opts
if extra_body:
model_params['extra_body'] = extra_body
if output_format is None:
# Return string response
@@ -439,14 +586,10 @@ class ChatVercel(BaseChatModel):
vercel_messages = VercelMessageSerializer.serialize_messages(modified_messages)
request_params = model_params.copy()
if self.provider_options:
request_params['extra_body'] = {'providerOptions': self.provider_options}
response = await self.get_client().chat.completions.create(
model=self.model,
messages=vercel_messages,
**request_params,
**model_params,
)
content = response.choices[0].message.content if response.choices else None
@@ -491,10 +634,6 @@ class ChatVercel(BaseChatModel):
'schema': schema,
}
request_params = model_params.copy()
if self.provider_options:
request_params['extra_body'] = {'providerOptions': self.provider_options}
response = await self.get_client().chat.completions.create(
model=self.model,
messages=vercel_messages,
@@ -502,7 +641,7 @@ class ChatVercel(BaseChatModel):
json_schema=response_format_schema,
type='json_schema',
),
**request_params,
**model_params,
)
content = response.choices[0].message.content if response.choices else None

View File

@@ -223,9 +223,7 @@ def setup_logging(stream=None, log_level=None, force_setup=False, debug_log_file
'trafilatura.htmlprocessing',
'trafilatura',
'groq',
'portalocker',
'google_genai',
'portalocker.utils',
'websockets', # General websockets (but not websockets.client which we need)
]
for logger_name in third_party_loggers:

View File

@@ -329,6 +329,7 @@ class MCPClient:
return ActionResult(
extracted_content=extracted_content,
long_term_memory=f"Used MCP tool '{tool.name}' from {self.server_name}",
include_extracted_content_only_once=True,
)
except Exception as e:
@@ -372,6 +373,7 @@ class MCPClient:
return ActionResult(
extracted_content=extracted_content,
long_term_memory=f"Used MCP tool '{tool.name}' from {self.server_name}",
include_extracted_content_only_once=True,
)
except Exception as e:

View File

@@ -232,13 +232,21 @@ class BrowserUseServer:
),
types.Tool(
name='browser_click',
description='Click an element on the page by its index',
description='Click an element by index or at specific viewport coordinates. Use index for elements from browser_get_state, or coordinate_x/coordinate_y for pixel-precise clicking.',
inputSchema={
'type': 'object',
'properties': {
'index': {
'type': 'integer',
'description': 'The index of the link or element to click (from browser_get_state)',
'description': 'The index of the element to click (from browser_get_state). Provide this OR coordinate_x+coordinate_y.',
},
'coordinate_x': {
'type': 'integer',
'description': 'X coordinate in pixels from the left edge of the viewport. Must be used together with coordinate_y. Provide this OR index.',
},
'coordinate_y': {
'type': 'integer',
'description': 'Y coordinate in pixels from the top edge of the viewport. Must be used together with coordinate_x. Provide this OR index.',
},
'new_tab': {
'type': 'boolean',
@@ -246,12 +254,11 @@ class BrowserUseServer:
'default': False,
},
},
'required': ['index'],
},
),
types.Tool(
name='browser_type',
description='Type text into an input field',
description='Type text into an input field. Clears existing text by default; pass text="" to clear only.',
inputSchema={
'type': 'object',
'properties': {
@@ -259,7 +266,10 @@ class BrowserUseServer:
'type': 'integer',
'description': 'The index of the input element (from browser_get_state)',
},
'text': {'type': 'string', 'description': 'The text to type'},
'text': {
'type': 'string',
'description': 'The text to type. Pass an empty string ("") to clear the field without typing.',
},
},
'required': ['index', 'text'],
},
@@ -294,6 +304,33 @@ class BrowserUseServer:
'required': ['query'],
},
),
types.Tool(
name='browser_get_html',
description='Get the raw HTML of the current page or a specific element by CSS selector',
inputSchema={
'type': 'object',
'properties': {
'selector': {
'type': 'string',
'description': 'Optional CSS selector to get HTML of a specific element. If omitted, returns full page HTML.',
},
},
},
),
types.Tool(
name='browser_screenshot',
description='Take a screenshot of the current page. Returns viewport metadata as text and the screenshot as an image.',
inputSchema={
'type': 'object',
'properties': {
'full_page': {
'type': 'boolean',
'description': 'Whether to capture the full scrollable page or just the visible viewport',
'default': False,
},
},
},
),
types.Tool(
name='browser_scroll',
description='Scroll the page',
@@ -361,8 +398,7 @@ class BrowserUseServer:
},
'model': {
'type': 'string',
'description': 'LLM model to use (e.g., gpt-4o, claude-3-opus-20240229)',
'default': 'gpt-4o',
'description': 'LLM model to use (e.g., gpt-4o, claude-3-opus-20240229). Defaults to the configured model.',
},
'allowed_domains': {
'type': 'array',
@@ -417,12 +453,14 @@ class BrowserUseServer:
return []
@self.server.call_tool()
async def handle_call_tool(name: str, arguments: dict[str, Any] | None) -> list[types.TextContent]:
async def handle_call_tool(name: str, arguments: dict[str, Any] | None) -> list[types.TextContent | types.ImageContent]:
"""Handle tool execution."""
start_time = time.time()
error_msg = None
try:
result = await self._execute_tool(name, arguments or {})
if isinstance(result, list):
return result
return [types.TextContent(type='text', text=result)]
except Exception as e:
error_msg = str(e)
@@ -441,15 +479,17 @@ class BrowserUseServer:
)
)
async def _execute_tool(self, tool_name: str, arguments: dict[str, Any]) -> str:
"""Execute a browser-use tool."""
async def _execute_tool(
self, tool_name: str, arguments: dict[str, Any]
) -> str | list[types.TextContent | types.ImageContent]:
"""Execute a browser-use tool. Returns str for most tools, or a content list for tools with image output."""
# Agent-based tools
if tool_name == 'retry_with_browser_use_agent':
return await self._retry_with_browser_use_agent(
task=arguments['task'],
max_steps=arguments.get('max_steps', 100),
model=arguments.get('model', 'gpt-4o'),
model=arguments.get('model'),
allowed_domains=arguments.get('allowed_domains', []),
use_vision=arguments.get('use_vision', True),
)
@@ -474,13 +514,32 @@ class BrowserUseServer:
return await self._navigate(arguments['url'], arguments.get('new_tab', False))
elif tool_name == 'browser_click':
return await self._click(arguments['index'], arguments.get('new_tab', False))
return await self._click(
index=arguments.get('index'),
coordinate_x=arguments.get('coordinate_x'),
coordinate_y=arguments.get('coordinate_y'),
new_tab=arguments.get('new_tab', False),
)
elif tool_name == 'browser_type':
return await self._type_text(arguments['index'], arguments['text'])
elif tool_name == 'browser_get_state':
return await self._get_browser_state(arguments.get('include_screenshot', False))
state_json, screenshot_b64 = await self._get_browser_state(arguments.get('include_screenshot', False))
content: list[types.TextContent | types.ImageContent] = [types.TextContent(type='text', text=state_json)]
if screenshot_b64:
content.append(types.ImageContent(type='image', data=screenshot_b64, mimeType='image/png'))
return content
elif tool_name == 'browser_get_html':
return await self._get_html(arguments.get('selector'))
elif tool_name == 'browser_screenshot':
meta_json, screenshot_b64 = await self._screenshot(arguments.get('full_page', False))
content: list[types.TextContent | types.ImageContent] = [types.TextContent(type='text', text=meta_json)]
if screenshot_b64:
content.append(types.ImageContent(type='image', data=screenshot_b64, mimeType='image/png'))
return content
elif tool_name == 'browser_extract_content':
return await self._extract_content(arguments['query'], arguments.get('extract_links', False))
@@ -575,7 +634,7 @@ class BrowserUseServer:
self,
task: str,
max_steps: int = 100,
model: str = 'gpt-4o',
model: str | None = None,
allowed_domains: list[str] | None = None,
use_vision: bool = True,
) -> str:
@@ -588,27 +647,25 @@ class BrowserUseServer:
# Get LLM provider
model_provider = llm_config.get('model_provider') or os.getenv('MODEL_PROVIDER')
# 如果model_provider不等于空且等Bedrock
# Get Bedrock-specific config
if model_provider and model_provider.lower() == 'bedrock':
llm_model = llm_config.get('model') or os.getenv('MODEL') or 'us.anthropic.claude-sonnet-4-20250514-v1:0'
aws_region = llm_config.get('region') or os.getenv('REGION')
if not aws_region:
aws_region = 'us-east-1'
aws_sso_auth = llm_config.get('aws_sso_auth', False)
llm = ChatAWSBedrock(
model=llm_model, # or any Bedrock model
aws_region=aws_region,
aws_sso_auth=True,
aws_sso_auth=aws_sso_auth,
)
else:
api_key = llm_config.get('api_key') or os.getenv('OPENAI_API_KEY')
if not api_key:
return 'Error: OPENAI_API_KEY not set in config or environment'
# Override model if provided in tool call
if model != llm_config.get('model', 'gpt-4o'):
llm_model = model
else:
llm_model = llm_config.get('model', 'gpt-4o')
# Use explicit model from tool call, otherwise fall back to configured default
llm_model = model or llm_config.get('model', 'gpt-4o')
base_url = llm_config.get('base_url', None)
kwargs = {}
@@ -693,14 +750,34 @@ class BrowserUseServer:
await event
return f'Navigated to: {url}'
async def _click(self, index: int, new_tab: bool = False) -> str:
"""Click an element by index."""
async def _click(
self,
index: int | None = None,
coordinate_x: int | None = None,
coordinate_y: int | None = None,
new_tab: bool = False,
) -> str:
"""Click an element by index or at viewport coordinates."""
if not self.browser_session:
return 'Error: No browser session active'
# Update session activity
self._update_session_activity(self.browser_session.id)
# Coordinate-based clicking
if coordinate_x is not None and coordinate_y is not None:
from browser_use.browser.events import ClickCoordinateEvent
event = self.browser_session.event_bus.dispatch(
ClickCoordinateEvent(coordinate_x=coordinate_x, coordinate_y=coordinate_y)
)
await event
return f'Clicked at coordinates ({coordinate_x}, {coordinate_y})'
# Index-based clicking
if index is None:
return 'Error: Provide either index or both coordinate_x and coordinate_y'
# Get the element
element = await self.browser_session.get_dom_element_by_index(index)
if not element:
@@ -730,7 +807,6 @@ class BrowserUseServer:
return f'Clicked element {index} and opened in new tab {full_url[:20]}...'
else:
# For non-link elements, just do a normal click
# Opening in new tab without href is not reliably supported
from browser_use.browser.events import ClickElementEvent
event = self.browser_session.event_bus.dispatch(ClickElementEvent(node=element))
@@ -790,23 +866,39 @@ class BrowserUseServer:
else:
return f"Typed '{text}' into element {index}"
async def _get_browser_state(self, include_screenshot: bool = False) -> str:
"""Get current browser state."""
async def _get_browser_state(self, include_screenshot: bool = False) -> tuple[str, str | None]:
"""Get current browser state. Returns (state_json, screenshot_b64 | None)."""
if not self.browser_session:
return 'Error: No browser session active'
return 'Error: No browser session active', None
state = await self.browser_session.get_browser_state_summary()
result = {
result: dict[str, Any] = {
'url': state.url,
'title': state.title,
'tabs': [{'url': tab.url, 'title': tab.title} for tab in state.tabs],
'interactive_elements': [],
}
# Add viewport info so the LLM knows the coordinate space
if state.page_info:
pi = state.page_info
result['viewport'] = {
'width': pi.viewport_width,
'height': pi.viewport_height,
}
result['page'] = {
'width': pi.page_width,
'height': pi.page_height,
}
result['scroll'] = {
'x': pi.scroll_x,
'y': pi.scroll_y,
}
# Add interactive elements with their indices
for index, element in state.dom_state.selector_map.items():
elem_info = {
elem_info: dict[str, Any] = {
'index': index,
'tag': element.tag_name,
'text': element.get_all_children_text(max_depth=2)[:100],
@@ -817,10 +909,69 @@ class BrowserUseServer:
elem_info['href'] = element.attributes['href']
result['interactive_elements'].append(elem_info)
# Return screenshot separately as ImageContent instead of embedding base64 in JSON
screenshot_b64 = None
if include_screenshot and state.screenshot:
result['screenshot'] = state.screenshot
screenshot_b64 = state.screenshot
# Include viewport dimensions in JSON so LLM can map pixels to coordinates
if state.page_info:
result['screenshot_dimensions'] = {
'width': state.page_info.viewport_width,
'height': state.page_info.viewport_height,
}
return json.dumps(result, indent=2)
return json.dumps(result, indent=2), screenshot_b64
async def _get_html(self, selector: str | None = None) -> str:
"""Get raw HTML of the page or a specific element."""
if not self.browser_session:
return 'Error: No browser session active'
self._update_session_activity(self.browser_session.id)
cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=None, focus=False)
if not cdp_session:
return 'Error: No active CDP session'
if selector:
js = (
f'(function(){{ const el = document.querySelector({json.dumps(selector)}); return el ? el.outerHTML : null; }})()'
)
else:
js = 'document.documentElement.outerHTML'
result = await cdp_session.cdp_client.send.Runtime.evaluate(
params={'expression': js, 'returnByValue': True},
session_id=cdp_session.session_id,
)
html = result.get('result', {}).get('value')
if html is None:
return f'No element found for selector: {selector}' if selector else 'Error: Could not get page HTML'
return html
async def _screenshot(self, full_page: bool = False) -> tuple[str, str | None]:
"""Take a screenshot. Returns (metadata_json, screenshot_b64 | None)."""
if not self.browser_session:
return 'Error: No browser session active', None
import base64
self._update_session_activity(self.browser_session.id)
data = await self.browser_session.take_screenshot(full_page=full_page)
b64 = base64.b64encode(data).decode()
# Return screenshot separately as ImageContent instead of embedding base64 in JSON
state = await self.browser_session.get_browser_state_summary()
result: dict[str, Any] = {
'size_bytes': len(data),
}
if state.page_info:
result['viewport'] = {
'width': state.page_info.viewport_width,
'height': state.page_info.viewport_height,
}
return json.dumps(result), b64
async def _extract_content(self, query: str, extract_links: bool = False) -> str:
"""Extract content from current page."""
@@ -1075,19 +1226,25 @@ class BrowserUseServer:
# Start the cleanup task
await self._start_cleanup_task()
if sys.stdin is None:
raise RuntimeError('MCP stdio transport requires stdin, but this process was launched without one.')
async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
await self.server.run(
read_stream,
write_stream,
InitializationOptions(
server_name='browser-use',
server_version='0.1.0',
capabilities=self.server.get_capabilities(
notification_options=NotificationOptions(),
experimental_capabilities={},
try:
await self.server.run(
read_stream,
write_stream,
InitializationOptions(
server_name='browser-use',
server_version='0.1.0',
capabilities=self.server.get_capabilities(
notification_options=NotificationOptions(),
experimental_capabilities={},
),
),
),
)
)
except BrokenPipeError:
logger.warning('MCP client disconnected while writing to stdio; shutting down server cleanly.')
async def main(session_timeout_minutes: int = 10):

View File

@@ -24,20 +24,10 @@ curl -fsSL https://browser-use.com/cli/install.sh | bash
& "C:\Program Files\Git\bin\bash.exe" -c 'curl -fsSL https://browser-use.com/cli/install.sh | bash'
```
### Installation Modes
```bash
curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full # All modes
curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --local-only # Local browser only
curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only # Cloud browser only
curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --api-key bu_xxx # With API key
```
### Post-Install
```bash
browser-use doctor # Validate installation
browser-use setup # Run setup wizard (optional)
browser-use setup --mode local|remote|full # Non-interactive setup
browser-use setup --api-key bu_xxx --yes # With API key, skip prompts
```
### Generate Templates
@@ -62,13 +52,10 @@ If you prefer not to use the one-line installer:
# 1. Install the package
uv pip install browser-use
# 2. Install Chromium (for local browser mode)
# 2. Install Chromium
browser-use install
# 3. Configure API key (for remote mode)
export BROWSER_USE_API_KEY=your_key # or $env:BROWSER_USE_API_KEY on Windows
# 4. Validate
# 3. Validate
browser-use doctor
```
@@ -106,11 +93,20 @@ browser-use open https://example.com
# Visible browser window
browser-use --headed open https://example.com
# Use your real Chrome (with existing logins/cookies)
browser-use --browser real open https://gmail.com
# Use your real Chrome with Default profile (with existing logins/cookies)
browser-use --profile "Default" open https://gmail.com
# Cloud browser (requires BROWSER_USE_API_KEY)
browser-use --browser remote open https://example.com
# Use a specific Chrome profile
browser-use --profile "Profile 1" open https://gmail.com
# Auto-discover and connect to running Chrome
browser-use --connect open https://example.com
# Connect to an existing browser via CDP URL
browser-use --cdp-url http://localhost:9222 open https://example.com
# WebSocket CDP URL also works
browser-use --cdp-url ws://localhost:9222/devtools/browser/... state
```
## All Commands
@@ -135,11 +131,13 @@ browser-use --browser remote open https://example.com
| Command | Description |
|---------|-------------|
| `click <index>` | Click element by index |
| `click <x> <y>` | Click at pixel coordinates |
| `type "text"` | Type into focused element |
| `input <index> "text"` | Click element, then type |
| `keys "Enter"` | Send keyboard keys |
| `keys "Control+a"` | Send key combination |
| `select <index> "value"` | Select dropdown option |
| `upload <index> <path>` | Upload file to file input element |
| `hover <index>` | Hover over element |
| `dblclick <index>` | Double-click element |
| `rightclick <index>` | Right-click element |
@@ -147,9 +145,10 @@ browser-use --browser remote open https://example.com
### Tabs
| Command | Description |
|---------|-------------|
| `switch <tab>` | Switch to tab by index |
| `close-tab` | Close current tab |
| `close-tab <tab>` | Close specific tab |
| `tab list` | List all tabs |
| `tab new [url]` | Open new tab |
| `tab switch <index>` | Switch to tab by index |
| `tab close [index...]` | Close tab(s) (current if no index) |
### Cookies
| Command | Description |
@@ -188,7 +187,7 @@ browser-use --browser remote open https://example.com
| Command | Description |
|---------|-------------|
| `eval "js code"` | Execute JavaScript |
| `extract "query"` | Extract data with LLM |
| `extract "query"` | Extract data with LLM (not yet implemented) |
### Python (Persistent Session)
```bash
@@ -200,88 +199,45 @@ browser-use python --reset # Clear namespace
browser-use python --file script.py # Run Python file
```
## Agent Tasks
## Cloud API
Run AI-powered browser automation tasks.
### Local Mode
```bash
browser-use run "Fill the contact form with test data"
browser-use run "Extract all product prices" --max-steps 50
browser-use run "task" --llm gpt-4o # Specify LLM model
```
Requires an LLM API key (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, etc.).
### Remote Mode (Cloud)
```bash
browser-use -b remote run "Search for AI news" # US proxy default
browser-use -b remote run "task" --llm gpt-4o # Specify LLM
browser-use -b remote run "task" --proxy-country gb # UK proxy
browser-use -b remote run "task" --session-id <id> # Reuse session
browser-use -b remote run "task" --no-wait # Async (returns task ID)
browser-use -b remote run "task" --wait # Wait for completion
browser-use -b remote run "task" --stream # Stream output
browser-use -b remote run "task" --flash # Fast mode
browser-use -b remote run "task" --keep-alive # Keep session alive
browser-use -b remote run "task" --thinking # Extended reasoning
browser-use -b remote run "task" --vision # Enable vision (default)
browser-use -b remote run "task" --no-vision # Disable vision
browser-use -b remote run "task" --profile <id> # Use cloud profile
# Task configuration
browser-use -b remote run "task" --start-url https://example.com # Start from URL
browser-use -b remote run "task" --allowed-domain example.com # Restrict navigation (repeatable)
browser-use -b remote run "task" --metadata key=value # Task metadata (repeatable)
browser-use -b remote run "task" --secret API_KEY=xxx # Task secrets (repeatable)
browser-use -b remote run "task" --skill-id skill-123 # Enable skills (repeatable)
# Structured output and evaluation
browser-use -b remote run "task" --structured-output '{"type":"object"}' # JSON schema
browser-use -b remote run "task" --judge # Enable judge mode
browser-use -b remote run "task" --judge-ground-truth "answer" # Expected answer
```
Requires `BROWSER_USE_API_KEY`.
## Task Management (Remote Mode)
Manage cloud tasks when using `--browser remote`.
Generic REST passthrough to the Browser-Use Cloud API, plus cloud browser provisioning.
| Command | Description |
|---------|-------------|
| `task list` | List recent tasks |
| `task list --status running` | Filter by status |
| `task list --session <id>` | Filter by session ID |
| `task status <id>` | Get task status (latest step only) |
| `task status <id> -c` | Compact: all steps with reasoning |
| `task status <id> -v` | Verbose: full details |
| `task status <id> --last 5` | Show last 5 steps |
| `task status <id> --step 3` | Show specific step number |
| `task status <id> --reverse` | Show steps newest first |
| `task stop <id>` | Stop running task |
| `task logs <id>` | Get execution logs |
| `cloud connect` | Provision cloud browser and connect (zero-config, auto-manages profile) |
| `cloud login <api-key>` | Save API key |
| `cloud logout` | Remove API key |
| `cloud v2 GET <path>` | GET request to API v2 |
| `cloud v2 POST <path> '<json>'` | POST request to API v2 |
| `cloud v3 POST <path> '<json>'` | POST request to API v3 |
| `cloud v2 poll <task-id>` | Poll task until done |
| `cloud v2 --help` | Show API v2 endpoints (from OpenAPI spec) |
| `cloud v3 --help` | Show API v3 endpoints |
## Cloud Sessions (Remote Mode)
```bash
# Save API key to ~/.browser-use/config.json
browser-use cloud login sk-abc123...
Manage cloud browser sessions.
# Provision a cloud browser and connect
browser-use cloud connect
browser-use state # works normally
browser-use close # disconnects AND stops cloud browser
| Command | Description |
|---------|-------------|
| `session list` | List cloud sessions |
| `session list --status active` | Filter by status |
| `session get <id>` | Get session details + live URL |
| `session stop <id>` | Stop session |
| `session stop --all` | Stop all active sessions |
| `session create` | Create new session |
| `session create --profile <id>` | With cloud profile |
| `session create --proxy-country gb` | With geographic proxy |
| `session create --start-url <url>` | Start at specific URL |
| `session create --screen-size 1920x1080` | Custom screen size |
| `session create --keep-alive` | Keep session alive |
| `session create --persist-memory` | Persist memory between tasks |
| `session share <id>` | Create public share URL |
| `session share <id> --delete` | Delete public share |
# List browsers
browser-use cloud v2 GET /browsers
# Create a task
browser-use cloud v2 POST /tasks '{"task":"Search for AI news","url":"https://google.com"}'
# Poll until done
browser-use cloud v2 poll <task-id>
# Remove API key
browser-use cloud logout
```
API key stored in `~/.browser-use/config.json` with `0600` permissions.
## Tunnels
@@ -298,55 +254,70 @@ Expose local dev servers to cloud browsers via Cloudflare tunnels.
# Example: Test local dev server with cloud browser
npm run dev & # localhost:3000
browser-use tunnel 3000 # → https://abc.trycloudflare.com
browser-use -b remote open https://abc.trycloudflare.com
browser-use cloud connect # Provision cloud browser
browser-use open https://abc.trycloudflare.com
```
## Profile Management
### Local Profiles (`-b real`)
| Command | Description |
|---------|-------------|
| `profile list` | List Chrome profiles |
| `profile cookies <name>` | Show cookies by domain |
| `profile sync --from <name>` | Sync local profile to cloud |
| `profile sync --from Default --domain youtube.com` | Sync specific domain only |
The `profile` subcommand delegates to the [profile-use](https://github.com/browser-use/profile-use) Go binary, which syncs local browser cookies to Browser-Use cloud.
### Cloud Profiles (`-b remote`)
| Command | Description |
|---------|-------------|
| `profile list` | List cloud profiles |
| `profile list --page 2 --page-size 50` | Pagination |
| `profile get <id>` | Get profile details |
| `profile create` | Create profile |
| `profile create --name "My Profile"` | Create with name |
| `profile update <id> --name <name>` | Rename profile |
| `profile delete <id>` | Delete profile |
## Local Session Management
The binary is managed at `~/.browser-use/bin/profile-use` and auto-downloaded on first use.
| Command | Description |
|---------|-------------|
| `sessions` | List active sessions |
| `close` | Close browser session |
| `profile` | Interactive sync wizard |
| `profile list` | List detected browsers and profiles |
| `profile sync --all` | Sync all profiles to cloud |
| `profile sync --browser "Google Chrome" --profile "Default"` | Sync specific profile |
| `profile auth --apikey <key>` | Set API key (shared with `cloud login`) |
| `profile inspect --browser "Google Chrome" --profile "Default"` | Inspect cookies locally |
| `profile update` | Download/update the profile-use binary |
## Session Management
| Command | Description |
|---------|-------------|
| `sessions` | List active browser sessions |
| `close` | Close current session's browser and daemon |
| `close --all` | Close all sessions |
| `server status` | Check if server is running |
| `server stop` | Stop server |
| `server logs` | View server logs |
| `--session NAME` | Target a named session (default: "default") |
```bash
# Default behavior unchanged
browser-use open https://example.com # uses session 'default'
browser-use state # talks to 'default' daemon
# Named sessions
browser-use --session work open https://example.com
browser-use --session work state
browser-use --session cloud cloud connect
# List active sessions
browser-use sessions
# Close specific session
browser-use --session work close
# Close all sessions
browser-use close --all
# Env var fallback
BROWSER_USE_SESSION=work browser-use state
```
## Global Options
| Option | Description |
|--------|-------------|
| `--session NAME` | Use named session (default: "default") |
| `--browser MODE` | Browser mode: chromium, real, remote |
| `--headed` | Show browser window |
| `--profile NAME` | Browser profile (local name or cloud ID) |
| `--profile [NAME]` | Use real Chrome (bare `--profile` uses "Default") |
| `--connect` | Auto-discover and connect to running Chrome via CDP |
| `--cdp-url <url>` | Connect to existing browser via CDP URL (`http://` or `ws://`) |
| `--session NAME` | Target a named session (default: "default", env: `BROWSER_USE_SESSION`) |
| `--json` | Output as JSON |
| `--api-key KEY` | Override API key |
| `--mcp` | Run as MCP server via stdin/stdout |
**Session behavior**: All commands without `--session` use the same "default" session. The browser stays open and is reused across commands. Use `--session NAME` to run multiple browsers in parallel.
## Examples
### Fill a Form
@@ -365,15 +336,6 @@ browser-use open https://news.ycombinator.com
browser-use eval "Array.from(document.querySelectorAll('.titleline a')).slice(0,5).map(a => a.textContent)"
```
### Multi-Session Workflow
```bash
browser-use --session work open https://work.example.com
browser-use --session personal open https://personal.example.com
browser-use --session work state
browser-use --session personal state
browser-use close --all
```
### Python Automation
```bash
browser-use open https://example.com
@@ -385,19 +347,6 @@ browser.screenshot('scrolled.png')
"
```
### Cloud Agent with Session Reuse
```bash
# Start task, keep session alive
browser-use -b remote run "Log into example.com" --keep-alive --no-wait
# → task_id: task-123, session_id: sess-456
# Check task status
browser-use task status task-123
# Run another task in same session (preserves login)
browser-use -b remote run "Go to settings" --session-id sess-456
```
## Claude Code Skill
For [Claude Code](https://claude.ai/code), a skill provides richer context for browser automation:
@@ -410,15 +359,34 @@ curl -o ~/.claude/skills/browser-use/SKILL.md \
## How It Works
The CLI uses a session server architecture:
The CLI uses a multi-session daemon architecture:
1. First command starts a background server (browser stays open)
1. First command starts a background daemon for that session (browser stays open)
2. Subsequent commands communicate via Unix socket (or TCP on Windows)
3. Browser persists across commands for fast interaction
4. Server auto-starts when needed, stops with `browser-use server stop`
4. Each `--session` gets its own daemon, socket, and PID file in `~/.browser-use/`
5. Daemon auto-starts when needed, auto-exits when browser dies, or stops with `browser-use close`
This gives you ~50ms command latency instead of waiting for browser startup each time.
### File Layout
All CLI-managed files live under `~/.browser-use/` (override with `BROWSER_USE_HOME`):
```
~/.browser-use/
├── config.json # API key, settings (shared with profile-use)
├── bin/
│ └── profile-use # Managed Go binary (auto-downloaded)
├── tunnels/
│ ├── {port}.json # Tunnel metadata
│ └── {port}.log # Tunnel logs
├── default.state.json # Daemon lifecycle state (phase, PID, config)
├── default.sock # Daemon socket (ephemeral)
├── default.pid # Daemon PID (ephemeral)
└── cli.log # Daemon log
```
<details>
<summary>Windows Troubleshooting</summary>
@@ -444,11 +412,11 @@ echo $env:PATH
& "C:\Program Files\Git\bin\bash.exe" -c 'browser-use --help'
```
### "Failed to start session server" error
### "Failed to start daemon" error
Kill zombie processes:
```powershell
# Find process on port
netstat -ano | findstr 49698
# Find browser-use Python processes
tasklist | findstr python
# Kill by PID
taskkill /PID <pid> /F

View File

@@ -1,14 +1,13 @@
"""Browser-use CLI package.
This package provides a fast command-line interface for browser automation.
The CLI uses a session server architecture for persistent browser sessions.
The CLI uses a daemon architecture for persistent browser sessions.
Usage:
browser-use open https://example.com
browser-use click 5
browser-use type "Hello World"
browser-use python "print(browser.url)"
browser-use run "Fill the contact form"
browser-use close
"""

View File

@@ -0,0 +1,201 @@
"""Direct action execution for CLI daemon — no event bus dispatch.
Wraps DefaultActionWatchdog methods and DomService for direct calling.
The watchdog instance is NOT registered on the event bus — it's just
used as a library of action implementations.
"""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any
from bubus import EventBus
from browser_use.browser.events import (
GoBackEvent,
SelectDropdownOptionEvent,
SendKeysEvent,
TypeTextEvent,
UploadFileEvent,
)
from browser_use.browser.watchdogs.default_action_watchdog import DefaultActionWatchdog
from browser_use.dom.service import DomService
from browser_use.dom.views import EnhancedDOMTreeNode, SerializedDOMState
if TYPE_CHECKING:
from browser_use.browser.session import BrowserSession
from browser_use.browser.views import BrowserStateSummary, PageInfo
logger = logging.getLogger('browser_use.skill_cli.actions')
class ActionHandler:
	"""Execute browser actions directly without the event bus.

	Uses DefaultActionWatchdog methods for complex actions (click, type, keys, etc.)
	and DomService for DOM snapshots. All other actions use direct CDP calls.
	"""

	def __init__(self, browser_session: BrowserSession) -> None:
		# The session is the shared handle every action below operates on.
		self.bs = browser_session
		# Create watchdog instance — NOT registered on event bus
		self._watchdog = DefaultActionWatchdog(
			event_bus=EventBus(),  # dummy, never dispatched to
			browser_session=browser_session,
		)
		# Created lazily on the first get_state() call.
		self._dom_service: DomService | None = None

	async def navigate(self, url: str) -> None:
		"""Navigate the focused tab to a URL."""
		assert self.bs.agent_focus_target_id is not None, 'No focused tab'
		await self.bs._navigate_and_wait(url, self.bs.agent_focus_target_id)

	async def click_element(self, node: EnhancedDOMTreeNode) -> dict[str, Any] | None:
		"""Click an element using the watchdog's full implementation (with fallbacks)."""
		return await self._watchdog._click_element_node_impl(node)

	async def click_coordinate(self, x: int, y: int) -> dict[str, Any] | None:
		"""Click at coordinates (x, y) via the watchdog's coordinate-click handler."""
		from browser_use.browser.events import ClickCoordinateEvent

		event = ClickCoordinateEvent(coordinate_x=x, coordinate_y=y)
		return await self._watchdog.on_ClickCoordinateEvent(event)

	async def type_text(self, node: EnhancedDOMTreeNode, text: str) -> dict[str, Any] | None:
		"""Type text into an element."""
		event = TypeTextEvent(node=node, text=text)
		return await self._watchdog.on_TypeTextEvent(event)

	async def scroll(self, direction: str, amount: int) -> None:
		"""Scroll the page using JS (CDP gesture doesn't work in --connect mode).

		direction is one of 'down'/'up'/'right'/'left'; amount is in pixels.
		"""
		# Translate direction into a signed (x, y) pixel delta for window.scrollBy.
		if direction in ('down', 'up'):
			x, y = 0, (amount if direction == 'down' else -amount)
		else:
			x, y = (amount if direction == 'right' else -amount), 0
		cdp_session = await self.bs.get_or_create_cdp_session()
		assert cdp_session is not None, 'No CDP session for scroll'
		await cdp_session.cdp_client.send.Runtime.evaluate(
			params={'expression': f'window.scrollBy({x}, {y})', 'awaitPromise': False},
			session_id=cdp_session.session_id,
		)

	async def go_back(self) -> None:
		"""Go back in history."""
		event = GoBackEvent()
		await self._watchdog.on_GoBackEvent(event)

	async def send_keys(self, keys: str) -> None:
		"""Send keyboard keys (e.g. 'Enter', 'Control+a')."""
		event = SendKeysEvent(keys=keys)
		await self._watchdog.on_SendKeysEvent(event)

	async def select_dropdown(self, node: EnhancedDOMTreeNode, text: str) -> dict[str, str]:
		"""Select a dropdown option by its text/value."""
		event = SelectDropdownOptionEvent(node=node, text=text)
		return await self._watchdog.on_SelectDropdownOptionEvent(event)

	async def upload_file(self, node: EnhancedDOMTreeNode, file_path: str) -> None:
		"""Upload a file to a file input element."""
		event = UploadFileEvent(node=node, file_path=file_path)
		await self._watchdog.on_UploadFileEvent(event)

	async def get_state(self) -> BrowserStateSummary:
		"""Build DOM via DomService directly (no DOMWatchdog, no event bus).

		Returns a BrowserStateSummary holding the serialized DOM, a base64
		screenshot, tab list, page metrics, and auto-dismissed popup messages.
		DOM/screenshot failures are logged and degrade to empty values
		rather than raising.
		"""
		from browser_use.browser.views import BrowserStateSummary, PageInfo

		if self._dom_service is None:
			self._dom_service = DomService(browser_session=self.bs)
		page_url = await self.bs.get_current_page_url()
		# Fast path for non-http pages
		if page_url.lower().split(':', 1)[0] not in ('http', 'https'):
			return BrowserStateSummary(
				dom_state=SerializedDOMState(_root=None, selector_map={}),
				url=page_url,
				title='Empty Tab',
				tabs=await self.bs.get_tabs(),
				screenshot=None,
				page_info=None,
			)
		# Build DOM and take screenshot in parallel
		import asyncio

		dom_task = asyncio.create_task(self._dom_service.get_serialized_dom_tree())
		screenshot_task = asyncio.create_task(self.bs.take_screenshot())
		dom_state: SerializedDOMState | None = None
		screenshot_b64: str | None = None
		try:
			dom_state, _tree, _timing = await dom_task
		except Exception as e:
			# Degrade to an empty DOM rather than failing the whole state call.
			logger.warning(f'DOM build failed: {e}')
			dom_state = SerializedDOMState(_root=None, selector_map={})
		try:
			screenshot_bytes = await screenshot_task
			import base64

			screenshot_b64 = base64.b64encode(screenshot_bytes).decode() if screenshot_bytes else None
		except Exception as e:
			logger.warning(f'Screenshot failed: {e}')
		# Update cached selector map for element lookups
		if dom_state and dom_state.selector_map:
			self.bs.update_cached_selector_map(dom_state.selector_map)
		# Get page info
		page_info: PageInfo | None = None
		try:
			cdp_session = await self.bs.get_or_create_cdp_session(target_id=None, focus=False)
			if cdp_session:
				metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)
				# Prefer CSS-pixel metrics; fall back to the legacy device-pixel
				# keys ('contentSize'/'visualViewport') on older CDP targets.
				css_metrics = metrics.get('cssLayoutViewport', {})
				content_size = metrics.get('cssContentSize', metrics.get('contentSize', {}))
				visual_viewport = metrics.get('cssVisualViewport', metrics.get('visualViewport', {}))
				page_info = PageInfo(
					viewport_width=int(css_metrics.get('clientWidth', 0)),
					viewport_height=int(css_metrics.get('clientHeight', 0)),
					page_width=int(content_size.get('width', 0)),
					page_height=int(content_size.get('height', 0)),
					scroll_x=int(visual_viewport.get('pageX', 0)),
					scroll_y=int(visual_viewport.get('pageY', 0)),
					pixels_above=int(visual_viewport.get('pageY', 0)),
					# Content below the viewport: total height minus viewport
					# height minus what is already scrolled past, clamped at 0.
					pixels_below=max(
						0,
						int(content_size.get('height', 0))
						- int(css_metrics.get('clientHeight', 0))
						- int(visual_viewport.get('pageY', 0)),
					),
					pixels_left=0,
					pixels_right=0,
				)
		except Exception as e:
			logger.debug(f'Failed to get page info: {e}')
		tabs = await self.bs.get_tabs()
		# Use focused tab's title, not tabs[0]
		title = ''
		focused_id = self.bs.agent_focus_target_id
		found_focused = False
		for tab in tabs:
			if tab.target_id == focused_id:
				title = tab.title
				found_focused = True
				break
		if not found_focused and tabs:
			# Fall back to the first tab's title when the focused tab is unknown.
			title = tabs[0].title
		return BrowserStateSummary(
			dom_state=dom_state,
			url=page_url,
			title=title,
			tabs=tabs,
			screenshot=screenshot_b64,
			page_info=page_info,
			closed_popup_messages=self.bs._closed_popup_messages.copy(),
		)

View File

@@ -1,167 +0,0 @@
"""API key management for browser-use CLI."""
import json
import os
import sys
from pathlib import Path
class APIKeyRequired(Exception):
	"""Signals that an API key was needed but could not be obtained."""
def get_config_path() -> Path:
	"""Locate the browser-use config file (config.json) for this platform.

	Windows: %APPDATA%/browser-use/config.json (home dir when APPDATA is unset).
	Elsewhere: $XDG_CONFIG_HOME/browser-use/config.json, defaulting to
	~/.config/browser-use/config.json.
	"""
	env_var, fallback = (
		('APPDATA', Path.home()) if sys.platform == 'win32' else ('XDG_CONFIG_HOME', Path.home() / '.config')
	)
	return Path(os.environ.get(env_var, fallback)) / 'browser-use' / 'config.json'
def require_api_key(feature: str = 'this feature') -> str:
	"""Get API key or raise helpful error.

	Checks in order:
	1. BROWSER_USE_API_KEY environment variable
	2. Config file (~/.config/browser-use/config.json)
	3. Interactive prompt (if TTY)
	4. Raises APIKeyRequired with helpful message
	"""
	# 1. Check environment
	key = os.environ.get('BROWSER_USE_API_KEY')
	if key:
		return key
	# 2. Check config file
	config_path = get_config_path()
	if config_path.exists():
		try:
			config = json.loads(config_path.read_text())
			if key := config.get('api_key'):
				return key
		except Exception:
			# Unreadable or invalid config is treated as "no key configured".
			pass
	# 3. Interactive prompt (if TTY) — only when both stdin and stdout are
	# terminals, so piped/scripted invocations never block waiting for input.
	if sys.stdin.isatty() and sys.stdout.isatty():
		return prompt_for_api_key(feature)
	# 4. Error with helpful message
	raise APIKeyRequired(
		f"""
╭─────────────────────────────────────────────────────────────╮
│ 🔑 Browser-Use API Key Required │
│ │
{feature} requires an API key. │
│ │
│ Get yours at: https://browser-use.com/new-api-key │
│ │
│ Then set it via: │
│ export BROWSER_USE_API_KEY=your_key_here │
│ │
│ Or add to {config_path}: │
{{"api_key": "your_key_here"}}
╰─────────────────────────────────────────────────────────────╯
"""
	)
def prompt_for_api_key(feature: str) -> str:
	"""Interactive prompt for API key.

	Raises APIKeyRequired when the user provides no key or aborts the prompt.
	Optionally persists the entered key via save_api_key().
	"""
	print(
		f"""
╭─────────────────────────────────────────────────────────────╮
│ 🔑 Browser-Use API Key Required │
│ │
{feature} requires an API key. │
│ Get yours at: https://browser-use.com/new-api-key │
╰─────────────────────────────────────────────────────────────╯
"""
	)
	try:
		key = input('Enter API key: ').strip()
	except (EOFError, KeyboardInterrupt):
		# Ctrl-C / Ctrl-D during entry counts as "no key provided".
		raise APIKeyRequired('No API key provided')
	if not key:
		raise APIKeyRequired('No API key provided')
	try:
		save = input('Save to config? [y/N]: ').strip().lower()
		if save == 'y':
			save_api_key(key)
	except (EOFError, KeyboardInterrupt):
		# Declining to answer the save prompt is not an error; still return the key.
		pass
	return key
def save_api_key(key: str) -> None:
	"""Persist the API key into the browser-use config file.

	Merges into any existing config, then restricts the file to owner-only
	(0600) permissions and prints the saved location.
	"""
	path = get_config_path()
	path.parent.mkdir(parents=True, exist_ok=True)
	existing: dict = {}
	if path.exists():
		try:
			existing = json.loads(path.read_text())
		except Exception:
			# Corrupt config: start fresh rather than failing the save.
			existing = {}
	existing['api_key'] = key
	path.write_text(json.dumps(existing, indent=2))
	# Restrict permissions to owner only (0600)
	path.chmod(0o600)
	print(f'Saved to {path}')
def get_api_key() -> str | None:
	"""Best-effort API key lookup: returns None instead of raising."""
	try:
		key = require_api_key('API key check')
	except APIKeyRequired:
		return None
	return key
def check_api_key() -> dict[str, bool | str | None]:
"""Check API key availability without interactive prompts.
Returns:
Dict with keys:
- 'available': bool - whether API key is configured
- 'source': str | None - where it came from ('env', 'config', or None)
- 'key_prefix': str | None - first 8 chars of key (for display)
"""
# Check environment
key = os.environ.get('BROWSER_USE_API_KEY')
if key:
return {
'available': True,
'source': 'env',
'key_prefix': key[:8] if len(key) >= 8 else key,
}
# Check config file
config_path = get_config_path()
if config_path.exists():
try:
config = json.loads(config_path.read_text())
if key := config.get('api_key'):
return {
'available': True,
'source': 'config',
'key_prefix': key[:8] if len(key) >= 8 else key,
}
except Exception:
pass
# Not available
return {
'available': False,
'source': None,
'key_prefix': None,
}

View File

@@ -0,0 +1,225 @@
"""Lightweight BrowserSession subclass for the CLI daemon.
Skips watchdogs, event bus handlers, and auto-reconnect for ALL modes.
Launches browser if needed, then calls connect() directly.
All inherited methods (get_element_by_index, take_screenshot, etc.)
work because this IS a BrowserSession.
"""
from __future__ import annotations
import logging
import psutil
from browser_use.browser.session import BrowserSession
logger = logging.getLogger('browser_use.skill_cli.browser')
class CLIBrowserSession(BrowserSession):
	"""BrowserSession that skips watchdogs and event bus for all modes.

	For --connect: connects to existing Chrome via CDP URL.
	For managed Chromium: launches browser, gets CDP URL, connects.
	For cloud: provisions browser, gets CDP URL, connects.

	All three modes converge at connect() — no watchdogs, no event bus.
	"""

	# Process handle for a locally-launched browser; stays None when we only
	# attached to an existing/remote browser and therefore don't own a process.
	_browser_process: psutil.Process | None = None  # type: ignore[assignment]

	async def start(self) -> None:
		"""Launch/provision browser if needed, then connect lightweight."""
		if self.cdp_url:
			# --connect or --cdp-url: CDP URL already known
			pass
		elif self.browser_profile.use_cloud:
			# Cloud: provision browser via API
			await self._provision_cloud_browser()
		else:
			# Managed Chromium: launch browser process
			await self._launch_local_browser()
		# All modes: lightweight CDP connection (no watchdogs)
		await self.connect()
		# Prevent heavy monitoring on future tabs
		if self.session_manager:

			async def _noop(cdp_session: object) -> None:
				pass

			self.session_manager._enable_page_monitoring = _noop  # type: ignore[assignment]
		# Disable auto-reconnect — daemon should die when CDP drops
		self._intentional_stop = True
		# Register popup/dialog handler so JS alerts don't freeze Chrome
		await self._register_dialog_handler()

	async def _register_dialog_handler(self) -> None:
		"""Register CDP handler to auto-dismiss JS dialogs (alert, confirm, prompt).

		Without this, any JS dialog freezes all CDP commands until manually dismissed.
		Messages are stored in _closed_popup_messages for inclusion in state output.
		"""
		import asyncio as _asyncio

		if not self._cdp_client_root:
			return

		async def handle_dialog(event_data: dict, session_id: str | None = None) -> None:
			try:
				dialog_type = event_data.get('type', 'alert')
				message = event_data.get('message', '')
				if message:
					self._closed_popup_messages.append(f'[{dialog_type}] {message}')
				# Accept alerts/confirms/beforeunload, dismiss prompts
				should_accept = dialog_type in ('alert', 'confirm', 'beforeunload')
				logger.info(f'Auto-{"accepting" if should_accept else "dismissing"} {dialog_type}: {message[:100]}')
				if not self._cdp_client_root:
					return
				# Short timeout: if the dialog is already gone, don't hang the handler.
				await _asyncio.wait_for(
					self._cdp_client_root.send.Page.handleJavaScriptDialog(
						params={'accept': should_accept},
						session_id=session_id,
					),
					timeout=0.5,
				)
			except Exception:
				# Best-effort: a failed dismissal must never crash the daemon.
				pass

		# Try to enable Page domain on root client (may fail — not all CDP targets support it)
		try:
			await self._cdp_client_root.send.Page.enable()
		except Exception:
			pass
		self._cdp_client_root.register.Page.javascriptDialogOpening(handle_dialog)  # type: ignore[arg-type]

	async def _launch_local_browser(self) -> None:
		"""Launch Chromium using LocalBrowserWatchdog's launch logic."""
		from bubus import EventBus

		from browser_use.browser.watchdogs.local_browser_watchdog import LocalBrowserWatchdog

		# Instantiate watchdog as plain object — NOT registered on event bus
		launcher = LocalBrowserWatchdog(event_bus=EventBus(), browser_session=self)
		process, cdp_url = await launcher._launch_browser()
		self._browser_process = process
		self.browser_profile.cdp_url = cdp_url
		logger.info(f'Launched browser (PID {process.pid}), CDP: {cdp_url}')

	async def _provision_cloud_browser(self) -> None:
		"""Provision a cloud browser and set the CDP URL.

		Raises CloudBrowserAuthError when no API key is configured. On a
		profile-related provisioning failure, creates a fresh cloud profile
		and retries once.
		"""
		import os

		from browser_use.browser.cloud.views import CreateBrowserRequest

		# Override cloud API base URL if set (CLI injects this into daemon env).
		# CloudBrowserClient expects the host URL (it appends /api/v2/... internally).
		cloud_base = os.environ.get('BROWSER_USE_CLOUD_BASE_URL')
		if cloud_base:
			self._cloud_browser_client.api_base_url = cloud_base.rstrip('/')
		# Ensure CLI has an API key from config.json before proceeding.
		from browser_use.skill_cli.config import get_config_value

		if not get_config_value('api_key'):
			from browser_use.browser.cloud.views import CloudBrowserAuthError

			raise CloudBrowserAuthError(
				'No API key configured. Run `browser-use cloud login <key>` or `browser-use cloud signup`.'
			)
		cloud_params = self.browser_profile.cloud_browser_params or CreateBrowserRequest()
		# Set recording from CLI config (defaults to True)
		# NOTE(review): redundant re-import — get_config_value is already in scope from above.
		from browser_use.skill_cli.config import get_config_value

		cloud_params.enable_recording = bool(get_config_value('cloud_connect_recording'))
		try:
			cloud_response = await self._cloud_browser_client.create_browser(cloud_params)
		except Exception as e:
			# If profile is invalid, create a new one and retry once.
			# NOTE(review): string-matching on 'profile'/'422' is a heuristic —
			# confirm against CloudBrowserClient's actual error types.
			if 'profile' in str(e).lower() or '422' in str(e):
				logger.info('Cloud profile invalid, creating new one and retrying')
				from browser_use.skill_cli.commands.cloud import _create_cloud_profile_inner

				api_key = get_config_value('api_key')
				if not api_key:
					raise
				new_profile_id = _create_cloud_profile_inner(str(api_key))
				cloud_params.profile_id = new_profile_id
				cloud_response = await self._cloud_browser_client.create_browser(cloud_params)
			else:
				raise
		self.browser_profile.cdp_url = cloud_response.cdpUrl
		self.browser_profile.is_local = False
		logger.info(f'Cloud browser provisioned, CDP: {cloud_response.cdpUrl}')

	async def stop(self) -> None:
		"""Disconnect from the browser.

		For --connect/--cdp-url: just close the websocket (we don't own the browser).
		For cloud: stop the remote browser via API before disconnecting.
		"""
		self._intentional_stop = True
		# Stop cloud browser if we provisioned one
		if self.browser_profile.use_cloud and self._cloud_browser_client.current_session_id:
			try:
				import asyncio as _asyncio

				await _asyncio.wait_for(self._cloud_browser_client.stop_browser(), timeout=5.0)
			except Exception as e:
				logger.debug(f'Error stopping cloud browser: {e}')
		if self._cdp_client_root:
			try:
				await self._cdp_client_root.stop()
			except Exception as e:
				logger.debug(f'Error closing CDP client: {e}')
			self._cdp_client_root = None  # type: ignore[assignment]
		if self.session_manager:
			try:
				await self.session_manager.clear()
			except Exception as e:
				logger.debug(f'Error clearing session manager: {e}')
			self.session_manager = None
		# Drop per-connection state so a stale focus target / selector map
		# can't leak into a future connection.
		self.agent_focus_target_id = None
		self._cached_selector_map.clear()

	async def kill(self) -> None:
		"""Send Browser.close to kill the browser, then disconnect.

		For managed Chromium: sends Browser.close CDP command + terminates process.
		"""
		if self._cdp_client_root:
			try:
				await self._cdp_client_root.send.Browser.close()
			except Exception:
				pass
		await self.stop()
		# Force kill the process if we launched it and it's still alive
		if self._browser_process:
			try:
				if self._browser_process.is_running():
					self._browser_process.terminate()
					self._browser_process.wait(timeout=5)
			except Exception:
				# terminate() failed or timed out — escalate to a hard kill.
				try:
					self._browser_process.kill()
				except Exception:
					pass
			self._browser_process = None

	@property
	def is_cdp_connected(self) -> bool:
		"""Check if CDP WebSocket connection is alive."""
		if self._cdp_client_root is None or self._cdp_client_root.ws is None:
			return False
		try:
			from websockets.protocol import State

			return self._cdp_client_root.ws.state is State.OPEN
		except Exception:
			return False

View File

@@ -1,23 +1,15 @@
"""Command handlers for browser-use CLI."""
from browser_use.skill_cli.commands import (
agent,
browser,
cloud_session,
cloud_task,
doctor,
python_exec,
session,
setup,
)
__all__ = [
'agent',
'browser',
'cloud_session',
'cloud_task',
'doctor',
'python_exec',
'session',
'setup',
]

View File

@@ -1,335 +0,0 @@
"""Agent task command handler."""
import logging
import os
from typing import Any
from browser_use.skill_cli.api_key import APIKeyRequired, require_api_key
from browser_use.skill_cli.sessions import SessionInfo
logger = logging.getLogger(__name__)
# Cloud-only flags that only work in remote mode
CLOUD_ONLY_FLAGS = [
	# Session routing / proxy selection
	'session_id',
	'proxy_country',
	# Execution behavior
	'wait',
	'stream',
	'flash',
	'keep_alive',
	'thinking',
	# Task configuration
	'start_url',
	'metadata',
	'secret',
	'allowed_domain',
	'skill_id',
	# Structured output & evaluation
	'structured_output',
	'judge',
	'judge_ground_truth',
]
async def handle(session: SessionInfo, params: dict[str, Any]) -> Any:
	"""Handle agent run command.

	Routes based on browser mode:
	- Remote mode (--browser remote): Uses Cloud API with US proxy by default
	- Local mode (default): Uses local browser-use agent

	Returns a result dict; on failure it has 'success': False and an 'error'
	message suitable for direct display to the user.
	"""
	task = params.get('task')
	if not task:
		return {'success': False, 'error': 'No task provided'}
	# Route based on browser mode
	if session.browser_mode == 'remote':
		# Remote mode requires Browser-Use API key
		try:
			require_api_key('Cloud agent tasks')
		except APIKeyRequired as e:
			return {'success': False, 'error': str(e)}
		return await _handle_cloud_task(params)
	else:
		# Check if user tried to use cloud-only flags in local mode
		used_cloud_flags = [f for f in CLOUD_ONLY_FLAGS if params.get(f)]
		if used_cloud_flags:
			from browser_use.skill_cli.install_config import is_mode_available

			# Render flag names back in CLI form (snake_case -> --kebab-case).
			flags_str = ', '.join(f'--{f.replace("_", "-")}' for f in used_cloud_flags)
			if is_mode_available('remote'):
				# Remote is available, user just needs to use it
				return {
					'success': False,
					'error': f'Cloud-only flags used in local mode: {flags_str}\nUse --browser remote to enable cloud features.',
				}
			else:
				# Remote not installed (--local-only install)
				return {
					'success': False,
					'error': f'Cloud-only flags require remote mode: {flags_str}\n'
					f'Remote mode is not installed. Reinstall to enable:\n'
					f' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only\n'
					f' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full',
				}
		return await _handle_local_task(session, params)
async def _handle_cloud_task(params: dict[str, Any]) -> Any:
    """Handle task execution via Cloud API.

    By default uses US proxy for all cloud tasks.

    Args:
        params: Parsed CLI flags; 'task' is required, everything else optional.

    Returns:
        Result dict: immediately with a task_id unless '--wait' was given, in
        which case the task is polled to completion first.
    """
    from browser_use.skill_cli.commands import cloud_session, cloud_task

    task = params['task']
    # Handle vision flag (--vision vs --no-vision); None means "API default".
    vision: bool | None = None
    if params.get('vision'):
        vision = True
    elif params.get('no_vision'):
        vision = False
    # Parse key=value list params
    metadata = _parse_key_value_list(params.get('metadata'))
    secrets = _parse_key_value_list(params.get('secret'))
    # Build session params - only include what user explicitly set
    session_id = params.get('session_id')
    profile_id = params.get('profile')
    proxy_country = params.get('proxy_country')
    try:
        logger.info(f'Creating cloud task: {task}')
        # Create session first if profile or proxy specified and no session_id
        if (profile_id or proxy_country) and not session_id:
            session = cloud_session.create_session(
                profile_id=profile_id,
                proxy_country=proxy_country,
                keep_alive=params.get('keep_alive'),
            )
            session_id = session.id
            logger.info(f'Created cloud session: {session_id}')
        # Create cloud task - only pass what user explicitly set
        task_response = cloud_task.create_task(
            task=task,
            llm=params.get('llm'),
            session_id=session_id,
            max_steps=params.get('max_steps'),
            flash_mode=params.get('flash'),
            thinking=params.get('thinking'),
            vision=vision,
            start_url=params.get('start_url'),
            metadata=metadata,
            secrets=secrets,
            allowed_domains=params.get('allowed_domain'),
            skill_ids=params.get('skill_id'),
            structured_output=params.get('structured_output'),
            judge=params.get('judge'),
            judge_ground_truth=params.get('judge_ground_truth'),
        )
        task_id = task_response.id
        response_session_id = task_response.session_id
        if not task_id:
            # Defensive: API responded but without an id we cannot poll.
            return {
                'success': False,
                'error': 'Cloud API did not return a task ID',
                'task': task,
            }
        logger.info(f'Cloud task created: {task_id}')
        # Return immediately unless --wait is specified
        if not params.get('wait'):
            return {
                'success': True,
                'task_id': task_id,
                'session_id': response_session_id,
                'message': 'Task started. Use "browser-use task status <task_id>" to check progress.',
            }
        # Poll until complete
        logger.info('Waiting for task completion...')
        result = await cloud_task.poll_until_complete(task_id, stream=params.get('stream', False))
        return {
            'success': True,
            'task': task,
            'task_id': task_id,
            'session_id': response_session_id,
            'status': result.status,
            'output': result.output,
            'cost': result.cost,
            'done': result.status == 'finished',
        }
    except Exception as e:
        # Broad catch: any failure is reported to the CLI user as an error dict.
        logger.exception(f'Cloud task failed: {e}')
        return {
            'success': False,
            'error': str(e),
            'task': task,
        }
def _parse_key_value_list(items: list[str] | None) -> dict[str, str | None] | None:
"""Parse a list of 'key=value' strings into a dict."""
if not items:
return None
result: dict[str, str | None] = {}
for item in items:
if '=' in item:
key, value = item.split('=', 1)
result[key] = value
return result if result else None
async def _handle_local_task(session: SessionInfo, params: dict[str, Any]) -> Any:
    """Handle task execution locally with browser-use agent.

    Args:
        session: Active CLI session providing the local browser_session.
        params: Parsed CLI flags; 'task' required, 'max_steps'/'llm' optional.

    Returns:
        Result dict with 'success', step count and final result, or 'error'.
    """
    import inspect

    task = params['task']
    max_steps = params.get('max_steps')
    model = params.get('llm')  # Optional model override
    try:
        # Import agent and LLM
        from browser_use.agent.service import Agent

        # BUGFIX: get_llm() in this module is a plain synchronous function, but
        # this call site previously did `await get_llm(...)`, which raises
        # TypeError at runtime (the return value is not awaitable). Call it
        # directly, and only await if an async implementation ever returns one.
        llm = get_llm(model=model)
        if inspect.isawaitable(llm):
            llm = await llm
        if llm is None:
            if model:
                return {
                    'success': False,
                    'error': f'Could not initialize model "{model}". '
                    f'Make sure the appropriate API key is set (OPENAI_API_KEY, ANTHROPIC_API_KEY, or GOOGLE_API_KEY).',
                }
            return {
                'success': False,
                'error': 'No LLM configured. Set BROWSER_USE_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, or GOOGLE_API_KEY',
            }
        # Create and run agent
        agent = Agent(
            task=task,
            llm=llm,
            browser_session=session.browser_session,
        )
        logger.info(f'Running local agent task: {task}')
        run_kwargs = {}
        if max_steps is not None:
            run_kwargs['max_steps'] = max_steps
        result = await agent.run(**run_kwargs)
        # Extract result info
        final_result = result.final_result() if result else None
        return {
            'success': True,
            'task': task,
            'steps': len(result) if result else 0,
            'result': str(final_result) if final_result else None,
            'done': result.is_done() if result else False,
        }
    except Exception as e:
        logger.exception(f'Local agent task failed: {e}')
        return {
            'success': False,
            'error': str(e),
            'task': task,
        }
def _get_verified_models() -> dict[str, set[str]]:
    """Extract verified model names from SDK sources of truth.

    Returns:
        Mapping of provider name ('openai', 'anthropic', 'google',
        'browser-use') to the set of model identifiers derived from each SDK's
        published Literal types. No network calls are made.
    """
    import typing

    from anthropic.types.model_param import ModelParam
    from openai.types.shared.chat_model import ChatModel

    from browser_use.llm.google.chat import VerifiedGeminiModels

    # OpenAI: ChatModel is a Literal type
    openai_models = set(typing.get_args(ChatModel))
    # Anthropic: ModelParam is Union[Literal[...], str] - extract the Literal
    anthropic_literal = typing.get_args(ModelParam)[0]
    anthropic_models = set(typing.get_args(anthropic_literal))
    # Google: VerifiedGeminiModels Literal
    google_models = set(typing.get_args(VerifiedGeminiModels))
    # Browser-Use: cloud models (hard-coded; no SDK Literal to introspect)
    browser_use_models = {'bu-latest', 'bu-1-0', 'bu-2-0'}
    return {
        'openai': openai_models,
        'anthropic': anthropic_models,
        'google': google_models,
        'browser-use': browser_use_models,
    }
_VERIFIED_MODELS: dict[str, set[str]] | None = None
def _get_provider_for_model(model: str) -> str | None:
    """Resolve which provider's verified model list contains `model`, if any."""
    global _VERIFIED_MODELS
    if _VERIFIED_MODELS is None:
        # Build lazily and cache at module level: the SDK imports are heavy.
        _VERIFIED_MODELS = _get_verified_models()
    return next(
        (provider for provider, names in _VERIFIED_MODELS.items() if model in names),
        None,
    )
def get_llm(model: str | None = None) -> Any:
    """Get LLM instance from environment configuration.

    Args:
        model: Optional model name to use. If provided, will instantiate
            the appropriate provider for that model. If not provided,
            auto-detects from available API keys.

    Returns:
        A chat model instance, or None when no provider could be resolved.

    Supported providers: OpenAI, Anthropic, Google, Browser-Use.
    Model names are validated against each SDK's verified model list.

    NOTE(review): this is a plain synchronous function -- call sites must not
    unconditionally await its return value.
    """
    from browser_use.llm import ChatAnthropic, ChatBrowserUse, ChatGoogle, ChatOpenAI

    if model:
        provider = _get_provider_for_model(model)
        if provider == 'openai':
            return ChatOpenAI(model=model)
        elif provider == 'anthropic':
            return ChatAnthropic(model=model)
        elif provider == 'google':
            return ChatGoogle(model=model)
        elif provider == 'browser-use':
            return ChatBrowserUse(model=model)
        else:
            # Unknown model name: warn and fall through to None (caller reports).
            logger.warning(f'Unknown model: {model}. Not in any verified model list.')
            return None
    # No model specified - auto-detect from available API keys
    # (priority order: Browser-Use, then OpenAI, Anthropic, Google)
    if os.environ.get('BROWSER_USE_API_KEY'):
        return ChatBrowserUse()
    if os.environ.get('OPENAI_API_KEY'):
        return ChatOpenAI(model='o3')
    if os.environ.get('ANTHROPIC_API_KEY'):
        return ChatAnthropic(model='claude-sonnet-4-0')
    if os.environ.get('GOOGLE_API_KEY'):
        return ChatGoogle(model='gemini-flash-latest')
    return None

View File

@@ -19,10 +19,10 @@ COMMANDS = {
'back',
'screenshot',
'state',
'switch',
'close-tab',
'tab',
'keys',
'select',
'upload',
'eval',
'extract',
'cookies',
@@ -81,18 +81,16 @@ async def _get_element_center(session: SessionInfo, node: Any) -> tuple[float, f
async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> Any:
"""Handle browser control command."""
bs = session.browser_session
actions = session.actions
if actions is None:
return {'error': 'ActionHandler not initialized'}
if action == 'open':
url = params['url']
# Ensure URL has scheme
if not url.startswith(('http://', 'https://', 'file://')):
url = 'https://' + url
from browser_use.browser.events import NavigateToUrlEvent
await bs.event_bus.dispatch(NavigateToUrlEvent(url=url))
await actions.navigate(url)
result: dict[str, Any] = {'url': url}
# Add live preview URL for cloud browsers
if bs.browser_profile.use_cloud and bs.cdp_url:
from urllib.parse import quote
@@ -100,18 +98,22 @@ async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> A
return result
elif action == 'click':
from browser_use.browser.events import ClickElementEvent
index = params['index']
# Look up node from selector map
node = await bs.get_element_by_index(index)
if node is None:
return {'error': f'Element index {index} not found - page may have changed'}
await bs.event_bus.dispatch(ClickElementEvent(node=node))
return {'clicked': index}
args = params.get('args', [])
if len(args) == 2:
x, y = args
await actions.click_coordinate(x, y)
return {'clicked_coordinate': {'x': x, 'y': y}}
elif len(args) == 1:
index = args[0]
node = await bs.get_element_by_index(index)
if node is None:
return {'error': f'Element index {index} not found - page may have changed'}
await actions.click_element(node)
return {'clicked': index}
else:
return {'error': 'Usage: click <index> or click <x> <y>'}
elif action == 'type':
# Type into currently focused element using CDP directly
text = params['text']
cdp_session = await bs.get_or_create_cdp_session(target_id=None, focus=False)
if not cdp_session:
@@ -123,30 +125,23 @@ async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> A
return {'typed': text}
elif action == 'input':
from browser_use.browser.events import ClickElementEvent, TypeTextEvent
index = params['index']
text = params['text']
# Look up node from selector map
node = await bs.get_element_by_index(index)
if node is None:
return {'error': f'Element index {index} not found - page may have changed'}
await bs.event_bus.dispatch(ClickElementEvent(node=node))
await bs.event_bus.dispatch(TypeTextEvent(node=node, text=text))
await actions.click_element(node)
await actions.type_text(node, text)
return {'input': text, 'element': index}
elif action == 'scroll':
from browser_use.browser.events import ScrollEvent
direction = params.get('direction', 'down')
amount = params.get('amount', 500)
await bs.event_bus.dispatch(ScrollEvent(direction=direction, amount=amount))
await actions.scroll(direction, amount)
return {'scrolled': direction, 'amount': amount}
elif action == 'back':
from browser_use.browser.events import GoBackEvent
await bs.event_bus.dispatch(GoBackEvent())
await actions.go_back()
return {'back': True}
elif action == 'screenshot':
@@ -161,59 +156,133 @@ async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> A
return {'screenshot': base64.b64encode(data).decode(), 'size': len(data)}
elif action == 'state':
# Return the same LLM representation that browser-use agents see
state_text = await bs.get_state_as_text()
state = await actions.get_state()
assert state.dom_state is not None
state_text = state.dom_state.llm_representation()
# Prepend viewport dimensions
if state.page_info:
pi = state.page_info
viewport_text = f'viewport: {pi.viewport_width}x{pi.viewport_height}\n'
viewport_text += f'page: {pi.page_width}x{pi.page_height}\n'
viewport_text += f'scroll: ({pi.scroll_x}, {pi.scroll_y})\n'
state_text = viewport_text + state_text
# Append auto-dismissed popup messages
if bs._closed_popup_messages:
state_text += '\nAuto-closed dialogs:\n'
for msg in bs._closed_popup_messages:
state_text += f' {msg}\n'
bs._closed_popup_messages.clear()
return {'_raw_text': state_text}
elif action == 'switch':
from browser_use.browser.events import SwitchTabEvent
elif action == 'tab':
tab_command = params.get('tab_command')
tab_index = params['tab']
# Get target_id from tab index
page_targets = bs.session_manager.get_all_page_targets() if bs.session_manager else []
if tab_index < 0 or tab_index >= len(page_targets):
return {'error': f'Invalid tab index {tab_index}. Available: 0-{len(page_targets) - 1}'}
target_id = page_targets[tab_index].target_id
await bs.event_bus.dispatch(SwitchTabEvent(target_id=target_id))
return {'switched': tab_index}
if tab_command == 'list':
page_targets = bs.session_manager.get_all_page_targets() if bs.session_manager else []
lines = ['TAB URL']
for i, t in enumerate(page_targets):
lines.append(f'{i:<4} {t.url}')
return {'_raw_text': '\n'.join(lines)}
elif action == 'close-tab':
from browser_use.browser.events import CloseTabEvent
elif tab_command == 'new':
url = params.get('url', 'about:blank')
target_id = await bs._cdp_create_new_page(url, background=True)
bs.agent_focus_target_id = target_id
return {'created': target_id[:8], 'url': url}
tab_index = params.get('tab')
# Get target_id from tab index
page_targets = bs.session_manager.get_all_page_targets() if bs.session_manager else []
if tab_index is not None:
elif tab_command == 'switch':
tab_index = params['tab']
page_targets = bs.session_manager.get_all_page_targets() if bs.session_manager else []
if tab_index < 0 or tab_index >= len(page_targets):
return {'error': f'Invalid tab index {tab_index}. Available: 0-{len(page_targets) - 1}'}
target_id = page_targets[tab_index].target_id
else:
# Close current/focused tab
target_id = bs.session_manager.get_focused_target().target_id if bs.session_manager else None
if not target_id:
return {'error': 'No focused tab to close'}
await bs.event_bus.dispatch(CloseTabEvent(target_id=target_id))
return {'closed': tab_index}
bs.agent_focus_target_id = page_targets[tab_index].target_id
return {'switched': tab_index}
elif tab_command == 'close':
tab_indices = params.get('tabs', [])
page_targets = bs.session_manager.get_all_page_targets() if bs.session_manager else []
async def _close_target(tid: str) -> None:
cdp_session = await bs.get_or_create_cdp_session(target_id=None, focus=False)
if cdp_session:
await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': tid})
if not tab_indices:
# Use caller's logical focus, not Chrome's global focus
target_id = bs.agent_focus_target_id
if not target_id:
target_id = bs.session_manager.get_focused_target().target_id if bs.session_manager else None
if not target_id:
return {'error': 'No focused tab to close'}
await _close_target(target_id)
return {'closed': [0]}
closed = []
errors = []
for idx in sorted(tab_indices, reverse=True):
if idx < 0 or idx >= len(page_targets):
errors.append(f'Tab {idx} out of range')
continue
try:
await _close_target(page_targets[idx].target_id)
closed.append(idx)
except Exception as e:
errors.append(f'Tab {idx}: {e}')
result: dict[str, Any] = {'closed': closed}
if errors:
result['errors'] = errors
return result
return {'error': 'Invalid tab command. Use: list, new, switch, close'}
elif action == 'keys':
from browser_use.browser.events import SendKeysEvent
keys = params['keys']
await bs.event_bus.dispatch(SendKeysEvent(keys=keys))
await actions.send_keys(keys)
return {'sent': keys}
elif action == 'select':
from browser_use.browser.events import SelectDropdownOptionEvent
index = params['index']
value = params['value']
# Look up node from selector map
node = await bs.get_element_by_index(index)
if node is None:
return {'error': f'Element index {index} not found - page may have changed'}
await bs.event_bus.dispatch(SelectDropdownOptionEvent(node=node, text=value))
await actions.select_dropdown(node, value)
return {'selected': value, 'element': index}
elif action == 'upload':
index = params['index']
file_path = params['path']
p = Path(file_path)
if not p.exists():
return {'error': f'File not found: {file_path}'}
if not p.is_file():
return {'error': f'Not a file: {file_path}'}
if p.stat().st_size == 0:
return {'error': f'File is empty (0 bytes): {file_path}'}
node = await bs.get_element_by_index(index)
if node is None:
return {'error': f'Element index {index} not found - page may have changed'}
file_input_node = bs.find_file_input_near_element(node)
if file_input_node is None:
selector_map = await bs.get_selector_map()
file_input_indices = [idx for idx, el in selector_map.items() if bs.is_file_input(el)]
if file_input_indices:
hint = f' File input(s) found at index: {", ".join(map(str, file_input_indices))}'
else:
hint = ' No file input found on the page.'
return {'error': f'Element {index} is not a file input.{hint}'}
await actions.upload_file(file_input_node, file_path)
return {'uploaded': file_path, 'element': index}
elif action == 'eval':
js = params['js']
# Execute JavaScript via CDP
@@ -224,7 +293,7 @@ async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> A
query = params['query']
# This requires LLM integration
# For now, return a placeholder
return {'query': query, 'error': 'extract requires agent mode - use: browser-use run "extract ..."'}
return {'query': query, 'error': 'extract is not yet implemented'}
elif action == 'hover':
index = params['index']
@@ -473,7 +542,7 @@ async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> A
]
file_path = Path(params['file'])
file_path.write_text(json.dumps(cookie_list, indent=2))
file_path.write_text(json.dumps(cookie_list, indent=2, ensure_ascii=False), encoding='utf-8')
return {'exported': len(cookie_list), 'file': str(file_path)}
elif cookies_command == 'import':

View File

@@ -0,0 +1,694 @@
"""Cloud API command — generic REST passthrough to Browser-Use Cloud.
Stdlib only. No async, no SDK, no heavy imports.
Usage:
browser-use cloud login <api-key>
browser-use cloud logout
browser-use cloud v2 GET /browsers
browser-use cloud v2 POST /tasks '{"task":"...","url":"https://..."}'
browser-use cloud v2 poll <task-id>
browser-use cloud v2 --help
"""
import json
import os
import sys
import time
import typing
import urllib.error
import urllib.request
from pathlib import Path
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
_DEFAULT_BASE_URL = 'https://api.browser-use.com'
_AUTH_HEADER = 'X-Browser-Use-API-Key'
def _get_base() -> str:
    """API host URL (env override via BROWSER_USE_CLOUD_BASE_URL), without trailing slash."""
    configured = os.environ.get('BROWSER_USE_CLOUD_BASE_URL', _DEFAULT_BASE_URL)
    return configured.rstrip('/')
def _base_url(version: str) -> str:
"""Get versioned API URL: {base}/api/{version}"""
per_version = os.environ.get(f'BROWSER_USE_CLOUD_BASE_URL_{version.upper()}')
if per_version:
return per_version
return f'{_get_base()}/api/{version}'
def _spec_url(version: str) -> str:
per_version = os.environ.get(f'BROWSER_USE_OPENAPI_SPEC_URL_{version.upper()}')
if per_version:
return per_version
return f'{_get_base()}/api/{version}/openapi.json'
# ---------------------------------------------------------------------------
# API key persistence
# ---------------------------------------------------------------------------
def _get_config_path() -> Path:
    """Filesystem location of the CLI config file (delegates to skill_cli.utils)."""
    # Lazy import keeps this stdlib-only module light at import time.
    from browser_use.skill_cli.utils import get_config_path

    return get_config_path()
def _read_config() -> dict:
    """Load the persisted CLI config as a dict (delegates to skill_cli.config)."""
    from browser_use.skill_cli.config import read_config

    return read_config()
def _write_config(data: dict) -> None:
    """Persist `data` as the CLI config (delegates to skill_cli.config)."""
    from browser_use.skill_cli.config import write_config

    write_config(data)
def _get_api_key_or_none() -> str | None:
    """Return API key from CLI config file, or None if not found."""
    from browser_use.skill_cli.config import get_config_value

    value = get_config_value('api_key')
    return None if value is None else str(value)
def _get_api_key() -> str:
    """Return API key from config file. Exits with error if missing.

    On the error path, prints tailored guidance to stderr: if the env var is
    set it suggests copying it into the config; otherwise it points at signup
    or the dashboard key page. Never returns on failure (sys.exit(1)).
    """
    key = _get_api_key_or_none()
    if key:
        return key
    print('Error: No API key found.', file=sys.stderr)
    if os.environ.get('BROWSER_USE_API_KEY'):
        # Env var alone is deliberately not honored; ask the user to persist it.
        print(' Note: BROWSER_USE_API_KEY env var is set but not used by the CLI.', file=sys.stderr)
        print(' Run: browser-use config set api_key "$BROWSER_USE_API_KEY"', file=sys.stderr)
    else:
        print(
            'Already have an account? Get a key at: https://cloud.browser-use.com/settings?tab=api-keys&new=1&utm_source=oss&utm_medium=cli',
            file=sys.stderr,
        )
        print(' Then run: browser-use cloud login <key>', file=sys.stderr)
        print('No account? Run: browser-use cloud signup', file=sys.stderr)
        print(' This creates an agent account you can claim later with: browser-use cloud signup --claim', file=sys.stderr)
    sys.exit(1)
def _create_cloud_profile_inner(api_key: str) -> str:
    """Create a new cloud profile and save to config. Returns profile ID.

    Raises RuntimeError on failure — safe to call from daemon context.

    Args:
        api_key: Browser-Use API key used to authorize the POST /profiles call.
    """
    body = json.dumps({'name': 'Browser Use CLI'}).encode()
    status, resp = _http_request('POST', f'{_base_url("v2")}/profiles', body, api_key)
    if status >= 400:
        raise RuntimeError(f'Error creating cloud profile: HTTP {status}{resp}')
    try:
        data = json.loads(resp)
        new_id = data['id']
    except (json.JSONDecodeError, KeyError, TypeError):
        raise RuntimeError(f'Unexpected response from cloud API: {resp}')
    # Persist the new profile id so subsequent runs reuse it.
    config = _read_config()
    config['cloud_connect_profile_id'] = new_id
    _write_config(config)
    return new_id
def _create_cloud_profile() -> str:
    """Create a new cloud profile and save to config. Returns profile ID.

    CLI entry point -- prints the failure reason and exits non-zero on error.
    """
    api_key = _get_api_key()
    try:
        return _create_cloud_profile_inner(api_key)
    except RuntimeError as exc:
        print(str(exc), file=sys.stderr)
        sys.exit(1)
def _get_or_create_cloud_profile() -> str:
    """Cloud profile ID from config, creating one if missing (no validation HTTP call)."""
    existing = _read_config().get('cloud_connect_profile_id')
    return existing if existing else _create_cloud_profile()
def _get_cloud_connect_proxy() -> str | None:
    """Cloud connect proxy country code from config, or None when unset."""
    from browser_use.skill_cli.config import get_config_value

    raw = get_config_value('cloud_connect_proxy')
    return None if raw is None else str(raw)
def _get_cloud_connect_timeout() -> int | None:
    """Cloud connect timeout (minutes) from config, or None when unset."""
    from browser_use.skill_cli.config import get_config_value

    raw = get_config_value('cloud_connect_timeout')
    return None if raw is None else int(raw)
def _save_api_key(key: str) -> None:
    """Write `key` into the persisted CLI config under 'api_key'."""
    cfg = _read_config()
    cfg['api_key'] = key
    _write_config(cfg)
def _remove_api_key() -> bool:
    """Delete the stored API key. Returns True if one was removed.

    If the key was the only remaining entry, the config file itself is deleted.
    """
    cfg = _read_config()
    if 'api_key' not in cfg:
        return False
    cfg.pop('api_key')
    if cfg:
        _write_config(cfg)
    else:
        # Nothing left: remove the file rather than writing an empty config.
        _get_config_path().unlink(missing_ok=True)
    return True
# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------
def _http_request(method: str, url: str, body: bytes | None, api_key: str, timeout: float = 30.0) -> tuple[int, bytes]:
    """Fire an HTTP request. Returns (status_code, response_body).

    HTTP-level errors (4xx/5xx) are returned to the caller as a normal
    (status, body) pair rather than raised. Transport-level failures
    (DNS, connection refused) print to stderr and terminate the process —
    NOTE(review): callers cannot recover from those; confirm this is intended
    for daemon contexts.
    """
    headers = {_AUTH_HEADER: api_key}
    if body is not None:
        # All bodies sent by this module are JSON.
        headers['Content-Type'] = 'application/json'
    req = urllib.request.Request(url, data=body, headers=headers, method=method.upper())
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.status, resp.read()
    except urllib.error.HTTPError as e:
        # Server responded with an error status: surface it to the caller.
        return e.code, e.read()
    except urllib.error.URLError as e:
        # No response at all (network failure): report and exit.
        print(f'Error: {e.reason}', file=sys.stderr)
        sys.exit(1)
def _print_json(data: bytes, file: typing.TextIO | None = None) -> None:
"""Pretty-print JSON, raw fallback."""
out = file or sys.stdout
try:
parsed = json.loads(data)
print(json.dumps(parsed, indent=2), file=out)
except (json.JSONDecodeError, ValueError):
buf = out.buffer if hasattr(out, 'buffer') else sys.stdout.buffer
buf.write(data)
buf.write(b'\n')
buf.flush()
# ---------------------------------------------------------------------------
# OpenAPI help
# ---------------------------------------------------------------------------
def _fetch_spec(version: str) -> bytes | None:
    """Download the OpenAPI spec for `version`; None on any failure.

    Used only for help text and error hints, so it is best-effort: a short
    timeout and a broad catch keep the CLI working offline.
    """
    url = _spec_url(version)
    try:
        req = urllib.request.Request(url)
        with urllib.request.urlopen(req, timeout=5) as resp:
            return resp.read()
    except Exception:
        # Deliberate broad catch: spec fetch failure must never break the CLI.
        return None
def _example_value(prop: dict, schemas: dict) -> object:
"""Generate a placeholder value for an OpenAPI property."""
if '$ref' in prop:
ref_name = prop['$ref'].rsplit('/', 1)[-1]
if ref_name in schemas:
return _generate_body_example_dict(ref_name, schemas)
return {}
t = prop.get('type', 'string')
fmt = prop.get('format', '')
enum = prop.get('enum')
if enum:
return enum[0]
if t == 'string':
if fmt == 'uri' or fmt == 'url':
return 'https://example.com'
if fmt == 'date-time':
return '2025-01-01T00:00:00Z'
if 'email' in fmt:
return 'user@example.com'
return '...'
if t == 'integer':
return 0
if t == 'number':
return 0.0
if t == 'boolean':
return False
if t == 'array':
items = prop.get('items', {})
return [_example_value(items, schemas)]
if t == 'object':
props = prop.get('properties', {})
return {k: _example_value(v, schemas) for k, v in props.items()}
return '...'
def _generate_body_example_dict(ref_name: str, schemas: dict) -> dict:
    """Build a compact example dict from a named schema (required keys first, then sorted)."""
    schema = schemas.get(ref_name, {})
    props = schema.get('properties', {})
    required = set(schema.get('required', []))
    # Sort key: required fields first (False < True), alphabetical within each group.
    ordered = sorted(props, key=lambda name: (name not in required, name))
    return {name: _example_value(props[name], schemas) for name in ordered}
def _generate_body_example(ref: str, schemas: dict) -> str:
    """Compact JSON example string for a '$ref' like '#/components/schemas/X'."""
    schema_name = ref.rsplit('/', 1)[-1]
    example = _generate_body_example_dict(schema_name, schemas)
    return json.dumps(example, separators=(',', ':'))
def _find_body_ref(spec: dict, method: str, path: str) -> str | None:
"""Find the $ref for request body of a given method+path in spec."""
paths = spec.get('paths', {})
path_obj = paths.get(path, {})
method_obj = path_obj.get(method.lower(), {})
body = method_obj.get('requestBody', {})
content = body.get('content', {})
json_media = content.get('application/json', {})
schema = json_media.get('schema', {})
return schema.get('$ref')
def _match_path(spec_path: str, req_path: str) -> bool:
"""Match an OpenAPI template path against a concrete path.
E.g. /tasks/{task_id} matches /tasks/abc123
"""
spec_parts = spec_path.strip('/').split('/')
req_parts = req_path.strip('/').split('/')
if len(spec_parts) != len(req_parts):
return False
for sp, rp in zip(spec_parts, req_parts):
if sp.startswith('{') and sp.endswith('}'):
continue
if sp != rp:
return False
return True
def _find_body_example(spec: dict, method: str, path: str) -> str | None:
    """Body example for method+path, matching templated spec paths against the concrete one."""
    schemas = spec.get('components', {}).get('schemas', {})
    for candidate in spec.get('paths', {}):
        if not _match_path(candidate, path):
            continue
        ref = _find_body_ref(spec, method, candidate)
        if ref:
            return _generate_body_example(ref, schemas)
    return None
def _format_openapi_help(spec_data: bytes) -> str:
    """Parse OpenAPI spec and render grouped endpoints.

    Returns '' when `spec_data` is not valid JSON, so callers can fall back
    to the static help text. Endpoints are grouped by their first tag and
    annotated with summary, parameters (required marked '*'), and a compact
    request-body example when the spec declares one.
    """
    try:
        spec = json.loads(spec_data)
    except (json.JSONDecodeError, ValueError):
        return ''
    paths = spec.get('paths', {})
    schemas = spec.get('components', {}).get('schemas', {})
    info = spec.get('info', {})
    lines: list[str] = []
    title = info.get('title', 'API')
    version = info.get('version', '')
    lines.append(f'{title} {version}'.strip())
    lines.append('')
    # Group by tag
    groups: dict[str, list[str]] = {}
    for path, methods in sorted(paths.items()):
        for method, details in sorted(methods.items()):
            if method in ('parameters', 'summary', 'description'):
                # Path-level keys in the OpenAPI schema that are not HTTP methods.
                continue
            tags = details.get('tags', ['Other'])
            tag = tags[0] if tags else 'Other'
            summary = details.get('summary', '')
            # Build endpoint line
            parts = [f' {method.upper():6s} {path}']
            if summary:
                parts.append(f' # {summary}')
            # Parameters
            params = details.get('parameters', [])
            param_strs = []
            for p in params:
                name = p.get('name', '')
                required = p.get('required', False)
                marker = '*' if required else ''
                param_strs.append(f'{name}{marker}')
            if param_strs:
                parts.append(f' params: {", ".join(param_strs)}')
            # Body example
            body_ref = _find_body_ref(spec, method, path)
            if body_ref:
                example = _generate_body_example(body_ref, schemas)
                parts.append(f" body: '{example}'")
            groups.setdefault(tag, []).append('\n'.join(parts) if len(parts) > 1 else parts[0])
    for tag, endpoints in sorted(groups.items()):
        lines.append(f'[{tag}]')
        for ep in endpoints:
            lines.append(ep)
        lines.append('')
    return '\n'.join(lines)
def _static_help(version: str) -> str:
    """Fallback help when OpenAPI spec is unavailable.

    Note: doubled braces in the examples render as literal JSON braces.
    """
    return f"""Browser-Use Cloud API {version}
Usage:
 browser-use cloud {version} <METHOD> <path> [body]
 browser-use cloud {version} poll <task-id>
Examples:
 browser-use cloud {version} GET /browsers
 browser-use cloud {version} POST /tasks '{{"task":"Search for AI news","url":"https://google.com"}}'
 browser-use cloud {version} GET /tasks/<task-id>
 browser-use cloud {version} poll <task-id>
(Could not fetch OpenAPI spec for live endpoint listing)
"""
# ---------------------------------------------------------------------------
# Command handlers
# ---------------------------------------------------------------------------
def _cloud_login(argv: list[str]) -> int:
if not argv:
print('Usage: browser-use cloud login <api-key>', file=sys.stderr)
return 1
key = argv[0]
_save_api_key(key)
print('API key saved')
return 0
def _cloud_logout() -> int:
    """Remove any stored API key; always succeeds (exit code 0)."""
    message = 'API key removed' if _remove_api_key() else 'No API key to remove'
    print(message)
    return 0
def _cloud_rest(argv: list[str], version: str) -> int:
    """Generic REST passthrough.

    argv is [METHOD, path, optional-JSON-body]. Exit codes: 0 success,
    1 usage or server (5xx) error, 2 client (4xx) error. On 4xx, the
    OpenAPI spec is consulted (best-effort) to suggest an expected body.
    """
    if len(argv) < 2:
        print(f'Usage: browser-use cloud {version} <METHOD> <path> [body]', file=sys.stderr)
        return 1
    method = argv[0].upper()
    path = argv[1]
    body_str = argv[2] if len(argv) > 2 else None
    # Normalize path
    if not path.startswith('/'):
        path = '/' + path
    url = f'{_base_url(version)}{path}'
    api_key = _get_api_key()
    body = body_str.encode() if body_str else None
    status, resp_body = _http_request(method, url, body, api_key)
    if 400 <= status < 500:
        print(f'HTTP {status}', file=sys.stderr)
        _print_json(resp_body, file=sys.stderr)
        # Try to suggest correct body from spec
        spec_data = _fetch_spec(version)
        if spec_data:
            try:
                spec = json.loads(spec_data)
                example = _find_body_example(spec, method, path)
                if example:
                    print(f"\nExpected body: '{example}'", file=sys.stderr)
            except (json.JSONDecodeError, ValueError):
                # Suggestion is best-effort only; never fail the command over it.
                pass
        return 2
    if status >= 500:
        print(f'HTTP {status}', file=sys.stderr)
        _print_json(resp_body, file=sys.stderr)
        return 1
    # Success: pretty-print the response body to stdout.
    _print_json(resp_body)
    return 0
def _cloud_poll(argv: list[str], version: str) -> int:
    """Poll GET /tasks/<id> until done.

    Prints a single-line status/cost ticker to stderr (carriage-return
    overwrite) every 2 seconds. Exit codes: 0 finished, 1 usage/parse error,
    2 HTTP error or task failure.
    """
    if not argv:
        print(f'Usage: browser-use cloud {version} poll <task-id>', file=sys.stderr)
        return 1
    task_id = argv[0]
    url = f'{_base_url(version)}/tasks/{task_id}'
    api_key = _get_api_key()
    while True:
        status_code, resp_body = _http_request('GET', url, None, api_key)
        if status_code >= 400:
            print(f'\nHTTP {status_code}', file=sys.stderr)
            _print_json(resp_body, file=sys.stderr)
            return 2
        try:
            data = json.loads(resp_body)
        except (json.JSONDecodeError, ValueError):
            print('\nError: invalid JSON response', file=sys.stderr)
            return 1
        task_status = data.get('status', 'unknown')
        cost = data.get('cost', 0)
        # '\r' rewrites the ticker in place on stderr.
        print(f'\rstatus: {task_status} cost: ${cost:.4f}', end='', file=sys.stderr, flush=True)
        if task_status == 'finished':
            print('', file=sys.stderr)  # newline
            _print_json(resp_body)
            return 0
        if task_status == 'failed':
            print('', file=sys.stderr)
            _print_json(resp_body, file=sys.stderr)
            return 2
        # NOTE(review): only 'finished'/'failed' terminate the loop; any other
        # terminal status the API may return (e.g. 'stopped') would poll
        # forever — confirm against the task-status enum. Also assumes 'cost'
        # is numeric when present (a JSON null would break the format spec).
        time.sleep(2)
def _cloud_help(version: str) -> int:
    """Print endpoint help: live OpenAPI listing when fetchable, else static text."""
    spec_bytes = _fetch_spec(version)
    rendered = _format_openapi_help(spec_bytes) if spec_bytes else ''
    print(rendered if rendered else _static_help(version))
    return 0
def _cloud_versioned(argv: list[str], version: str) -> int:
    """Dispatch versioned subcommands: help (default), poll, or raw REST passthrough."""
    if not argv or argv[0] in ('--help', 'help', '-h'):
        return _cloud_help(version)
    if argv[0] == 'poll':
        return _cloud_poll(argv[1:], version)
    # Anything else is treated as METHOD path [body].
    return _cloud_rest(argv, version)
# ---------------------------------------------------------------------------
# Signup (agent self-registration)
# ---------------------------------------------------------------------------
def _signup_challenge() -> int:
	"""Request a signup challenge for agent self-registration.

	Refuses to run when an API key is already configured. On success prints
	the challenge ID/text and the verify command to run next.

	Returns:
		0 on success, 1 on any error.
	"""
	if _get_api_key_or_none():
		print('You already have an API key configured.', file=sys.stderr)
		print('Run `browser-use cloud signup --claim` to claim your account.', file=sys.stderr)
		return 1
	body = json.dumps({}).encode()
	status, resp = _http_request('POST', f'{_get_base()}/cloud/signup', body, api_key='')
	if status >= 400:
		print(f'Error: HTTP {status}', file=sys.stderr)
		_print_json(resp, file=sys.stderr)
		return 1
	try:
		data = json.loads(resp)
	except (json.JSONDecodeError, ValueError):
		print('Error: invalid response', file=sys.stderr)
		return 1
	challenge_id = data.get('challenge_id')
	challenge_text = data.get('challenge_text')
	# Bug fix: missing keys previously raised an unhandled KeyError; report a
	# malformed body the same way as invalid JSON instead of crashing.
	if challenge_id is None or challenge_text is None:
		print('Error: invalid response', file=sys.stderr)
		return 1
	print(f'Challenge ID: {challenge_id}')
	print(f'Challenge: {challenge_text}')
	print()
	print('Verify to create your agent account:')
	print(' browser-use cloud signup --verify <challenge-id> <answer>')
	return 0
def _signup_verify(challenge_id: str, answer: str) -> int:
	"""Verify a signup challenge and persist the returned API key.

	Args:
		challenge_id: ID returned by `signup` (the challenge request).
		answer: The solver's answer to the challenge text.

	Returns:
		0 on success (key saved), 1 on any error.
	"""
	if _get_api_key_or_none():
		print('You already have an API key configured.', file=sys.stderr)
		print('Run `browser-use cloud signup --claim` to claim your account.', file=sys.stderr)
		return 1
	body = json.dumps({'challenge_id': challenge_id, 'answer': answer}).encode()
	status, resp = _http_request('POST', f'{_get_base()}/cloud/signup/verify', body, api_key='')
	if status >= 400:
		print(f'Error: HTTP {status}', file=sys.stderr)
		_print_json(resp, file=sys.stderr)
		return 1
	try:
		data = json.loads(resp)
	except (json.JSONDecodeError, ValueError):
		print('Error: invalid response', file=sys.stderr)
		return 1
	api_key = data.get('api_key')
	# Bug fix: well-formed JSON without 'api_key' previously raised an
	# unhandled KeyError; treat it like any other malformed response.
	if not api_key:
		print('Error: invalid response', file=sys.stderr)
		return 1
	_save_api_key(api_key)
	print('API key saved')
	return 0
def _signup_claim() -> int:
	"""Generate a claim URL so a human can take ownership of this agent account.

	Returns:
		0 on success, 1 on any error.
	"""
	api_key = _get_api_key()
	status, resp = _http_request('POST', f'{_get_base()}/cloud/signup/claim', None, api_key)
	if status >= 400:
		print(f'Error: HTTP {status}', file=sys.stderr)
		_print_json(resp, file=sys.stderr)
		return 1
	try:
		data = json.loads(resp)
	except (json.JSONDecodeError, ValueError):
		print('Error: invalid response', file=sys.stderr)
		return 1
	claim_url = data.get('claim_url')
	# Bug fix: missing 'claim_url' previously raised an unhandled KeyError;
	# treat it like any other malformed response.
	if not claim_url:
		print('Error: invalid response', file=sys.stderr)
		return 1
	print(f'Claim URL: {claim_url}')
	print('Share this URL with a human to claim ownership of this account.')
	return 0
# ---------------------------------------------------------------------------
# Main dispatcher
# ---------------------------------------------------------------------------
def handle_cloud_command(argv: list[str]) -> int:
	"""Top-level dispatcher for `browser-use cloud <subcommand> ...`.

	Returns the subcommand's exit code; prints usage and returns 1 when argv
	is empty or the subcommand is unknown.
	"""
	if not argv:
		_print_cloud_usage()
		return 1
	subcmd, rest = argv[0], argv[1:]
	if subcmd == 'login':
		return _cloud_login(rest)
	if subcmd == 'logout':
		return _cloud_logout()
	if subcmd in ('v2', 'v3'):
		return _cloud_versioned(rest, subcmd)
	if subcmd == 'signup':
		if '--verify' in argv:
			idx = argv.index('--verify')
			# Need two positional values after --verify.
			if idx + 2 >= len(argv):
				print('Usage: browser-use cloud signup --verify <challenge-id> <answer>', file=sys.stderr)
				return 1
			return _signup_verify(argv[idx + 1], argv[idx + 2])
		if '--claim' in argv:
			return _signup_claim()
		return _signup_challenge()
	if subcmd == 'connect':
		# Normally intercepted by main.py before reaching here.
		print('Error: cloud connect must be run via the main CLI (browser-use cloud connect)', file=sys.stderr)
		return 1
	if subcmd in ('--help', 'help', '-h'):
		_print_cloud_usage()
		return 0
	print(f'Unknown cloud subcommand: {subcmd}', file=sys.stderr)
	_print_cloud_usage()
	return 1
def _print_cloud_usage() -> None:
print('Usage: browser-use cloud <command>')
print()
print('Commands:')
print(' connect Provision cloud browser and connect')
print(' signup Create an agent account (challenge-response)')
print(' signup --verify <id> <answer> Verify challenge and save API key')
print(' signup --claim Generate URL to claim your agent account')
print(' login <api-key> Save API key')
print(' logout Remove API key')
print(' v2 <METHOD> <path> [body] REST passthrough (API v2)')
print(' v3 <METHOD> <path> [body] REST passthrough (API v3)')
print(' v2 poll <task-id> Poll task until done')
print(' v2 --help Show API v2 endpoints')
print(' v3 --help Show API v3 endpoints')
print()
print('Examples:')
print(' browser-use cloud login sk-abc123...')
print(' browser-use cloud v2 GET /browsers')
print(' browser-use cloud v2 POST /tasks \'{"task":"...","url":"https://..."}\'')
print(' browser-use cloud v2 poll <task-id>')

View File

@@ -1,423 +0,0 @@
"""Cloud session SDK wrappers and CLI handlers.
This module provides:
- SDK wrapper functions for the Browser-Use Cloud Session API
- CLI command handlers for `browser-use session <command>`
"""
import argparse
import json
import logging
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any
from browser_use_sdk.types.session_item_view import SessionItemView
from browser_use_sdk.types.session_view import SessionView
from browser_use_sdk.types.share_view import ShareView
from browser_use.skill_cli.commands.utils import format_duration, get_sdk_client
logger = logging.getLogger(__name__)
# ============ SDK Wrappers ============
def create_session(**kwargs: Any) -> SessionItemView:
	"""Create a cloud browser session.

	Keyword args mirror the CLI flags (profile_id, proxy_country, keep_alive,
	persist_memory, start_url, screen_width, screen_height). None values are
	dropped and a few names are translated to their SDK equivalents.

	Returns:
		SessionItemView describing the new session.
	"""
	# Our CLI-facing names -> SDK parameter names.
	sdk_names = {
		'proxy_country': 'proxy_country_code',
		'screen_width': 'browser_screen_width',
		'screen_height': 'browser_screen_height',
	}
	params = {sdk_names.get(name, name): value for name, value in kwargs.items() if value is not None}
	return get_sdk_client().sessions.create_session(**params)
def list_sessions(limit: int = 10, status: str | None = None) -> list[SessionItemView]:
	"""List cloud browser sessions.

	Args:
		limit: Max sessions to return (capped at the API page-size limit of 100).
		status: Optional status filter (e.g. 'active').

	Returns:
		List of SessionItemView (empty when none).
	"""
	params: dict[str, Any] = {'page_size': min(limit, 100)}
	# Bug fix: only pass filter_by when a status is given. The SDK forwards
	# None as JSON null, which the API rejects (same convention as
	# _filter_none in the task module).
	if status is not None:
		params['filter_by'] = status
	response = get_sdk_client().sessions.list_sessions(**params)
	return list(response.items) if response.items else []
def get_session(session_id: str) -> SessionView:
	"""Fetch full details for one cloud session by ID."""
	client = get_sdk_client()
	return client.sessions.get_session(session_id)
def stop_session(session_id: str) -> SessionView:
	"""Ask the API to stop a cloud session; returns the updated view."""
	client = get_sdk_client()
	return client.sessions.update_session(session_id, action='stop')
def delete_session(session_id: str) -> None:
	"""Delete a cloud session (and all of its tasks) by ID."""
	client = get_sdk_client()
	client.sessions.delete_session(session_id)
def create_public_share(session_id: str) -> ShareView:
	"""Create a public share link for a session; returns the share descriptor."""
	client = get_sdk_client()
	return client.sessions.create_session_public_share(session_id)
def delete_public_share(session_id: str) -> None:
	"""Remove the public share link for a session."""
	client = get_sdk_client()
	client.sessions.delete_session_public_share(session_id)
def stop_sessions_parallel(session_ids: list[str]) -> tuple[list[str], list[dict[str, Any]]]:
	"""Stop many cloud sessions concurrently (up to 10 worker threads).

	Args:
		session_ids: IDs of the sessions to stop.

	Returns:
		(stopped_ids, errors) where errors is a list of {'id', 'error'} dicts.
	"""
	client = get_sdk_client()

	def _stop(sid: str) -> tuple[str, str | None]:
		# Workers return (id, error-or-None) so nothing raises out of a thread.
		try:
			client.sessions.update_session(sid, action='stop')
			return sid, None
		except Exception as exc:
			return sid, str(exc)

	stopped: list[str] = []
	errors: list[dict[str, Any]] = []
	with ThreadPoolExecutor(max_workers=10) as pool:
		pending = [pool.submit(_stop, sid) for sid in session_ids]
		for done in as_completed(pending):
			sid, err = done.result()
			if err is None:
				stopped.append(sid)
			else:
				errors.append({'id': sid, 'error': err})
	return stopped, errors
# ============ CLI Handlers ============
def handle_session_command(args: argparse.Namespace) -> int:
	"""Handle `browser-use session <command>`.

	Session commands always talk to the cloud API, so remote mode and an API
	key are both verified up front.

	Args:
		args: Parsed command-line arguments.

	Returns:
		Exit code: 0 on success, 1 on error or bad usage.
	"""
	from browser_use.skill_cli.api_key import APIKeyRequired, require_api_key
	from browser_use.skill_cli.install_config import is_mode_available

	if not is_mode_available('remote'):
		print(
			'Error: Session management requires remote mode.\n'
			'Remote mode is not installed. Reinstall to enable:\n'
			' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only\n'
			' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full',
			file=sys.stderr,
		)
		return 1
	try:
		require_api_key('Cloud sessions')
	except APIKeyRequired as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	# Dispatch table instead of an if/elif chain.
	handlers = {
		'list': _handle_list,
		'get': _handle_get,
		'stop': _handle_stop,
		'create': _handle_create,
		'share': _handle_share,
	}
	handler = handlers.get(args.session_command)
	if handler is None:
		print('Usage: browser-use session <command>')
		print('Commands: list, get <id>, stop <id>, create, share <id>')
		return 1
	return handler(args)
# ============ CLI Helper Functions ============
def _session_to_dict(session: Any) -> dict[str, Any]:
"""Convert SDK session object to dict for JSON output."""
return {
'id': session.id,
'status': session.status,
'liveUrl': session.live_url,
'startedAt': session.started_at.isoformat() if session.started_at else None,
'finishedAt': session.finished_at.isoformat() if session.finished_at else None,
'keepAlive': session.keep_alive,
'persistMemory': getattr(session, 'persist_memory', None),
'proxyCost': session.proxy_cost,
'publicShareUrl': getattr(session, 'public_share_url', None),
}
def _handle_list(args: argparse.Namespace) -> int:
	"""Handle 'session list': print sessions as JSON or a human summary."""
	status_filter = getattr(args, 'status', None)
	try:
		sessions = list_sessions(limit=args.limit, status=status_filter)
	except Exception as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	if getattr(args, 'json', False):
		print(json.dumps([_session_to_dict(sess) for sess in sessions]))
		return 0
	if not sessions:
		suffix = f' with status "{status_filter}"' if status_filter else ''
		print(f'No sessions found{suffix}')
		return 0
	if status_filter:
		header = f'{status_filter.capitalize()} sessions ({len(sessions)})'
	else:
		header = f'Sessions ({len(sessions)})'
	print(f'{header}:')
	for sess in sessions:
		sid = sess.id or 'unknown'
		state = sess.status or 'unknown'
		marker = {'active': '🟢', 'stopped': '⏹️'}.get(state, '')
		short_id = sid[:8] + '...' if len(sid) > 8 else sid
		line = f' {marker} {short_id} [{state}]'
		duration = format_duration(sess.started_at, sess.finished_at)
		if duration:
			line += f' {duration}'
		if sess.keep_alive:
			line += ' 🔄'
		# Live URL only makes sense while the session is running.
		if sess.live_url and state == 'active':
			line += f'\n live: {sess.live_url}'
		print(line)
	return 0
def _handle_get(args: argparse.Namespace) -> int:
	"""Handle 'session get <session_id>': show one session's details."""
	try:
		session = get_session(args.session_id)
	except Exception as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	if getattr(args, 'json', False):
		print(json.dumps(_session_to_dict(session)))
		return 0
	sid = session.id or args.session_id
	state = session.status or 'unknown'
	marker = {'active': '🟢', 'stopped': '⏹️'}.get(state, '')
	parts = [f'{marker} {sid[:8]}... [{state}]']
	duration = format_duration(session.started_at, session.finished_at)
	if duration:
		parts.append(duration)
	proxy_cost = session.proxy_cost
	if proxy_cost:
		# Show the cost with two decimals when it parses as a number.
		try:
			parts.append(f'${float(proxy_cost):.2f}')
		except (ValueError, TypeError):
			parts.append(f'${proxy_cost}')
	print(' '.join(parts))
	if session.keep_alive:
		print(' Keep Alive: Yes')
	if session.live_url:
		print(f' Live URL: {session.live_url}')
	public_share_url = getattr(session, 'public_share_url', None)
	if public_share_url:
		print(f' Public Share: {public_share_url}')
	return 0
def _handle_stop(args: argparse.Namespace) -> int:
	"""Handle 'session stop <session_id>' (or 'session stop --all')."""
	if getattr(args, 'all', False):
		return _handle_stop_all(args)
	try:
		stop_session(args.session_id)
	except Exception as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	if getattr(args, 'json', False):
		print(json.dumps({'stopped': args.session_id}))
		return 0
	print(f'Stopped session: {args.session_id}')
	return 0
def _handle_stop_all(args: argparse.Namespace) -> int:
	"""Handle 'session stop --all': stop every active session in parallel.

	Returns 0 when everything stopped cleanly, 1 if any stop failed.
	"""
	try:
		active = list_sessions(limit=100, status='active')
	except Exception as e:
		print(f'Error listing sessions: {e}', file=sys.stderr)
		return 1
	session_ids = [sess.id for sess in active if sess.id]
	if not session_ids:
		print('No active sessions to stop')
		return 0
	stopped, errors = stop_sessions_parallel(session_ids)
	if getattr(args, 'json', False):
		print(json.dumps({'stopped': stopped, 'errors': errors}))
	else:
		if stopped:
			print(f'Stopped {len(stopped)} session(s):')
			for sid in stopped:
				print(f'{sid[:8]}...')
		if errors:
			print(f'Failed to stop {len(errors)} session(s):')
			for err in errors:
				print(f'{err["id"][:8]}...: {err["error"]}')
	return 0 if not errors else 1
def _handle_create(args: argparse.Namespace) -> int:
	"""Handle 'session create': create a new cloud session."""
	width = height = None
	screen_size = getattr(args, 'screen_size', None)
	if screen_size:
		# Accept WxH (case-insensitive), e.g. 1920x1080.
		try:
			w_str, h_str = screen_size.lower().split('x')
			width, height = int(w_str), int(h_str)
		except ValueError:
			print('Error: Invalid screen size format. Use WxH (e.g., 1920x1080)', file=sys.stderr)
			return 1
	try:
		session = create_session(
			profile_id=getattr(args, 'profile', None),
			proxy_country=getattr(args, 'proxy_country', None),
			keep_alive=getattr(args, 'keep_alive', None),
			persist_memory=getattr(args, 'persist_memory', None),
			start_url=getattr(args, 'start_url', None),
			screen_width=width,
			screen_height=height,
		)
	except Exception as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	if getattr(args, 'json', False):
		print(json.dumps(_session_to_dict(session)))
		return 0
	print(f'Created session: {session.id}')
	if session.live_url:
		print(f' Live URL: {session.live_url}')
	return 0
def _handle_share(args: argparse.Namespace) -> int:
	"""Handle 'session share <session_id>': create or (--delete) remove a public share."""
	session_id = args.session_id
	if getattr(args, 'delete', False):
		try:
			delete_public_share(session_id)
		except Exception as e:
			print(f'Error: {e}', file=sys.stderr)
			return 1
		if getattr(args, 'json', False):
			print(json.dumps({'deleted': session_id}))
		else:
			print(f'Deleted public share for session: {session_id}')
		return 0
	try:
		share = create_public_share(session_id)
	except Exception as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	if getattr(args, 'json', False):
		payload = {
			'sessionId': session_id,
			'url': share.share_url,
			'shareToken': share.share_token,
			'viewCount': share.view_count,
		}
		print(json.dumps(payload))
		return 0
	print(f'Public share created for session: {session_id}')
	if share.share_url:
		print(f' URL: {share.share_url}')
	return 0

View File

@@ -1,413 +0,0 @@
"""Cloud task SDK wrappers and CLI handlers.
This module provides:
- SDK wrapper functions for the Browser-Use Cloud Task API
- CLI command handlers for `browser-use task <command>`
"""
import argparse
import json
import logging
import sys
from typing import Any
from browser_use_sdk.types.task_created_response import TaskCreatedResponse
from browser_use_sdk.types.task_item_view import TaskItemView
from browser_use_sdk.types.task_log_file_response import TaskLogFileResponse
from browser_use_sdk.types.task_view import TaskView
from browser_use.skill_cli.commands.utils import format_duration, get_sdk_client
logger = logging.getLogger(__name__)
def _filter_none(kwargs: dict[str, Any]) -> dict[str, Any]:
"""Filter out None values from kwargs (SDK passes them as null, API rejects)."""
return {k: v for k, v in kwargs.items() if v is not None}
# ============ SDK Wrappers ============
def create_task(task: str, **kwargs: Any) -> TaskCreatedResponse:
	"""Create a cloud task via the API.

	Args:
		task: Natural-language task description for the agent.
		**kwargs: Optional settings (llm, session_id, max_steps, flash_mode,
			thinking, vision, start_url, metadata, secrets, allowed_domains,
			skill_ids, structured_output, judge, judge_ground_truth). None
			values are dropped before the SDK call.

	Returns:
		TaskCreatedResponse with the new task ID and session ID.
	"""
	# `task` is set last so it always wins over anything in kwargs.
	payload = dict(_filter_none(kwargs), task=task)
	return get_sdk_client().tasks.create_task(**payload)
def get_task(task_id: str) -> TaskView:
	"""Fetch full task details (status, output, cost, steps) by ID."""
	client = get_sdk_client()
	return client.tasks.get_task(task_id)
def list_tasks(
	limit: int = 10,
	status: str | None = None,
	session_id: str | None = None,
) -> list[TaskItemView]:
	"""List recent cloud tasks, optionally filtered by status and/or session."""
	filters = _filter_none({'filter_by': status, 'session_id': session_id})
	response = get_sdk_client().tasks.list_tasks(page_size=limit, **filters)
	return list(response.items) if response.items else []
def stop_task(task_id: str) -> TaskView:
	"""Ask the API to stop a running task; returns the updated view."""
	client = get_sdk_client()
	return client.tasks.update_task(task_id, action='stop')
def get_task_logs(task_id: str) -> TaskLogFileResponse:
	"""Fetch the execution-log download descriptor for a task."""
	client = get_sdk_client()
	return client.tasks.get_task_logs(task_id)
async def poll_until_complete(
	task_id: str,
	stream: bool = False,
	poll_interval: float = 1.0,
) -> TaskView:
	"""Poll a task until it reaches a terminal state (finished/stopped/failed).

	Args:
		task_id: Task to watch.
		stream: When True, print each status transition.
		poll_interval: Seconds to sleep between polls.

	Returns:
		The final TaskView.
	"""
	import asyncio

	client = get_sdk_client()
	previous = None
	while True:
		# The SDK client is blocking; run it in a worker thread so the
		# event loop stays responsive.
		task = await asyncio.to_thread(client.tasks.get_task, task_id)
		if stream and task.status != previous:
			print(f'Status: {task.status}')
		previous = task.status
		if task.status in ('finished', 'stopped', 'failed'):
			return task
		await asyncio.sleep(poll_interval)
# ============ CLI Handlers ============
def handle_task_command(args: argparse.Namespace) -> int:
	"""Handle `browser-use task <command>`.

	Task commands always talk to the cloud API, so remote mode and an API
	key are both verified up front.

	Args:
		args: Parsed command-line arguments.

	Returns:
		Exit code: 0 on success, 1 on error or bad usage.
	"""
	from browser_use.skill_cli.api_key import APIKeyRequired, require_api_key
	from browser_use.skill_cli.install_config import is_mode_available

	if not is_mode_available('remote'):
		print(
			'Error: Task management requires remote mode.\n'
			'Remote mode is not installed. Reinstall to enable:\n'
			' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only\n'
			' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full',
			file=sys.stderr,
		)
		return 1
	try:
		require_api_key('Cloud tasks')
	except APIKeyRequired as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	# Dispatch table instead of an if/elif chain.
	handlers = {
		'list': _handle_list,
		'status': _handle_status,
		'stop': _handle_stop,
		'logs': _handle_logs,
	}
	handler = handlers.get(args.task_command)
	if handler is None:
		print('Usage: browser-use task <command>')
		print('Commands: list, status <task_id>, stop <task_id>, logs <task_id>')
		return 1
	return handler(args)
# ============ CLI Helper Functions ============
def _task_item_to_dict(task: Any) -> dict[str, Any]:
"""Convert SDK TaskItemView to dict for JSON output."""
return {
'id': task.id,
'status': task.status,
'task': task.task,
'sessionId': task.session_id,
}
def _task_to_dict(task: Any) -> dict[str, Any]:
	"""Serialize an SDK TaskView (including its steps) into the CLI's JSON shape."""
	started = task.started_at
	finished = task.finished_at
	return {
		'id': task.id,
		'status': task.status,
		'task': task.task,
		'output': task.output,
		'cost': task.cost,
		'sessionId': task.session_id,
		'startedAt': started.isoformat() if started else None,
		'finishedAt': finished.isoformat() if finished else None,
		'steps': [_step_to_dict(step) for step in (task.steps or [])],
	}
def _step_to_dict(step: Any) -> dict[str, Any]:
"""Convert SDK step to dict for JSON output."""
return {
'number': step.number,
'url': step.url,
'memory': step.memory,
'actions': step.actions,
}
def _handle_list(args: argparse.Namespace) -> int:
	"""Handle 'task list': print tasks as JSON or a one-line-per-task summary."""
	status_filter = getattr(args, 'status', None)
	session_filter = getattr(args, 'session', None)
	try:
		tasks = list_tasks(limit=args.limit, status=status_filter, session_id=session_filter)
	except Exception as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	if getattr(args, 'json', False):
		print(json.dumps([_task_item_to_dict(item) for item in tasks]))
		return 0
	if not tasks:
		status_msg = f' with status "{status_filter}"' if status_filter else ''
		session_msg = f' in session "{session_filter}"' if session_filter else ''
		print(f'No tasks found{status_msg}{session_msg}')
		return 0
	if status_filter:
		print(f'{status_filter.capitalize()} tasks ({len(tasks)}):')
	else:
		print(f'Tasks ({len(tasks)}):')
	# NOTE(review): 'finished'/'failed' map to empty markers in the original
	# source as rendered — confirm whether ✅/❌ emoji were lost upstream.
	markers = {'started': '🔄', 'running': '🔄', 'finished': '', 'stopped': '⏹️', 'failed': ''}
	for item in tasks:
		tid = item.id or 'unknown'
		state = item.status or 'unknown'
		desc = item.task or ''
		if len(desc) > 50:
			desc = desc[:47] + '...'
		print(f' {markers.get(state, "")} {tid[:8]}... [{state}] {desc}')
	return 0
def _handle_status(args: argparse.Namespace) -> int:
	"""Handle 'task status <task_id>': print task state, steps, and output.

	Human-output display modes:
	- default: header plus only the latest step
	- --compact: every step with its reasoning
	- --verbose: every step with URL, reasoning, and actions
	- --step N / --last N / --reverse select or reorder the step list
	"""
	try:
		# Full view, including steps.
		task = get_task(args.task_id)
	except Exception as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	if getattr(args, 'json', False):
		print(json.dumps(_task_to_dict(task)))
		return 0

	task_id = task.id or args.task_id
	task_status = task.status or 'unknown'
	steps = task.steps or []
	compact = getattr(args, 'compact', False)
	verbose = getattr(args, 'verbose', False)
	last_n = getattr(args, 'last', None)
	reverse = getattr(args, 'reverse', False)
	specific_step = getattr(args, 'step', None)
	# --compact and --verbose both mean "show every step".
	show_all_steps = compact or verbose

	marker = {
		'started': '🔄',
		'running': '🔄',
		'finished': '',
		'stopped': '⏹️',
		'failed': '',
	}.get(task_status, '')

	# Header: status marker/id, then cost and duration when known.
	header = [f'{marker} {task_id[:8]}... [{task_status}]']
	if task.cost is not None:
		header.append(f'${task.cost}')
	duration = format_duration(task.started_at, task.finished_at)
	if duration:
		header.append(duration)
	print(' '.join(header))

	if steps:
		total_steps = len(steps)
		if specific_step is not None:
			selected = [s for s in steps if s.number == specific_step]
			if not selected:
				print(f' Step {specific_step} not found (task has {total_steps} steps)')
			else:
				print(f' (showing step {specific_step} of {total_steps})')
				for step in selected:
					_print_step(step, verbose)
		elif not show_all_steps:
			# Default mode: only the most recent step plus a count of the rest.
			earlier = total_steps - 1
			if earlier > 0:
				print(f' ... {earlier} earlier steps')
			_print_step(steps[-1], verbose=False)
		else:
			selected = steps
			skipped = 0
			if last_n is not None and last_n < total_steps:
				skipped = total_steps - last_n
				selected = selected[-last_n:]
			if reverse:
				selected = list(reversed(selected))
			if skipped > 0:
				print(f' ... {skipped} earlier steps')
			for step in selected:
				_print_step(step, verbose)

	if task.output:
		print(f'\nOutput: {task.output}')
	return 0
def _print_step(step: Any, verbose: bool) -> None:
"""Print a single step in compact or verbose format."""
step_num = step.number if step.number is not None else '?'
memory = step.memory or ''
if verbose:
url = step.url or ''
actions = step.actions or []
# Truncate URL for display
short_url = url[:60] + '...' if len(url) > 60 else url
print(f' [{step_num}] {short_url}')
if memory:
# Truncate memory/reasoning for display
short_memory = memory[:100] + '...' if len(memory) > 100 else memory
print(f' Reasoning: {short_memory}')
if actions:
for action in actions[:2]: # Show max 2 actions per step
# Truncate action for display
short_action = action[:70] + '...' if len(action) > 70 else action
print(f' Action: {short_action}')
if len(actions) > 2:
print(f' ... and {len(actions) - 2} more actions')
else:
# Compact mode: just step number and reasoning
if memory:
# Truncate reasoning for compact display
short_memory = memory[:80] + '...' if len(memory) > 80 else memory
print(f' {step_num}. {short_memory}')
else:
print(f' {step_num}. (no reasoning)')
def _handle_stop(args: argparse.Namespace) -> int:
	"""Handle 'task stop <task_id>': request a stop and confirm it."""
	try:
		stop_task(args.task_id)
	except Exception as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	if getattr(args, 'json', False):
		print(json.dumps({'stopped': args.task_id}))
		return 0
	print(f'Stopped task: {args.task_id}')
	return 0
def _handle_logs(args: argparse.Namespace) -> int:
	"""Handle 'task logs <task_id>': print the log download URL."""
	try:
		result = get_task_logs(args.task_id)
	except Exception as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1
	url = result.download_url
	if getattr(args, 'json', False):
		print(json.dumps({'downloadUrl': url}))
		return 0
	if url:
		print(f'Download logs: {url}')
	else:
		print('No logs available for this task')
	return 0

View File

@@ -9,8 +9,6 @@ from typing import Any
logger = logging.getLogger(__name__)
COMMANDS = {'doctor'}
async def handle() -> dict[str, Any]:
"""Run health checks and return results."""
@@ -22,14 +20,14 @@ async def handle() -> dict[str, Any]:
# 2. Browser availability
checks['browser'] = _check_browser()
# 3. API key configuration
checks['api_key'] = _check_api_key_config()
# 3. Network connectivity (basic check)
checks['network'] = await _check_network()
# 4. Cloudflared availability
# 4. Optional: cloudflared (for browser-use tunnel)
checks['cloudflared'] = _check_cloudflared()
# 5. Network connectivity (basic check)
checks['network'] = await _check_network()
# 5. Optional: profile-use (for browser-use profile)
checks['profile_use'] = _check_profile_use()
# Determine overall status
all_ok = all(check.get('status') == 'ok' for check in checks.values())
@@ -64,8 +62,7 @@ def _check_browser() -> dict[str, Any]:
try:
from browser_use.browser.profile import BrowserProfile
# Just check if we can import and create a profile
profile = BrowserProfile(headless=True)
BrowserProfile(headless=True) # verify import + constructor work
return {
'status': 'ok',
'message': 'Browser profile available',
@@ -78,45 +75,6 @@ def _check_browser() -> dict[str, Any]:
}
def _check_api_key_config() -> dict[str, Any]:
"""Check if API key is configured."""
from browser_use.skill_cli.api_key import check_api_key
status = check_api_key()
if status['available']:
return {
'status': 'ok',
'message': f'API key configured ({status["source"]})',
}
else:
return {
'status': 'missing',
'message': 'No API key configured',
'note': 'Required for remote browser. Get one at https://browser-use.com/new-api-key',
}
def _check_cloudflared() -> dict[str, Any]:
"""Check if cloudflared is available."""
from browser_use.skill_cli.tunnel import get_tunnel_manager
tunnel_mgr = get_tunnel_manager()
status_info = tunnel_mgr.get_status()
if status_info['available']:
return {
'status': 'ok',
'message': f'Cloudflared available ({status_info["source"]})',
'note': status_info.get('note'),
}
else:
return {
'status': 'missing',
'message': 'Cloudflared not available',
'note': 'Will be auto-installed on first tunnel use',
}
async def _check_network() -> dict[str, Any]:
"""Check basic network connectivity."""
try:
@@ -140,6 +98,40 @@ async def _check_network() -> dict[str, Any]:
}
def _check_cloudflared() -> dict[str, Any]:
"""Check if cloudflared is available (needed for browser-use tunnel)."""
from browser_use.skill_cli.tunnel import get_tunnel_manager
status = get_tunnel_manager().get_status()
if status['available']:
return {
'status': 'ok',
'message': f'cloudflared installed ({status["path"]})',
}
return {
'status': 'missing',
'message': 'cloudflared not installed (needed for browser-use tunnel)',
'fix': 'Install cloudflared: https://developers.cloudflare.com/cloudflare-one/connections/connect-networks/downloads/',
}
def _check_profile_use() -> dict[str, Any]:
"""Check if profile-use binary is available (needed for browser-use profile)."""
from browser_use.skill_cli.profile_use import get_profile_use_binary
binary = get_profile_use_binary()
if binary:
return {
'status': 'ok',
'message': f'profile-use installed ({binary})',
}
return {
'status': 'missing',
'message': 'profile-use not installed (needed for browser-use profile)',
'fix': 'browser-use profile update',
}
def _summarize_checks(checks: dict[str, dict[str, Any]]) -> str:
"""Generate a summary of check results."""
ok = sum(1 for c in checks.values() if c.get('status') == 'ok')

View File

@@ -1,703 +0,0 @@
"""Profile management command handlers.
Unified profile management that works with both local Chrome profiles and cloud profiles.
The behavior is determined by the browser mode (-b real or -b remote).
"""
import argparse
import json
import logging
import sys
import tempfile
from pathlib import Path
from typing import Any, Literal
from browser_use.skill_cli.commands.utils import get_sdk_client
logger = logging.getLogger(__name__)
ProfileMode = Literal['real', 'remote']
class ProfileModeError(Exception):
	"""Raised when the profile mode cannot be determined or is unsupported."""
def get_profile_mode(args: argparse.Namespace) -> ProfileMode:
	"""Resolve which profile system to target: 'real' (local Chrome) or 'remote' (cloud).

	An explicit -b flag wins; otherwise the mode is inferred from the install
	config when exactly one of the two systems is available.

	Args:
		args: Parsed command-line arguments with a `browser` attribute.

	Raises:
		ProfileModeError: for chromium mode, when both modes are installed
			(ambiguous), or when neither is installed.
	"""
	from browser_use.skill_cli.install_config import is_mode_available

	requested = getattr(args, 'browser', None)
	if requested == 'real':
		return 'real'
	if requested == 'remote':
		return 'remote'
	if requested == 'chromium':
		raise ProfileModeError(
			'Profile commands are not supported in chromium mode.\n'
			'Use -b real for local Chrome profiles or -b remote for cloud profiles.'
		)
	# No explicit flag: infer from what is installed.
	has_local = is_mode_available('real')
	has_remote = is_mode_available('remote')
	if has_local and has_remote:
		raise ProfileModeError(
			'Both local and remote modes are available.\n'
			'Specify -b real for local Chrome profiles or -b remote for cloud profiles.'
		)
	if has_local:
		return 'real'
	if has_remote:
		return 'remote'
	raise ProfileModeError('No profile modes available. Run browser-use setup first.')
def handle_profile_command(args: argparse.Namespace) -> int:
    """Dispatch 'profile' subcommands to local or cloud implementations.

    Returns a process exit code (0 on success, 1 on error/usage).
    """
    command = args.profile_command

    # No subcommand given: show usage and fail.
    if command is None:
        _print_usage()
        return 1

    # 'sync' is special: it bridges local -> cloud and resolves its own modes.
    if command == 'sync':
        return _handle_sync(args)

    # Everything else needs a resolved profile mode first.
    try:
        mode = get_profile_mode(args)
    except ProfileModeError as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1

    dispatch = {
        'list': _handle_list,
        'get': _handle_get,
        'create': _handle_create,
        'update': _handle_update,
        'delete': _handle_delete,
        'cookies': _handle_cookies,
    }
    handler = dispatch.get(command)
    if handler is None:
        _print_usage()
        return 1
    return handler(args, mode)
def _print_usage() -> None:
    """Print profile command usage to stdout."""
    # NOTE(review): the column alignment inside these literals looks collapsed
    # (single spaces where a usage table would normally align) — confirm the
    # exact spacing against git history before relying on this rendering.
    print('Usage: browser-use [-b real|remote] profile <command>')
    print()
    print('Commands:')
    print(' list List profiles')
    print(' get <id> Get profile details')
    print(' create Create a new profile (remote only)')
    print(' update <id> Update profile')
    print(' delete <id> Delete profile')
    print(' cookies <id> Show cookies by domain (real only)')
    print(' sync Sync local profile to cloud')
    print()
    print('The -b flag determines which profile system to use:')
    print(' -b real Local Chrome profiles')
    print(' -b remote Cloud profiles (requires API key)')
# -----------------------------------------------------------------------------
# List profiles
# -----------------------------------------------------------------------------
def _handle_list(args: argparse.Namespace, mode: ProfileMode) -> int:
    """Route 'profile list' to the local or cloud implementation."""
    handler = _list_local_profiles if mode == 'real' else _list_cloud_profiles
    return handler(args)
def _list_local_profiles(args: argparse.Namespace) -> int:
    """Print local Chrome profiles, as JSON or a human-readable list."""
    profiles = list_local_chrome_profiles()

    if getattr(args, 'json', False):
        print(json.dumps({'profiles': profiles}))
        return 0

    if not profiles:
        print('No Chrome profiles found')
        return 0

    print('Local Chrome profiles:')
    for p in profiles:
        print(f' {p["id"]}: {p["name"]} ({p["email"]})')
    return 0
def _list_cloud_profiles(args: argparse.Namespace) -> int:
    """List cloud profiles via the SDK, honoring --page/--page-size."""
    from browser_use.skill_cli.api_key import APIKeyRequired

    page = getattr(args, 'page', 1)
    page_size = getattr(args, 'page_size', 20)
    try:
        response = get_sdk_client().profiles.list_profiles(page_number=page, page_size=page_size)
    except APIKeyRequired as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1
    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1

    if getattr(args, 'json', False):
        # Machine-readable output mirrors the API's paging fields.
        payload = {
            'items': [{'id': p.id, 'name': p.name} for p in response.items],
            'totalItems': response.total_items,
            'pageNumber': response.page_number,
            'pageSize': response.page_size,
        }
        print(json.dumps(payload))
        return 0

    if not response.items:
        print('No cloud profiles found')
        return 0

    print(f'Cloud profiles ({len(response.items)}/{response.total_items}):')
    for p in response.items:
        print(f' {p.id}: {p.name or "Unnamed"}')
    return 0
# -----------------------------------------------------------------------------
# Get profile
# -----------------------------------------------------------------------------
def _handle_get(args: argparse.Namespace, mode: ProfileMode) -> int:
    """Route 'profile get <id>' to the local or cloud implementation."""
    handler = _get_local_profile if mode == 'real' else _get_cloud_profile
    return handler(args)
def _get_local_profile(args: argparse.Namespace) -> int:
    """Show details for one local Chrome profile, matched by id or name."""
    wanted = args.id
    match = next(
        (p for p in list_local_chrome_profiles() if wanted in (p['id'], p['name'])),
        None,
    )
    if match is None:
        print(f'Error: Profile "{wanted}" not found', file=sys.stderr)
        return 1

    if getattr(args, 'json', False):
        print(json.dumps(match))
    else:
        print(f'Profile: {match["id"]}')
        print(f' Name: {match["name"]}')
        print(f' Email: {match["email"]}')
    return 0
def _get_cloud_profile(args: argparse.Namespace) -> int:
    """Fetch and display a single cloud profile by id."""
    from browser_use.skill_cli.api_key import APIKeyRequired

    try:
        profile = get_sdk_client().profiles.get_profile(args.id)
    except APIKeyRequired as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1
    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1

    if getattr(args, 'json', False):
        # Timestamps serialize to ISO-8601 or null.
        payload = {
            'id': profile.id,
            'name': profile.name,
            'createdAt': profile.created_at.isoformat() if profile.created_at else None,
            'updatedAt': profile.updated_at.isoformat() if profile.updated_at else None,
        }
        print(json.dumps(payload))
        return 0

    print(f'Profile: {profile.id}')
    if profile.name:
        print(f' Name: {profile.name}')
    if profile.created_at:
        print(f' Created: {profile.created_at.isoformat()}')
    if profile.updated_at:
        print(f' Updated: {profile.updated_at.isoformat()}')
    return 0
# -----------------------------------------------------------------------------
# Create profile
# -----------------------------------------------------------------------------
def _handle_create(args: argparse.Namespace, mode: ProfileMode) -> int:
    """Handle 'profile create'; local profile creation is unsupported."""
    if mode != 'real':
        return _create_cloud_profile(args)
    print('Error: Cannot create local Chrome profiles via CLI.', file=sys.stderr)
    print('Use Chrome browser to create new profiles.', file=sys.stderr)
    return 1
def _create_cloud_profile(args: argparse.Namespace) -> int:
    """Create a cloud profile, optionally named via --name."""
    from browser_use.skill_cli.api_key import APIKeyRequired

    try:
        client = get_sdk_client()
        # Only pass a name when one was supplied.
        kwargs = {'name': args.name} if args.name else {}
        profile = client.profiles.create_profile(**kwargs)
    except APIKeyRequired as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1
    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1

    if getattr(args, 'json', False):
        print(json.dumps({'id': profile.id, 'name': profile.name}))
    else:
        print(f'Created profile: {profile.id}')
    return 0
# -----------------------------------------------------------------------------
# Update profile
# -----------------------------------------------------------------------------
def _handle_update(args: argparse.Namespace, mode: ProfileMode) -> int:
    """Handle 'profile update <id>'; local profiles cannot be updated here."""
    if mode != 'real':
        return _update_cloud_profile(args)
    print('Error: Cannot update local Chrome profiles via CLI.', file=sys.stderr)
    print('Use Chrome browser settings to update profiles.', file=sys.stderr)
    return 1
def _update_cloud_profile(args: argparse.Namespace) -> int:
    """Update a cloud profile's mutable fields (currently --name)."""
    from browser_use.skill_cli.api_key import APIKeyRequired

    try:
        client = get_sdk_client()
        # Only forward a name when one was supplied.
        kwargs = {'name': args.name} if args.name else {}
        profile = client.profiles.update_profile(args.id, **kwargs)
    except APIKeyRequired as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1
    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1

    if getattr(args, 'json', False):
        print(json.dumps({'id': profile.id, 'name': profile.name}))
    else:
        print(f'Updated profile: {profile.id}')
    return 0
# -----------------------------------------------------------------------------
# Delete profile
# -----------------------------------------------------------------------------
def _handle_delete(args: argparse.Namespace, mode: ProfileMode) -> int:
    """Handle 'profile delete <id>'; local profiles cannot be deleted here."""
    if mode != 'real':
        return _delete_cloud_profile(args)
    print('Error: Cannot delete local Chrome profiles via CLI.', file=sys.stderr)
    print('Use Chrome browser settings to remove profiles.', file=sys.stderr)
    return 1
def _delete_cloud_profile(args: argparse.Namespace) -> int:
    """Delete a cloud profile by id."""
    from browser_use.skill_cli.api_key import APIKeyRequired

    try:
        get_sdk_client().profiles.delete_browser_profile(args.id)
    except APIKeyRequired as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1
    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1

    if getattr(args, 'json', False):
        print(json.dumps({'deleted': args.id}))
    else:
        print(f'Deleted profile: {args.id}')
    return 0
# -----------------------------------------------------------------------------
# Cookies (local only)
# -----------------------------------------------------------------------------
def _handle_cookies(args: argparse.Namespace, mode: ProfileMode) -> int:
    """Handle 'profile cookies <id>'; only local ('real') mode is supported."""
    if mode != 'remote':
        return _list_profile_cookies(args)
    print('Error: Cookie listing is only available for local Chrome profiles.', file=sys.stderr)
    print('Use -b real to access local profile cookies.', file=sys.stderr)
    return 1
def _list_profile_cookies(args: argparse.Namespace) -> int:
    """List cookies grouped by domain for a local Chrome profile.

    Launches a headless browser session against the selected local profile,
    reads the cookies over CDP, and prints per-domain counts: the full
    mapping in --json mode, the top 20 domains in text mode.

    Args:
        args: Parsed arguments; reads 'id' (profile id or display name)
            and the optional 'json' flag.

    Returns:
        Process exit code (0 on success, 1 on error).
    """
    import asyncio

    from browser_use.skill_cli.sessions import create_browser_session

    # Get local profiles
    local_profiles = list_local_chrome_profiles()
    if not local_profiles:
        print('Error: No local Chrome profiles found', file=sys.stderr)
        return 1

    # Find the matching profile (directory id or display name).
    profile_arg = args.id
    selected_profile = next(
        (p for p in local_profiles if profile_arg in (p['id'], p['name'])),
        None,
    )
    if not selected_profile:
        print(f'Error: Profile "{profile_arg}" not found', file=sys.stderr)
        print('Available profiles:')
        for p in local_profiles:
            print(f' {p["id"]}: {p["name"]}')
        return 1

    profile_id = selected_profile['id']
    print(f'Loading cookies from: {selected_profile["name"]} ({selected_profile["email"]})')

    async def get_cookies():
        # Headless session against the local profile; always torn down.
        local_session = await create_browser_session('real', headed=False, profile=profile_id)
        await local_session.start()
        try:
            return await local_session._cdp_get_cookies()
        finally:
            await local_session.kill()

    # asyncio.get_event_loop() is deprecated without a running loop (3.10+),
    # and an exception raised inside an except handler would escape the old
    # fallback uncaught. Prefer asyncio.run; if a loop is already running in
    # this thread, execute in a worker thread (same pattern as _handle_sync).
    try:
        cookies = asyncio.run(get_cookies())
    except RuntimeError:
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor() as executor:
            cookies = executor.submit(asyncio.run, get_cookies()).result()

    # Group cookies by domain (normalize a single leading dot).
    domains: dict[str, int] = {}
    for cookie in cookies:
        domain = cookie.get('domain', 'unknown').removeprefix('.')
        domains[domain] = domains.get(domain, 0) + 1

    # Sort by count descending
    sorted_domains = sorted(domains.items(), key=lambda item: item[1], reverse=True)

    if getattr(args, 'json', False):
        print(json.dumps({'domains': dict(sorted_domains), 'total_cookies': len(cookies)}))
        return 0

    print(f'\nCookies by domain ({len(cookies)} total):')
    for domain, count in sorted_domains[:20]:  # Show top 20
        print(f' {domain}: {count}')
    if len(sorted_domains) > 20:
        print(f' ... and {len(sorted_domains) - 20} more domains')
    print('\nTo sync cookies to cloud:')
    print(f' browser-use profile sync --from "{profile_id}" --domain <domain>')
    return 0
# -----------------------------------------------------------------------------
# Sync (local → cloud)
# -----------------------------------------------------------------------------
def _handle_sync(args: argparse.Namespace) -> int:
    """Handle 'profile sync' command - sync local profile to cloud.

    Flow: validate the API key, pick a local Chrome profile (--from),
    create a cloud profile, export cookies via CDP (optionally filtered
    by --domain) to a temp file, then import them into the cloud profile.
    On any failure after creation the cloud profile is deleted again.
    Returns a process exit code (0 on success, 1 on error).
    """
    import asyncio

    from browser_use.skill_cli.api_key import APIKeyRequired
    from browser_use.skill_cli.sessions import create_browser_session

    # Get SDK client (validates API key)
    try:
        client = get_sdk_client()
    except APIKeyRequired as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1
    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1
    # Get local profiles
    local_profiles = list_local_chrome_profiles()
    if not local_profiles:
        print('Error: No local Chrome profiles found', file=sys.stderr)
        return 1
    # Determine which profile to sync
    from_profile = args.from_profile
    if not from_profile:
        # Show available profiles and ask user to specify
        print('Available local profiles:')
        for p in local_profiles:
            print(f' {p["id"]}: {p["name"]} ({p["email"]})')
        print()
        print('Use --from to specify a profile:')
        print(' browser-use profile sync --from "Default"')
        print(' browser-use profile sync --from "Profile 1"')
        return 1
    # Find the matching profile (accepts directory id or display name)
    selected_profile = None
    for p in local_profiles:
        if p['id'] == from_profile or p['name'] == from_profile:
            selected_profile = p
            break
    if not selected_profile:
        print(f'Error: Profile "{from_profile}" not found', file=sys.stderr)
        print('Available profiles:')
        for p in local_profiles:
            print(f' {p["id"]}: {p["name"]}')
        return 1
    profile_id = selected_profile['id']
    profile_name = selected_profile['name']
    domain_filter = getattr(args, 'domain', None)
    # Generate cloud profile name (explicit --name wins)
    cloud_name = args.name if args.name else None
    if not cloud_name:
        if domain_filter:
            cloud_name = f'Chrome - {profile_name} ({domain_filter})'
        else:
            cloud_name = f'Chrome - {profile_name}'
    # Use stderr for progress when JSON output is requested
    json_output = getattr(args, 'json', False)
    out = sys.stderr if json_output else sys.stdout

    def log(msg: str) -> None:
        # Progress messages; kept off stdout in --json mode so stdout
        # stays machine-readable.
        print(msg, file=out)

    if domain_filter:
        log(f'Syncing: {profile_name} — {domain_filter} cookies only')
    else:
        log(f'Syncing: {profile_name} ({selected_profile["email"]})')
    # Step 1: Create cloud profile
    log(' Creating cloud profile...')
    try:
        cloud_profile = client.profiles.create_profile(name=cloud_name)
        cloud_profile_id = cloud_profile.id
    except Exception as e:
        print(f'Error creating cloud profile: {e}', file=sys.stderr)
        return 1
    log(f' ✓ Created: {cloud_profile_id}')

    def cleanup_cloud_profile() -> None:
        """Delete the cloud profile on failure."""
        try:
            client.profiles.delete_browser_profile(cloud_profile_id)
        except Exception:
            pass

    # Step 2: Export cookies from local profile
    async def sync_cookies():
        # Returns (count, temp-file-path) on success or (0, error-message).
        log(' Exporting cookies from local profile...')
        local_session = await create_browser_session('real', headed=False, profile=profile_id)
        await local_session.start()
        try:
            cookies = await local_session._cdp_get_cookies()
            if not cookies:
                return 0, 'No cookies found in local profile'
            # Filter by domain if specified
            if domain_filter:
                cookies = [c for c in cookies if domain_filter in c.get('domain', '')]
                if not cookies:
                    return 0, f'No cookies found for domain: {domain_filter}'
            log(f' ✓ Found {len(cookies)} cookies')
            # Save to temp file - convert Cookie objects to dicts for JSON serialization
            cookies_file = Path(tempfile.gettempdir()) / f'browser-use-sync-{cloud_profile_id}.json'
            # NOTE(review): dict(c) on an arbitrary object with __dict__ raises
            # TypeError unless c is mapping-like — vars(c) may be intended; verify.
            cookies_data = [dict(c) if hasattr(c, '__dict__') else c for c in cookies]
            cookies_file.write_text(json.dumps(cookies_data))
            return len(cookies), str(cookies_file)
        finally:
            await local_session.kill()

    # Run export: reuse a running loop via a worker thread, otherwise drive
    # the loop directly, falling back to asyncio.run.
    try:
        loop = asyncio.get_event_loop()
        if loop.is_running():
            import concurrent.futures

            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(asyncio.run, sync_cookies())
                cookie_count, cookies_file = future.result()
        else:
            cookie_count, cookies_file = loop.run_until_complete(sync_cookies())
    except RuntimeError:
        # NOTE(review): get_event_loop() is deprecated without a running loop
        # (3.10+); an exception inside this handler is not caught anywhere.
        cookie_count, cookies_file = asyncio.run(sync_cookies())
    if cookie_count == 0:
        log(f'{cookies_file}')  # cookies_file contains error message
        cleanup_cloud_profile()
        return 1

    # Step 3: Import cookies to cloud profile
    async def import_to_cloud():
        log(' Importing cookies to cloud profile...')
        remote_session = await create_browser_session('remote', headed=False, profile=cloud_profile_id)
        await remote_session.start()
        try:
            cookies = json.loads(Path(cookies_file).read_text())
            await remote_session._cdp_set_cookies(cookies)
            return True
        finally:
            await remote_session.kill()

    try:
        loop = asyncio.get_event_loop()
        if loop.is_running():
            import concurrent.futures

            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(asyncio.run, import_to_cloud())
                future.result()
        else:
            loop.run_until_complete(import_to_cloud())
    except RuntimeError:
        # NOTE(review): a failure inside this asyncio.run escapes — the
        # 'except Exception' below only guards the try body; confirm intent.
        asyncio.run(import_to_cloud())
    except Exception as e:
        log(f' ⚠ Failed to import cookies: {e}')
        cleanup_cloud_profile()
        return 1
    # Cleanup temp file
    try:
        Path(cookies_file).unlink()
    except Exception:
        pass
    log('✓ Profile synced successfully!')
    log(f' Cloud profile ID: {cloud_profile_id}')
    log('')
    log('To use this profile:')
    log(f' browser-use -b remote --profile {cloud_profile_id} open <url>')
    if json_output:
        print(
            json.dumps(
                {
                    'success': True,
                    'profile_id': cloud_profile_id,
                    'cookies_synced': cookie_count,
                }
            )
        )
    return 0
# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------
def list_local_chrome_profiles() -> list[dict[str, Any]]:
    """Enumerate local Chrome profiles from Chrome's 'Local State' file.

    Returns:
        A list of {'id', 'name', 'email'} dicts; empty when Chrome is not
        installed or the Local State file cannot be read/parsed.
    """
    import platform

    # Per-OS location of Chrome's Local State file, relative to home.
    locations = {
        'Darwin': 'Library/Application Support/Google/Chrome/Local State',
        'Windows': 'AppData/Local/Google/Chrome/User Data/Local State',
    }
    relative = locations.get(platform.system(), '.config/google-chrome/Local State')
    local_state = Path.home() / relative

    if not local_state.exists():
        return []
    try:
        info_cache = json.loads(local_state.read_text()).get('profile', {}).get('info_cache', {})
        return [
            {
                'id': profile_id,
                'name': info.get('name', profile_id),
                'email': info.get('user_name', ''),
            }
            for profile_id, info in info_cache.items()
        ]
    except Exception:
        # Unreadable or corrupt Local State: behave as if no profiles exist.
        return []

View File

@@ -49,7 +49,7 @@ async def handle(session: SessionInfo, params: dict[str, Any]) -> Any:
# Execute code in a thread pool so browser operations can schedule back to the event loop
loop = asyncio.get_running_loop()
result = await loop.run_in_executor(None, python_session.execute, code, browser_session, loop)
result = await loop.run_in_executor(None, python_session.execute, code, browser_session, loop, session.actions)
if result.success:
# Return raw text output for clean display

View File

@@ -1,38 +0,0 @@
"""Session management command handlers."""
import logging
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from browser_use.skill_cli.sessions import SessionRegistry
logger = logging.getLogger(__name__)
COMMANDS = {'sessions', 'close'}
async def handle(action: str, session_name: str, registry: 'SessionRegistry', params: dict[str, Any]) -> Any:
    """Handle a session management command ('sessions' or 'close').

    'sessions' reports the registry contents; 'close' tears down either
    every session (params['all']) or just this server's session, and in
    both cases flags the server for shutdown via '_shutdown'.

    Raises:
        ValueError: For any other action name.
    """
    if action == 'sessions':
        current = registry.list_sessions()
        return {
            'sessions': current,
            'count': len(current),
        }

    if action == 'close':
        if not params.get('all'):
            # Close only this server's session, then shut down.
            await registry.close_session(session_name)
            return {'closed': session_name, '_shutdown': True}
        # Close every session and signal the server to stop.
        current = registry.list_sessions()
        await registry.close_all()
        return {
            'closed': [entry['name'] for entry in current],
            'count': len(current),
            '_shutdown': True,
        }

    raise ValueError(f'Unknown session action: {action}')

View File

@@ -1,330 +1,253 @@
"""Setup command - configure browser-use for first-time use.
"""Setup command — post-install setup for browser-use CLI.
Handles dependency installation and configuration with mode-based
setup (local/remote/full) and optional automatic fixes.
Covers everything install.sh does after the package is installed:
home directory, config file, Chromium, profile-use, cloudflared.
Interactive by default, --yes for CI.
"""
import logging
from typing import Any, Literal
logger = logging.getLogger(__name__)
COMMANDS = {'setup'}
import os
import shutil
import subprocess
import sys
from pathlib import Path
async def handle(
action: str,
params: dict[str, Any],
) -> dict[str, Any]:
"""Handle setup command."""
assert action == 'setup'
mode: Literal['local', 'remote', 'full'] = params.get('mode', 'local')
yes: bool = params.get('yes', False)
api_key: str | None = params.get('api_key')
json_output: bool = params.get('json', False)
# Validate mode
if mode not in ('local', 'remote', 'full'):
return {'error': f'Invalid mode: {mode}. Must be local, remote, or full'}
# Run setup flow
def _prompt(message: str, yes: bool) -> bool:
"""Prompt user for confirmation. Returns True if --yes or user says yes."""
if yes:
return True
try:
checks = await run_checks(mode)
if not json_output:
_log_checks(checks)
# Plan actions
actions = plan_actions(checks, mode, yes, api_key)
if not json_output:
_log_actions(actions)
# Execute actions
await execute_actions(actions, mode, api_key, json_output)
# Validate
validation = await validate_setup(mode)
if not json_output:
_log_validation(validation)
return {
'status': 'success',
'mode': mode,
'checks': checks,
'validation': validation,
}
except Exception as e:
logger.exception(f'Setup failed: {e}')
error_msg = str(e)
if json_output:
return {'error': error_msg}
return {'error': error_msg}
reply = input(f' {message} [Y/n] ').strip().lower()
return reply in ('', 'y', 'yes')
except (EOFError, KeyboardInterrupt):
print()
return False
async def run_checks(mode: Literal['local', 'remote', 'full']) -> dict[str, Any]:
"""Run pre-flight checks without making changes.
def handle(yes: bool = False) -> dict:
"""Run interactive setup."""
from browser_use.skill_cli.utils import get_home_dir
Returns:
Dict mapping check names to their status
"""
checks: dict[str, Any] = {}
home_dir = get_home_dir()
results: dict = {}
step = 0
total = 6
# Package check
try:
import browser_use
print('\nBrowser-Use Setup')
print('━━━━━━━━━━━━━━━━━\n')
checks['browser_use_package'] = {
'status': 'ok',
'message': f'browser-use {browser_use.__version__}'
if hasattr(browser_use, '__version__')
else 'browser-use installed',
}
except ImportError:
checks['browser_use_package'] = {
'status': 'error',
'message': 'browser-use not installed',
}
# Step 1: Home directory
step += 1
print(f'Step {step}/{total}: Home directory')
if home_dir.exists():
print(f'{home_dir} exists')
else:
home_dir.mkdir(parents=True, exist_ok=True)
print(f'{home_dir} created')
results['home_dir'] = 'ok'
# Browser check (local and full modes)
if mode in ('local', 'full'):
checks['browser'] = await _check_browser()
# API key check (remote and full modes)
if mode in ('remote', 'full'):
from browser_use.skill_cli.api_key import check_api_key
api_status = check_api_key()
if api_status['available']:
checks['api_key'] = {
'status': 'ok',
'message': f'Configured via {api_status["source"]} ({api_status["key_prefix"]}...)',
}
else:
checks['api_key'] = {
'status': 'missing',
'message': 'Not configured',
}
# Cloudflared check (remote and full modes)
if mode in ('remote', 'full'):
from browser_use.skill_cli.tunnel import get_tunnel_manager
tunnel_mgr = get_tunnel_manager()
status = tunnel_mgr.get_status()
checks['cloudflared'] = {
'status': 'ok' if status['available'] else 'missing',
'message': status['note'],
}
return checks
async def _check_browser() -> dict[str, Any]:
"""Check if browser is available."""
try:
from browser_use.browser.profile import BrowserProfile
profile = BrowserProfile(headless=True)
# Just check if we can create a session without actually launching
return {
'status': 'ok',
'message': 'Browser available',
}
except Exception as e:
return {
'status': 'error',
'message': f'Browser check failed: {e}',
}
def plan_actions(
checks: dict[str, Any],
mode: Literal['local', 'remote', 'full'],
yes: bool,
api_key: str | None,
) -> list[dict[str, Any]]:
"""Plan which actions to take based on checks.
Returns:
List of actions to execute
"""
actions: list[dict[str, Any]] = []
# Browser installation (local/full)
if mode in ('local', 'full'):
browser_check = checks.get('browser', {})
if browser_check.get('status') != 'ok':
actions.append(
{
'type': 'install_browser',
'description': 'Install browser (Chromium)',
'required': True,
}
)
# API key configuration (remote/full)
if mode in ('remote', 'full'):
api_check = checks.get('api_key', {})
if api_check.get('status') != 'ok':
if api_key:
actions.append(
{
'type': 'configure_api_key',
'description': 'Configure API key',
'required': True,
'api_key': api_key,
}
)
elif not yes:
actions.append(
{
'type': 'prompt_api_key',
'description': 'Prompt for API key',
'required': False,
}
)
# Cloudflared (remote/full)
if mode in ('remote', 'full'):
cloudflared_check = checks.get('cloudflared', {})
if cloudflared_check.get('status') != 'ok':
actions.append(
{
'type': 'install_cloudflared',
'description': 'Install cloudflared (for tunneling)',
'required': True,
}
)
return actions
async def execute_actions(
actions: list[dict[str, Any]],
mode: Literal['local', 'remote', 'full'],
api_key: str | None,
json_output: bool,
) -> None:
"""Execute planned actions.
Args:
actions: List of actions to execute
mode: Setup mode (local/remote/full)
api_key: Optional API key to configure
json_output: Whether to output JSON
"""
for action in actions:
action_type = action['type']
if action_type == 'install_browser':
if not json_output:
print('📦 Installing Chromium browser (~300MB)...')
# Browser will be installed on first use by Playwright
if not json_output:
print('✓ Browser available (will be installed on first use)')
elif action_type == 'configure_api_key':
if not json_output:
print('🔑 Configuring API key...')
from browser_use.skill_cli.api_key import save_api_key
if api_key:
save_api_key(api_key)
if not json_output:
print('✓ API key configured')
elif action_type == 'prompt_api_key':
if not json_output:
print('🔑 API key not configured')
print(' Set via: export BROWSER_USE_API_KEY=your_key')
print(' Or: browser-use setup --api-key <key>')
elif action_type == 'install_cloudflared':
if not json_output:
print('⚠ cloudflared not installed')
print(' Install via:')
print(' macOS: brew install cloudflared')
print(
' Linux: curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -o ~/.local/bin/cloudflared && chmod +x ~/.local/bin/cloudflared'
)
print(' Windows: winget install Cloudflare.cloudflared')
print()
print(' Or re-run install.sh which installs cloudflared automatically.')
async def validate_setup(
mode: Literal['local', 'remote', 'full'],
) -> dict[str, Any]:
"""Validate that setup worked.
Returns:
Dict with validation results
"""
results: dict[str, Any] = {}
# Check imports
try:
import browser_use # noqa: F401
results['browser_use_import'] = 'ok'
except ImportError:
results['browser_use_import'] = 'failed'
# Validate mode requirements
if mode in ('local', 'full'):
# Step 2: Config file
step += 1
config_path = home_dir / 'config.json'
print(f'\nStep {step}/{total}: Config file')
if config_path.exists():
print(f'{config_path} exists')
else:
config_path.write_text('{}\n')
try:
from browser_use.browser.profile import BrowserProfile
config_path.chmod(0o600)
except OSError:
pass
print(f'{config_path} created')
results['config'] = 'ok'
browser_profile = BrowserProfile(headless=True)
results['browser_available'] = 'ok'
except Exception as e:
results['browser_available'] = f'failed: {e}'
# Step 3: Chromium browser
step += 1
print(f'\nStep {step}/{total}: Chromium browser')
chromium_installed = _check_chromium()
if chromium_installed:
print(' ✓ Chromium already installed')
results['chromium'] = 'ok'
else:
if _prompt('Chromium is not installed (~300MB download). Install now?', yes):
print(' Installing Chromium...')
if _install_chromium():
print(' ✓ Chromium installed')
results['chromium'] = 'ok'
else:
print(' ✗ Chromium installation failed')
results['chromium'] = 'failed'
else:
print(' ○ Skipped')
results['chromium'] = 'skipped'
if mode in ('remote', 'full'):
from browser_use.skill_cli.api_key import check_api_key
from browser_use.skill_cli.tunnel import get_tunnel_manager
# Step 4: Profile-use binary
step += 1
print(f'\nStep {step}/{total}: Profile-use binary')
from browser_use.skill_cli.profile_use import get_profile_use_binary
api_check = check_api_key()
results['api_key_available'] = api_check['available']
if get_profile_use_binary():
print(' ✓ profile-use already installed')
results['profile_use'] = 'ok'
else:
if _prompt('profile-use is not installed (needed for browser-use profile). Install now?', yes):
print(' Downloading profile-use...')
if _install_profile_use():
print(' ✓ profile-use installed')
results['profile_use'] = 'ok'
else:
print(' ✗ profile-use installation failed')
results['profile_use'] = 'failed'
else:
print(' ○ Skipped')
results['profile_use'] = 'skipped'
tunnel_mgr = get_tunnel_manager()
results['cloudflared_available'] = tunnel_mgr.is_available()
# Step 5: Cloudflared
step += 1
print(f'\nStep {step}/{total}: Cloudflare tunnel (cloudflared)')
if shutil.which('cloudflared'):
print(' ✓ cloudflared already installed')
results['cloudflared'] = 'ok'
else:
if _prompt('cloudflared is not installed (needed for browser-use tunnel). Install now?', yes):
print(' Installing cloudflared...')
if _install_cloudflared():
print(' ✓ cloudflared installed')
results['cloudflared'] = 'ok'
else:
print(' ✗ cloudflared installation failed')
results['cloudflared'] = 'failed'
else:
print(' ○ Skipped')
results['cloudflared'] = 'skipped'
# Step 6: Validation
step += 1
print(f'\nStep {step}/{total}: Validation')
from browser_use.skill_cli.config import CLI_DOCS_URL, get_config_display
# Quick checks
checks = {
'package': _check_package(),
'browser': 'ok' if _check_chromium() else 'missing',
'profile_use': 'ok' if get_profile_use_binary() else 'missing',
'cloudflared': 'ok' if shutil.which('cloudflared') else 'missing',
}
for name, status in checks.items():
icon = '' if status == 'ok' else ''
print(f' {icon} {name}: {status}')
# Config display
entries = get_config_display()
print(f'\nConfig ({config_path}):')
for entry in entries:
if entry['is_set']:
icon = ''
val = 'set' if entry['sensitive'] else entry['value']
else:
icon = ''
val = entry['value'] if entry['value'] else 'not set'
print(f' {icon} {entry["key"]}: {val}')
print(f' Docs: {CLI_DOCS_URL}')
print('\n━━━━━━━━━━━━━━━━━')
print('Setup complete! Next: browser-use open https://example.com\n')
results['status'] = 'success'
return results
def _log_checks(checks: dict[str, Any]) -> None:
"""Log check results."""
print('\n✓ Running checks...\n')
for name, check in checks.items():
status = check.get('status', 'unknown')
message = check.get('message', '')
icon = '' if status == 'ok' else '' if status == 'missing' else ''
print(f' {icon} {name.replace("_", " ")}: {message}')
print()
def _check_package() -> str:
"""Check if browser-use package is importable."""
try:
import browser_use
version = getattr(browser_use, '__version__', 'unknown')
return f'browser-use {version}'
except ImportError:
return 'not installed'
def _log_actions(actions: list[dict[str, Any]]) -> None:
"""Log planned actions."""
if not actions:
print('✓ No additional setup needed!\n')
return
def _check_chromium() -> bool:
"""Check if playwright chromium is installed."""
try:
from browser_use.browser.profile import BrowserProfile
print('\n📋 Setup actions:\n')
for i, action in enumerate(actions, 1):
required = '(required)' if action.get('required') else '(optional)'
print(f' {i}. {action["description"]} {required}')
print()
BrowserProfile(headless=True)
return True
except Exception:
return False
def _log_validation(validation: dict[str, Any]) -> None:
"""Log validation results."""
print('\n✓ Validation:\n')
for name, result in validation.items():
icon = '' if result == 'ok' else ''
print(f' {icon} {name.replace("_", " ")}: {result}')
print()
def _install_chromium() -> bool:
    """Install Chromium via 'python -m playwright install'; True on success."""
    command = [sys.executable, '-m', 'playwright', 'install', 'chromium']
    if sys.platform == 'linux':
        # Linux also needs the browser's system package dependencies.
        command.append('--with-deps')
    try:
        # Cap the download/install at 5 minutes.
        return subprocess.run(command, timeout=300).returncode == 0
    except Exception:
        return False
def _install_profile_use() -> bool:
"""Download profile-use binary."""
try:
from browser_use.skill_cli.profile_use import download_profile_use
download_profile_use()
return True
except Exception:
return False
def _install_cloudflared() -> bool:
    """Install cloudflared for the current platform; True on success.

    macOS uses Homebrew and Windows uses winget. Linux downloads the
    release binary, verifies its published SHA256 checksum, and installs
    it to /usr/local/bin (or ~/.local/bin when that is not writable).
    """
    try:
        if sys.platform == 'darwin':
            return subprocess.run(['brew', 'install', 'cloudflared'], timeout=120).returncode == 0
        if sys.platform == 'win32':
            return subprocess.run(['winget', 'install', 'Cloudflare.cloudflared'], timeout=120).returncode == 0

        # Linux: download binary + verify SHA256 checksum before installing
        import hashlib
        import platform
        import shutil
        import tempfile
        import urllib.request

        arch = 'arm64' if platform.machine() in ('aarch64', 'arm64') else 'amd64'
        base_url = f'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-{arch}'

        # Download to a temp file so we can verify before installing
        with tempfile.NamedTemporaryFile(delete=False, suffix='.tmp') as tmp:
            tmp_path = Path(tmp.name)
        try:
            urllib.request.urlretrieve(base_url, tmp_path)
            # Fetch checksum file published alongside the binary
            with urllib.request.urlopen(f'{base_url}.sha256sum') as resp:
                expected_sha256 = resp.read().decode().split()[0]
            # Verify integrity before touching the install destination
            actual_sha256 = hashlib.sha256(tmp_path.read_bytes()).hexdigest()
            if actual_sha256 != expected_sha256:
                raise RuntimeError(
                    f'cloudflared checksum mismatch — expected {expected_sha256}, got {actual_sha256}. '
                    'The download may be corrupt or tampered with.'
                )
            dest = Path('/usr/local/bin/cloudflared')
            if not os.access('/usr/local/bin', os.W_OK):
                # No root-owned bin access: fall back to the user's bin dir.
                dest = Path.home() / '.local' / 'bin' / 'cloudflared'
            dest.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(tmp_path), dest)
            dest.chmod(0o755)
        finally:
            tmp_path.unlink(missing_ok=True)
        return True
    except Exception:
        return False

View File

@@ -1,46 +0,0 @@
"""Shared utilities for CLI command handlers."""
from datetime import datetime, timezone
from browser_use_sdk import BrowserUse
_client: BrowserUse | None = None
def get_sdk_client() -> BrowserUse:
    """Get authenticated SDK client (singleton).

    Lazily constructs the module-level client on first call, prompting for
    an API key via require_api_key; subsequent calls reuse the cached client.
    """
    global _client
    if _client is not None:
        return _client
    from browser_use.skill_cli.api_key import require_api_key

    _client = BrowserUse(api_key=require_api_key('Cloud API'))
    return _client
def format_duration(started_at: datetime | None, finished_at: datetime | None) -> str:
"""Format duration between two timestamps, or elapsed time if still running."""
if not started_at:
return ''
try:
if finished_at:
end = finished_at
else:
end = datetime.now(timezone.utc)
delta = end - started_at
total_seconds = int(delta.total_seconds())
if total_seconds < 60:
return f'{total_seconds}s'
elif total_seconds < 3600:
minutes = total_seconds // 60
seconds = total_seconds % 60
return f'{minutes}m {seconds}s'
else:
hours = total_seconds // 3600
minutes = (total_seconds % 3600) // 60
return f'{hours}h {minutes}m'
except Exception:
return ''

View File

@@ -0,0 +1,171 @@
"""CLI configuration schema and helpers.
Single source of truth for all CLI config keys. Doctor, setup, and
getter functions all reference CONFIG_KEYS.
"""
import json
from pathlib import Path
# Public documentation page surfaced in CLI help/error output.
CLI_DOCS_URL = 'https://docs.browser-use.com/open-source/browser-use-cli'

# Single source of truth for every CLI config key.
# Per-key schema fields:
#   type        — Python type used for coercion in set_config_value()
#   sensitive   — value should be redacted in display output (API keys)
#   default     — value returned by get_config_value() when the key is unset
#   description — human-readable label shown by doctor/setup
CONFIG_KEYS: dict = {
    'api_key': {
        'type': str,
        'sensitive': True,
        'description': 'Browser Use Cloud API key',
    },
    'cloud_connect_profile_id': {
        'type': str,
        'description': 'Cloud browser profile ID (auto-created)',
    },
    'cloud_connect_proxy': {
        'type': str,
        'default': 'us',
        'description': 'Cloud proxy country code',
    },
    'cloud_connect_timeout': {
        'type': int,
        'description': 'Cloud browser timeout (minutes)',
    },
    'cloud_connect_recording': {
        'type': bool,
        'default': True,
        'description': 'Enable session recording in cloud browser',
    },
}
def _get_config_path() -> Path:
    """Resolve the CLI config file path.

    Imported lazily from skill_cli.utils so module import stays cheap and
    tests can monkeypatch the resolver.
    """
    from browser_use.skill_cli.utils import get_config_path

    return get_config_path()
def read_config() -> dict:
    """Read CLI config file. Returns empty dict if missing or corrupt."""
    path = _get_config_path()
    if not path.exists():
        return {}
    try:
        return json.loads(path.read_text())
    except (json.JSONDecodeError, OSError):
        # Unreadable or malformed file is treated the same as absent.
        return {}
def write_config(data: dict) -> None:
    """Write CLI config file with 0o600 permissions, atomically via tmp+rename.

    Writing directly to config.json risks truncation if the process is killed
    mid-write, which read_config() would silently treat as {} (empty config),
    wiping the API key and all other settings.

    Args:
        data: Full config mapping to persist (replaces the file's contents).
    """
    import os
    import tempfile

    path = _get_config_path()
    path.parent.mkdir(parents=True, exist_ok=True)
    content = json.dumps(data, indent=2) + '\n'
    # Write to a temp file in the same directory so os.replace() is atomic
    # (same filesystem guaranteed — cross-device rename raises OSError).
    # mkstemp creates the file with 0o600 already; chmod below is belt-and-braces.
    fd, tmp_str = tempfile.mkstemp(dir=path.parent, prefix='.config_tmp_')
    tmp_path = Path(tmp_str)
    try:
        with os.fdopen(fd, 'w') as f:
            f.write(content)
            f.flush()
            # fsync before rename so the data is durable when the name flips.
            os.fsync(f.fileno())
        try:
            tmp_path.chmod(0o600)
        except OSError:
            pass
        os.replace(tmp_path, path)
    except Exception:
        # Don't leave a stray temp file behind on failure.
        tmp_path.unlink(missing_ok=True)
        raise
def get_config_value(key: str) -> str | int | None:
    """Read a config value, applying schema defaults.

    Priority: config file → schema default → None. Unknown keys yield None.
    """
    schema = CONFIG_KEYS.get(key)
    if schema is None:
        return None
    stored = read_config().get(key)
    return stored if stored is not None else schema.get('default')
def set_config_value(key: str, value: str) -> None:
    """Set a config value. Validates key and coerces type.

    Args:
        key: Config key; must be declared in CONFIG_KEYS.
        value: Raw string from the CLI; coerced to the key's schema type.

    Raises:
        ValueError: if the key is unknown or the value cannot be coerced.

    BUG FIX: the specific bool error message ("expected true/false, ...") was
    previously raised inside the same try whose `except (ValueError, TypeError)`
    caught it and re-raised the generic "expected bool" message, so the crafted
    message never reached the user. Validation is now structured so each branch
    raises its own message directly.
    """
    schema = CONFIG_KEYS.get(key)
    if schema is None:
        raise ValueError(f'Unknown config key: {key}. Valid keys: {", ".join(CONFIG_KEYS)}')
    expected_type = schema.get('type', str)
    coerced: str | int | bool
    if expected_type is bool:
        lowered = value.lower()
        if lowered in ('true', '1', 'yes'):
            coerced = True
        elif lowered in ('false', '0', 'no'):
            coerced = False
        else:
            raise ValueError(f'Invalid value for {key}: expected true/false, got {value!r}')
    elif expected_type is int:
        try:
            coerced = int(value)
        except (ValueError, TypeError):
            raise ValueError(f'Invalid value for {key}: expected {expected_type.__name__}, got {value!r}') from None
    else:
        coerced = str(value)
    config = read_config()
    config[key] = coerced
    write_config(config)
def unset_config_value(key: str) -> None:
    """Remove a config key from the file.

    Raises ValueError for keys not declared in CONFIG_KEYS. The file is
    rewritten even when the key was not present (same as before).
    """
    if CONFIG_KEYS.get(key) is None:
        raise ValueError(f'Unknown config key: {key}. Valid keys: {", ".join(CONFIG_KEYS)}')
    config = read_config()
    config.pop(key, None)
    write_config(config)
def get_config_display() -> list[dict]:
    """Return config state for display (doctor, setup).

    Each entry: {key, value, is_set, sensitive, description}. Unset keys
    with a schema default show the default annotated with '(default)'.
    """
    config = read_config()
    rows: list[dict] = []
    for key, schema in CONFIG_KEYS.items():
        raw = config.get(key)
        present = raw is not None
        shown = raw
        if not present and 'default' in schema:
            shown = f'{schema["default"]} (default)'
        rows.append(
            {
                'key': key,
                'value': shown,
                'is_set': present,
                'sensitive': schema.get('sensitive', False),
                'description': schema.get('description', ''),
            }
        )
    return rows

View File

@@ -0,0 +1,537 @@
"""Background daemon - keeps a single BrowserSession alive.
Each daemon owns one session, identified by a session name (default: 'default').
Isolation is per-session: each gets its own socket and PID file.
Auto-exits when browser dies (polls is_cdp_connected).
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import os
import signal
from pathlib import Path
from typing import TYPE_CHECKING
# Import only for type annotations — avoids importing the sessions module
# (and its heavy dependencies) at daemon startup.
if TYPE_CHECKING:
    from browser_use.skill_cli.sessions import SessionInfo

# Configure logging before imports
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger('browser_use.skill_cli.daemon')
class Daemon:
    """Single-session daemon that manages a browser and handles CLI commands.

    One daemon owns one BrowserSession, identified by a session name. It
    listens on a Unix socket (TCP on Windows), authenticates clients via a
    per-session token file, and shuts itself down when the browser dies or
    after an idle timeout.
    """

    def __init__(
        self,
        headed: bool,
        profile: str | None,
        cdp_url: str | None = None,
        use_cloud: bool = False,
        cloud_profile_id: str | None = None,
        cloud_proxy_country_code: str | None = None,
        cloud_timeout: int | None = None,
        session: str = 'default',
    ) -> None:
        from browser_use.skill_cli.utils import validate_session_name

        # Fail fast on bad session names (they become socket/PID file names).
        validate_session_name(session)
        self.session = session
        self.headed = headed
        self.profile = profile
        self.cdp_url = cdp_url
        self.use_cloud = use_cloud
        self.cloud_profile_id = cloud_profile_id
        self.cloud_proxy_country_code = cloud_proxy_country_code
        self.cloud_timeout = cloud_timeout
        self.running = True
        self._server: asyncio.Server | None = None
        self._shutdown_event = asyncio.Event()
        self._session: SessionInfo | None = None
        self._shutdown_task: asyncio.Task | None = None
        self._browser_watchdog_task: asyncio.Task | None = None
        # Serializes lazy session creation across concurrent connections.
        self._session_lock = asyncio.Lock()
        self._last_command_time: float = 0.0
        self._idle_timeout: float = 30 * 60.0  # 30 minutes
        self._idle_watchdog_task: asyncio.Task | None = None
        self._is_shutting_down: bool = False
        self._auth_token: str = ''

    def _write_state(self, phase: str) -> None:
        """Atomically write session state file for CLI observability.

        Best-effort: write failures are only logged at debug level.
        """
        import time

        from browser_use.skill_cli.utils import get_home_dir

        state = {
            'phase': phase,
            'pid': os.getpid(),
            'updated_at': time.time(),
            'config': {
                'headed': self.headed,
                'profile': self.profile,
                'cdp_url': self.cdp_url,
                'use_cloud': self.use_cloud,
            },
        }
        state_path = get_home_dir() / f'{self.session}.state.json'
        tmp_path = state_path.with_suffix('.state.json.tmp')
        try:
            # Write to a temp file then rename so readers never see a partial file.
            with open(tmp_path, 'w') as f:
                json.dump(state, f)
                f.flush()
                os.fsync(f.fileno())
            os.replace(tmp_path, state_path)
        except OSError as e:
            logger.debug(f'Failed to write state file: {e}')

    def _request_shutdown(self) -> None:
        """Request shutdown exactly once. Safe from any context."""
        if self._is_shutting_down:
            return
        self._is_shutting_down = True
        self._shutdown_task = asyncio.create_task(self._shutdown())

    async def _get_or_create_session(self) -> SessionInfo:
        """Lazy-create the single session on first command."""
        if self._session is not None:
            return self._session
        async with self._session_lock:
            # Double-check after acquiring lock
            if self._session is not None:
                return self._session
            from browser_use.skill_cli.sessions import SessionInfo, create_browser_session

            logger.info(
                f'Creating session (headed={self.headed}, profile={self.profile}, cdp_url={self.cdp_url}, use_cloud={self.use_cloud})'
            )
            self._write_state('starting')
            bs = await create_browser_session(
                self.headed,
                self.profile,
                self.cdp_url,
                use_cloud=self.use_cloud,
                cloud_profile_id=self.cloud_profile_id,
                cloud_proxy_country_code=self.cloud_proxy_country_code,
                cloud_timeout=self.cloud_timeout,
            )
            try:
                await bs.start()
                self._write_state('starting')  # refresh updated_at after bs.start() returns
                # Wait for Chrome to stabilize after CDP setup before accepting commands
                try:
                    await bs.get_browser_state_summary()
                except Exception:
                    pass
                # Create action handler for direct command execution (no event bus)
                from browser_use.skill_cli.actions import ActionHandler

                actions = ActionHandler(bs)
                self._session = SessionInfo(
                    name=self.session,
                    headed=self.headed,
                    profile=self.profile,
                    cdp_url=self.cdp_url,
                    browser_session=bs,
                    actions=actions,
                    use_cloud=self.use_cloud,
                )
                self._browser_watchdog_task = asyncio.create_task(self._watch_browser())
                # Start idle timeout watchdog
                self._idle_watchdog_task = asyncio.create_task(self._watch_idle())
            except Exception:
                # Startup failed — rollback browser resources
                logger.exception('Session startup failed, rolling back')
                self._write_state('failed')
                try:
                    # Cloud sessions get an explicit stop on the cloud client;
                    # locally-launched browsers are killed; external CDP
                    # connections are merely disconnected.
                    if self.use_cloud and hasattr(bs, '_cloud_browser_client') and bs._cloud_browser_client.current_session_id:
                        await asyncio.wait_for(bs._cloud_browser_client.stop_browser(), timeout=10.0)
                    elif not self.cdp_url and not self.use_cloud:
                        await asyncio.wait_for(bs.kill(), timeout=10.0)
                    else:
                        await asyncio.wait_for(bs.stop(), timeout=10.0)
                except Exception as cleanup_err:
                    logger.debug(f'Rollback cleanup error: {cleanup_err}')
                raise
            self._write_state('running')
            return self._session

    async def _watch_browser(self) -> None:
        """Poll BrowserSession.is_cdp_connected every 2s. Shutdown when browser dies.

        Skips checks while the BrowserSession is reconnecting. If reconnection fails,
        next poll will see is_cdp_connected=False and trigger shutdown.
        """
        while self.running:
            await asyncio.sleep(2.0)
            if not self._session:
                continue
            bs = self._session.browser_session
            # Don't shut down while a reconnection attempt is in progress
            if bs.is_reconnecting:
                continue
            if not bs.is_cdp_connected:
                logger.info('Browser disconnected, shutting down daemon')
                self._request_shutdown()
                return

    async def _watch_idle(self) -> None:
        """Shutdown daemon after idle_timeout seconds of no commands."""
        while self.running:
            await asyncio.sleep(60.0)
            # _last_command_time == 0 means no command yet; never idle-kill then.
            if self._last_command_time > 0:
                import time

                idle = time.monotonic() - self._last_command_time
                if idle >= self._idle_timeout:
                    logger.info(f'Daemon idle for {idle:.0f}s, shutting down')
                    self._request_shutdown()
                    return

    async def handle_connection(
        self,
        reader: asyncio.StreamReader,
        writer: asyncio.StreamWriter,
    ) -> None:
        """Handle a single client request (one command per connection)."""
        try:
            line = await asyncio.wait_for(reader.readline(), timeout=300)
            if not line:
                return
            request = {}
            try:
                import hmac

                request = json.loads(line.decode())
                req_id = request.get('id', '')
                # Reject requests that don't carry the correct auth token.
                # Use hmac.compare_digest to prevent timing-oracle attacks.
                if self._auth_token and not hmac.compare_digest(
                    request.get('token', ''),
                    self._auth_token,
                ):
                    response = {'id': req_id, 'success': False, 'error': 'Unauthorized'}
                else:
                    response = await self.dispatch(request)
            except json.JSONDecodeError as e:
                response = {'id': '', 'success': False, 'error': f'Invalid JSON: {e}'}
            except Exception as e:
                logger.exception(f'Error handling request: {e}')
                response = {'id': '', 'success': False, 'error': str(e)}
            writer.write((json.dumps(response) + '\n').encode())
            await writer.drain()
            # Shutdown is triggered only after the response has been flushed.
            if response.get('success') and request.get('action') == 'shutdown':
                self._request_shutdown()
        except TimeoutError:
            logger.debug('Connection timeout')
        except Exception as e:
            logger.exception(f'Connection error: {e}')
        finally:
            writer.close()
            try:
                await writer.wait_closed()
            except Exception:
                pass

    async def dispatch(self, request: dict) -> dict:
        """Route to command handlers.

        Returns a response dict: {'id', 'success', 'data'|'error'}.
        """
        import time

        # Any command (even ping) resets the idle-timeout clock.
        self._last_command_time = time.monotonic()
        action = request.get('action', '')
        params = request.get('params', {})
        req_id = request.get('id', '')
        logger.info(f'Dispatch: {action} (id={req_id})')
        try:
            # Handle shutdown
            if action == 'shutdown':
                return {'id': req_id, 'success': True, 'data': {'shutdown': True}}
            # Handle ping — returns daemon config for mismatch detection
            if action == 'ping':
                # Return live CDP URL (may differ from constructor arg for cloud sessions)
                live_cdp_url = self.cdp_url
                if self._session and self._session.browser_session.cdp_url:
                    live_cdp_url = self._session.browser_session.cdp_url
                return {
                    'id': req_id,
                    'success': True,
                    'data': {
                        'session': self.session,
                        'pid': os.getpid(),
                        'headed': self.headed,
                        'profile': self.profile,
                        'cdp_url': live_cdp_url,
                        'use_cloud': self.use_cloud,
                    },
                }
            # Handle connect — forces immediate session creation (used by cloud connect)
            if action == 'connect':
                session = await self._get_or_create_session()
                bs = session.browser_session
                result_data: dict = {'status': 'connected'}
                if bs.cdp_url:
                    result_data['cdp_url'] = bs.cdp_url
                if self.use_cloud and bs.cdp_url:
                    from urllib.parse import quote

                    result_data['live_url'] = f'https://live.browser-use.com/?wss={quote(bs.cdp_url, safe="")}'
                return {'id': req_id, 'success': True, 'data': result_data}

            from browser_use.skill_cli.commands import browser, python_exec

            # Get or create the single session
            session = await self._get_or_create_session()
            # Dispatch to handler
            if action in browser.COMMANDS:
                result = await browser.handle(action, session, params)
            elif action == 'python':
                result = await python_exec.handle(session, params)
            else:
                return {'id': req_id, 'success': False, 'error': f'Unknown action: {action}'}
            return {'id': req_id, 'success': True, 'data': result}
        except Exception as e:
            logger.exception(f'Error dispatching {action}: {e}')
            return {'id': req_id, 'success': False, 'error': str(e)}

    async def run(self) -> None:
        """Listen on Unix socket (or TCP on Windows) with PID file.

        Note: we do NOT unlink the socket in our finally block. If a replacement
        daemon was spawned during our shutdown, it already bound a new socket at
        the same path — unlinking here would delete *its* socket, orphaning it.
        Stale sockets are cleaned up by is_daemon_alive() and by the next
        daemon's startup (unlink before bind).
        """
        import secrets

        from browser_use.skill_cli.utils import get_auth_token_path, get_pid_path, get_socket_path

        self._write_state('initializing')
        # Generate and persist a per-session auth token.
        # The client reads this file to authenticate its requests, preventing
        # any other local process from sending commands to the daemon socket.
        # Create the temp file with 0o600 at open() time to avoid a permission
        # race window where the file exists but is not yet restricted.
        # Raise on failure — running without a readable token file leaves the
        # daemon permanently unauthorized for all clients.
        self._auth_token = secrets.token_hex(32)
        token_path = get_auth_token_path(self.session)
        tmp_token = token_path.with_suffix('.token.tmp')
        fd = os.open(str(tmp_token), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        try:
            with os.fdopen(fd, 'w') as f:
                f.write(self._auth_token)
        except OSError:
            try:
                tmp_token.unlink(missing_ok=True)
            except OSError:
                pass
            raise
        os.replace(tmp_token, token_path)

        # Setup signal handlers
        loop = asyncio.get_running_loop()

        def signal_handler():
            self._request_shutdown()

        for sig in (signal.SIGINT, signal.SIGTERM):
            try:
                loop.add_signal_handler(sig, signal_handler)
            except NotImplementedError:
                pass  # Windows doesn't support add_signal_handler
        if hasattr(signal, 'SIGHUP'):
            try:
                loop.add_signal_handler(signal.SIGHUP, signal_handler)
            except NotImplementedError:
                pass

        sock_path = get_socket_path(self.session)
        pid_path = get_pid_path(self.session)
        logger.info(f'Session: {self.session}, Socket: {sock_path}')

        if sock_path.startswith('tcp://'):
            # Windows: TCP server
            _, hostport = sock_path.split('://', 1)
            host, port = hostport.split(':')
            self._server = await asyncio.start_server(
                self.handle_connection,
                host,
                int(port),
                reuse_address=True,
            )
            logger.info(f'Listening on TCP {host}:{port}')
        else:
            # Unix: socket server
            Path(sock_path).unlink(missing_ok=True)
            self._server = await asyncio.start_unix_server(
                self.handle_connection,
                sock_path,
            )
            logger.info(f'Listening on Unix socket {sock_path}')

        # Write PID file after server is bound
        my_pid = str(os.getpid())
        pid_path.write_text(my_pid)
        self._write_state('ready')
        try:
            async with self._server:
                await self._shutdown_event.wait()
            # Wait for shutdown to finish browser cleanup before exiting
            if self._shutdown_task:
                await self._shutdown_task
        except asyncio.CancelledError:
            pass
        finally:
            # Conditionally delete PID file only if it still contains our PID
            try:
                if pid_path.read_text().strip() == my_pid:
                    pid_path.unlink(missing_ok=True)
            except (OSError, ValueError):
                pass
            logger.info('Daemon stopped')

    async def _shutdown(self) -> None:
        """Graceful shutdown. Only called via _request_shutdown().

        Order matters: close the server first to release the socket/port
        immediately, so a replacement daemon can bind without waiting for
        browser cleanup. Then kill the browser session.
        """
        logger.info('Shutting down daemon...')
        self._write_state('shutting_down')
        self.running = False
        self._shutdown_event.set()
        if self._browser_watchdog_task:
            self._browser_watchdog_task.cancel()
        if self._idle_watchdog_task:
            self._idle_watchdog_task.cancel()
        if self._server:
            self._server.close()
        if self._session:
            try:
                # Only kill the browser if the daemon launched it.
                # For external connections (--connect, --cdp-url, cloud), just disconnect.
                # Timeout ensures daemon exits even if CDP calls hang on a dead connection
                if self.cdp_url or self.use_cloud:
                    await asyncio.wait_for(self._session.browser_session.stop(), timeout=10.0)
                else:
                    await asyncio.wait_for(self._session.browser_session.kill(), timeout=10.0)
            except TimeoutError:
                logger.warning('Browser cleanup timed out after 10s, forcing exit')
            except Exception as e:
                logger.warning(f'Error closing session: {e}')
            self._session = None
        # Delete PID and auth token files last, right before exit.
        import os

        from browser_use.skill_cli.utils import get_auth_token_path, get_pid_path

        pid_path = get_pid_path(self.session)
        try:
            # Only remove the PID file if it is still ours (a replacement
            # daemon may already have written its own PID).
            if pid_path.exists() and pid_path.read_text().strip() == str(os.getpid()):
                pid_path.unlink(missing_ok=True)
        except (OSError, ValueError):
            pass
        get_auth_token_path(self.session).unlink(missing_ok=True)
        self._write_state('stopped')
        # Force exit — the asyncio server's __aexit__ hangs waiting for the
        # handle_connection() call that triggered this shutdown to return.
        logger.info('Daemon process exiting')
        os._exit(0)
def main() -> None:
    """Main entry point for daemon process.

    Parses CLI flags, runs the daemon event loop, and force-exits via
    os._exit so lingering asyncio tasks cannot orphan the process.
    """
    parser = argparse.ArgumentParser(description='Browser-use daemon')
    parser.add_argument('--session', default='default', help='Session name (default: "default")')
    parser.add_argument('--headed', action='store_true', help='Show browser window')
    parser.add_argument('--profile', help='Chrome profile (triggers real Chrome mode)')
    parser.add_argument('--cdp-url', help='CDP URL to connect to')
    parser.add_argument('--use-cloud', action='store_true', help='Use cloud browser')
    parser.add_argument('--cloud-profile-id', help='Cloud browser profile ID')
    parser.add_argument('--cloud-proxy-country', help='Cloud browser proxy country code')
    parser.add_argument('--cloud-timeout', type=int, help='Cloud browser timeout in minutes')
    args = parser.parse_args()
    logger.info(
        f'Starting daemon: session={args.session}, headed={args.headed}, profile={args.profile}, cdp_url={args.cdp_url}, use_cloud={args.use_cloud}'
    )
    daemon = Daemon(
        headed=args.headed,
        profile=args.profile,
        cdp_url=args.cdp_url,
        use_cloud=args.use_cloud,
        cloud_profile_id=args.cloud_profile_id,
        cloud_proxy_country_code=args.cloud_proxy_country,
        cloud_timeout=args.cloud_timeout,
        session=args.session,
    )
    exit_code = 0
    try:
        asyncio.run(daemon.run())
    except KeyboardInterrupt:
        logger.info('Interrupted')
    except Exception as e:
        logger.exception(f'Daemon error: {e}')
        exit_code = 1
    finally:
        # Write failed state if we crashed without a clean shutdown
        if not daemon._is_shutting_down:
            try:
                daemon._write_state('failed')
            except Exception:
                pass
        # asyncio.run() may hang trying to cancel lingering tasks
        # Force-exit to prevent the daemon from becoming an orphan
        logger.info('Daemon process exiting')
        os._exit(exit_code)


if __name__ == '__main__':
    main()

View File

@@ -2,17 +2,8 @@
# Browser-Use Bootstrap Installer
#
# Usage:
# # Interactive install (shows mode selection TUI)
# curl -fsSL https://browser-use.com/cli/install.sh | bash
#
# # Non-interactive install with flags
# curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full
# curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only
# curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --local-only
#
# # With API key
# curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --remote-only --api-key bu_xxx
#
# For development testing:
# curl -fsSL <raw-url> | BROWSER_USE_BRANCH=<branch-name> bash
#
@@ -24,7 +15,7 @@
# winget install Git.Git
#
# Then run from PowerShell:
# & "C:\Program Files\Git\bin\bash.exe" -c 'curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- --full'
# & "C:\Program Files\Git\bin\bash.exe" -c 'curl -fsSL https://browser-use.com/cli/install.sh | bash'
#
# KNOWN ISSUES AND SOLUTIONS:
#
@@ -76,10 +67,10 @@
# - Always kill stale processes before retrying
# - Or kill all Python: taskkill /IM python.exe /F
#
# 7. Debugging server issues
# To see actual error messages instead of "Failed to start session server":
# & "$env:USERPROFILE\.browser-use-env\Scripts\python.exe" -m browser_use.skill_cli.server --session default --browser chromium
# This runs the server in foreground and shows all errors.
# 7. Debugging daemon issues
# To see actual error messages instead of "Failed to start daemon":
# & "$env:USERPROFILE\.browser-use-env\Scripts\python.exe" -m browser_use.skill_cli.daemon
# This runs the daemon in foreground and shows all errors.
#
# =============================================================================
@@ -89,12 +80,6 @@ set -e
# Configuration
# =============================================================================
# Mode flags (set by parse_args or TUI)
INSTALL_LOCAL=false
INSTALL_REMOTE=false
SKIP_INTERACTIVE=false
API_KEY=""
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
@@ -130,43 +115,15 @@ log_error() {
parse_args() {
while [[ $# -gt 0 ]]; do
case $1 in
--full|--all)
INSTALL_LOCAL=true
INSTALL_REMOTE=true
SKIP_INTERACTIVE=true
shift
;;
--remote-only)
INSTALL_REMOTE=true
SKIP_INTERACTIVE=true
shift
;;
--local-only)
INSTALL_LOCAL=true
SKIP_INTERACTIVE=true
shift
;;
--api-key)
if [ -z "$2" ] || [[ "$2" == --* ]]; then
log_error "--api-key requires a value"
exit 1
fi
API_KEY="$2"
shift 2
;;
--help|-h)
echo "Browser-Use Installer"
echo ""
echo "Usage: install.sh [OPTIONS]"
echo ""
echo "Options:"
echo " --full, --all Install all modes (local + remote)"
echo " --remote-only Install remote mode only (no Chromium)"
echo " --local-only Install local modes only (no cloudflared)"
echo " --api-key KEY Set Browser-Use API key"
echo " --help, -h Show this help"
echo ""
echo "Without options, shows interactive mode selection."
echo "Installs Python 3.11+ (if needed), uv, browser-use, and Chromium."
exit 0
;;
*)
@@ -331,6 +288,10 @@ install_python() {
install_uv() {
log_info "Installing uv package manager..."
# Add common uv install locations to PATH for current session
# (covers both curl-based and Homebrew installs)
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
if command -v uv &> /dev/null; then
log_success "uv already installed"
return 0
@@ -339,9 +300,6 @@ install_uv() {
# Use official uv installer
curl -LsSf https://astral.sh/uv/install.sh | sh
# Add common uv install locations to PATH for current session
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
if command -v uv &> /dev/null; then
log_success "uv installed successfully"
else
@@ -350,121 +308,6 @@ install_uv() {
fi
}
# =============================================================================
# Gum TUI installation
# =============================================================================
install_gum() {
# Install gum for beautiful TUI - silent and fast
if command -v gum &> /dev/null; then
return 0
fi
local arch=$(uname -m)
local gum_version="0.14.5"
local gum_dir=""
mkdir -p "$HOME/.local/bin"
export PATH="$HOME/.local/bin:$PATH"
case "$PLATFORM" in
macos)
if [ "$arch" = "arm64" ]; then
gum_dir="gum_${gum_version}_Darwin_arm64"
curl -sL "https://github.com/charmbracelet/gum/releases/download/v${gum_version}/gum_${gum_version}_Darwin_arm64.tar.gz" | tar -xz -C /tmp
else
gum_dir="gum_${gum_version}_Darwin_x86_64"
curl -sL "https://github.com/charmbracelet/gum/releases/download/v${gum_version}/gum_${gum_version}_Darwin_x86_64.tar.gz" | tar -xz -C /tmp
fi
mv "/tmp/${gum_dir}/gum" "$HOME/.local/bin/" 2>/dev/null || return 1
rm -rf "/tmp/${gum_dir}" 2>/dev/null
;;
linux)
if [ "$arch" = "aarch64" ] || [ "$arch" = "arm64" ]; then
gum_dir="gum_${gum_version}_Linux_arm64"
curl -sL "https://github.com/charmbracelet/gum/releases/download/v${gum_version}/gum_${gum_version}_Linux_arm64.tar.gz" | tar -xz -C /tmp
else
gum_dir="gum_${gum_version}_Linux_x86_64"
curl -sL "https://github.com/charmbracelet/gum/releases/download/v${gum_version}/gum_${gum_version}_Linux_x86_64.tar.gz" | tar -xz -C /tmp
fi
mv "/tmp/${gum_dir}/gum" "$HOME/.local/bin/" 2>/dev/null || return 1
rm -rf "/tmp/${gum_dir}" 2>/dev/null
;;
windows)
# Download and extract Windows binary
curl -sL "https://github.com/charmbracelet/gum/releases/download/v${gum_version}/gum_${gum_version}_Windows_x86_64.zip" -o /tmp/gum.zip
unzip -q /tmp/gum.zip -d /tmp/gum_windows 2>/dev/null || return 1
# Binary is inside a subdirectory: gum_x.x.x_Windows_x86_64/gum.exe
mv "/tmp/gum_windows/gum_${gum_version}_Windows_x86_64/gum.exe" "$HOME/.local/bin/" 2>/dev/null || return 1
rm -rf /tmp/gum.zip /tmp/gum_windows 2>/dev/null
;;
*)
return 1
;;
esac
command -v gum &> /dev/null
}
# =============================================================================
# Interactive mode selection TUI
# =============================================================================
show_mode_menu() {
# Try to install gum for nice TUI
if install_gum; then
show_gum_menu
else
show_bash_menu
fi
}
show_gum_menu() {
echo ""
# Styled header
gum style --foreground 212 --bold "Select browser modes to install"
gum style --foreground 240 "Use arrow keys to navigate, space to select, enter to confirm"
echo ""
# Checkbox selection with gum choose
set +e
SELECTED=$(gum choose --no-limit --height 10 \
--cursor-prefix "[ ] " --selected-prefix "[✓] " --unselected-prefix "[ ] " \
--header "" \
--cursor.foreground 212 \
--selected.foreground 212 \
"Local browser (chromium/real - requires Chromium)" \
"Remote browser (cloud - requires API key)" < /dev/tty)
set -e
# Parse selections
if [[ "$SELECTED" == *"Local"* ]]; then INSTALL_LOCAL=true; fi
if [[ "$SELECTED" == *"Remote"* ]]; then INSTALL_REMOTE=true; fi
}
show_bash_menu() {
echo ""
echo "Select browser modes to install (space-separated numbers):"
echo ""
echo " 1) Local browser (chromium/real - requires Chromium download)"
echo " 2) Remote browser (cloud - requires API key)"
echo ""
echo "Press Enter for default [1]"
echo ""
echo -n "> "
# Read from /dev/tty to work even when script is piped
# Keep set +e for the whole function to avoid issues with pattern matching
set +e
read -r choices < /dev/tty
choices=${choices:-1}
if [[ "$choices" == *"1"* ]]; then INSTALL_LOCAL=true; fi
if [[ "$choices" == *"2"* ]]; then INSTALL_REMOTE=true; fi
set -e
}
# =============================================================================
# Browser-Use installation
# =============================================================================
@@ -515,119 +358,19 @@ install_chromium() {
log_success "Chromium installed"
}
install_cloudflared() {
log_info "Installing cloudflared..."
install_profile_use() {
log_info "Installing profile-use..."
if command -v cloudflared &> /dev/null; then
log_success "cloudflared already installed"
return 0
fi
mkdir -p "$HOME/.browser-use/bin"
curl -fsSL https://browser-use.com/profile/cli/install.sh | PROFILE_USE_VERSION=v1.0.2 INSTALL_DIR="$HOME/.browser-use/bin" sh
local arch=$(uname -m)
case "$PLATFORM" in
macos)
if command -v brew &> /dev/null; then
brew install cloudflared
else
# Direct download for macOS without Homebrew
mkdir -p "$HOME/.local/bin"
if [ "$arch" = "arm64" ]; then
curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-darwin-arm64.tgz -o /tmp/cloudflared.tgz
else
curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-darwin-amd64.tgz -o /tmp/cloudflared.tgz
fi
tar -xzf /tmp/cloudflared.tgz -C "$HOME/.local/bin/"
rm /tmp/cloudflared.tgz
fi
;;
linux)
mkdir -p "$HOME/.local/bin"
if [ "$arch" = "aarch64" ] || [ "$arch" = "arm64" ]; then
curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-arm64 -o "$HOME/.local/bin/cloudflared"
else
curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -o "$HOME/.local/bin/cloudflared"
fi
chmod +x "$HOME/.local/bin/cloudflared"
;;
windows)
# Auto-install via winget (comes pre-installed on Windows 10/11)
if command -v winget.exe &> /dev/null; then
winget.exe install --id Cloudflare.cloudflared --accept-source-agreements --accept-package-agreements --silent
else
log_warn "winget not found. Install cloudflared manually:"
log_warn " Download from: https://github.com/cloudflare/cloudflared/releases"
return 0
fi
;;
esac
# Add ~/.local/bin to PATH for current session
export PATH="$HOME/.local/bin:$PATH"
if command -v cloudflared &> /dev/null; then
log_success "cloudflared installed successfully"
if [ -x "$HOME/.browser-use/bin/profile-use" ]; then
log_success "profile-use installed"
else
log_warn "cloudflared installation failed. You can install it manually later."
log_warn "profile-use installation failed (will auto-download on first use)"
fi
}
# =============================================================================
# Install dependencies based on selected modes
# =============================================================================
install_dependencies() {
# Install base package (always needed)
install_browser_use
# Install Chromium only if local mode selected
if [ "$INSTALL_LOCAL" = true ]; then
install_chromium
else
log_info "Skipping Chromium (remote-only mode)"
fi
# Install cloudflared only if remote mode selected
if [ "$INSTALL_REMOTE" = true ]; then
install_cloudflared
else
log_info "Skipping cloudflared (local-only mode)"
fi
}
# =============================================================================
# Write install configuration
# =============================================================================
write_install_config() {
# Determine installed modes and default
local modes=""
local default_mode=""
if [ "$INSTALL_LOCAL" = true ] && [ "$INSTALL_REMOTE" = true ]; then
modes='["chromium", "real", "remote"]'
default_mode="chromium"
elif [ "$INSTALL_REMOTE" = true ]; then
modes='["remote"]'
default_mode="remote"
else
modes='["chromium", "real"]'
default_mode="chromium"
fi
# Write config file
mkdir -p "$HOME/.browser-use"
cat > "$HOME/.browser-use/install-config.json" << EOF
{
"installed_modes": $modes,
"default_mode": "$default_mode"
}
EOF
local mode_names=$(echo $modes | tr -d '[]"' | tr ',' ' ')
log_success "Configured: $mode_names"
}
# =============================================================================
# PATH configuration
# =============================================================================
@@ -637,20 +380,19 @@ configure_path() {
local bin_path=$(get_venv_bin_dir)
local local_bin="$HOME/.local/bin"
# Detect shell
if [ -n "$BASH_VERSION" ]; then
shell_rc="$HOME/.bashrc"
elif [ -n "$ZSH_VERSION" ]; then
shell_rc="$HOME/.zshrc"
else
shell_rc="$HOME/.profile"
fi
# Detect user's login shell (not the running shell, since this script
# is typically executed via "curl ... | bash" which always sets BASH_VERSION)
case "$(basename "$SHELL")" in
zsh) shell_rc="$HOME/.zshrc" ;;
bash) shell_rc="$HOME/.bashrc" ;;
*) shell_rc="$HOME/.profile" ;;
esac
# Check if already in PATH (browser-use-env matches both /bin and /Scripts)
if grep -q "browser-use-env" "$shell_rc" 2>/dev/null; then
log_info "PATH already configured in $shell_rc"
else
# Add to shell config (includes ~/.local/bin for cloudflared)
# Add to shell config (includes ~/.local/bin for tools)
echo "" >> "$shell_rc"
echo "# Browser-Use" >> "$shell_rc"
echo "export PATH=\"$bin_path:$local_bin:\$PATH\"" >> "$shell_rc"
@@ -689,32 +431,6 @@ configure_powershell_path() {
fi
}
# =============================================================================
# Setup wizard
# =============================================================================
run_setup() {
log_info "Running setup wizard..."
# Activate venv
activate_venv
# Determine profile based on mode selections
local profile="local"
if [ "$INSTALL_REMOTE" = true ] && [ "$INSTALL_LOCAL" = true ]; then
profile="full"
elif [ "$INSTALL_REMOTE" = true ]; then
profile="remote"
fi
# Run setup with API key if provided
if [ -n "$API_KEY" ]; then
browser-use setup --mode "$profile" --api-key "$API_KEY" --yes
else
browser-use setup --mode "$profile" --yes
fi
}
# =============================================================================
# Validation
# =============================================================================
@@ -738,34 +454,18 @@ validate() {
# =============================================================================
print_next_steps() {
# Detect shell for source command
local shell_rc=".bashrc"
if [ -n "$ZSH_VERSION" ]; then
shell_rc=".zshrc"
fi
# Detect shell for source command (must match configure_path logic)
case "$(basename "$SHELL")" in
zsh) local shell_rc=".zshrc" ;;
bash) local shell_rc=".bashrc" ;;
*) local shell_rc=".profile" ;;
esac
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
log_success "Browser-Use installed successfully!"
echo ""
echo "Installed modes:"
[ "$INSTALL_LOCAL" = true ] && echo " ✓ Local (chromium, real)"
[ "$INSTALL_REMOTE" = true ] && echo " ✓ Remote (cloud)"
echo ""
# Show API key instructions if remote selected but no key provided
if [ "$INSTALL_REMOTE" = true ] && [ -z "$API_KEY" ]; then
echo "⚠ API key required for remote mode:"
if [ "$PLATFORM" = "windows" ]; then
echo " \$env:BROWSER_USE_API_KEY=\"<your-api-key>\""
else
echo " export BROWSER_USE_API_KEY=<your-api-key>"
fi
echo ""
echo " Get your API key at: https://browser-use.com"
echo ""
fi
echo "Next steps:"
if [ "$PLATFORM" = "windows" ]; then
@@ -773,13 +473,7 @@ print_next_steps() {
else
echo " 1. Restart your shell or run: source ~/$shell_rc"
fi
if [ "$INSTALL_REMOTE" = true ] && [ -z "$API_KEY" ]; then
echo " 2. Set your API key (see above)"
echo " 3. Try: browser-use open https://example.com"
else
echo " 2. Try: browser-use open https://example.com"
fi
echo " 2. Try: browser-use open https://example.com"
echo ""
echo "Documentation: https://docs.browser-use.com"
@@ -801,25 +495,13 @@ main() {
# Parse command-line flags
parse_args "$@"
# Show install mode if flags provided
if [ "$SKIP_INTERACTIVE" = true ]; then
if [ "$INSTALL_LOCAL" = true ] && [ "$INSTALL_REMOTE" = true ]; then
log_info "Install mode: full (local + remote)"
elif [ "$INSTALL_REMOTE" = true ]; then
log_info "Install mode: remote-only"
else
log_info "Install mode: local-only"
fi
echo ""
fi
# Step 1: Detect platform
detect_platform
# Step 2: Check/install Python
if ! check_python; then
# In CI or non-interactive mode (no tty), auto-install Python
if [ ! -t 0 ] || [ "$SKIP_INTERACTIVE" = true ]; then
if [ ! -t 0 ]; then
log_info "Python 3.11+ not found. Installing automatically..."
install_python
else
@@ -837,35 +519,29 @@ main() {
# Step 3: Install uv
install_uv
# Step 4: Show mode selection TUI (unless skipped via flags)
if [ "$SKIP_INTERACTIVE" = false ]; then
show_mode_menu
# Step 4: Install browser-use package
install_browser_use
# Step 5: Install Chromium
install_chromium
# Step 6: Install profile-use
install_profile_use
# Step 6.5: Create config.json if it doesn't exist
config_file="$HOME/.browser-use/config.json"
if [ ! -f "$config_file" ]; then
echo '{}' > "$config_file"
chmod 600 "$config_file"
fi
# Default to local-only if nothing selected
if [ "$INSTALL_LOCAL" = false ] && [ "$INSTALL_REMOTE" = false ]; then
log_warn "No modes selected, defaulting to local"
INSTALL_LOCAL=true
fi
echo ""
# Step 5: Install dependencies
install_dependencies
# Step 6: Write install config
write_install_config
# Step 7: Configure PATH
configure_path
# Step 8: Run setup wizard
run_setup
# Step 9: Validate
# Step 8: Validate
validate
# Step 10: Show next steps
# Step 9: Print next steps
print_next_steps
}

View File

@@ -1,111 +0,0 @@
"""Install configuration - tracks which browser modes are available.
This module manages the installation configuration that determines which browser
modes (chromium, real, remote) are available based on how browser-use was installed.
Config file: ~/.browser-use/install-config.json
When no config file exists (e.g., pip install users), all modes are available by default.
"""
import json
from pathlib import Path
from typing import Literal
CONFIG_PATH = Path.home() / '.browser-use' / 'install-config.json'
ModeType = Literal['chromium', 'real', 'remote']
# Local modes (both require Chromium to be installed)
LOCAL_MODES: set[str] = {'chromium', 'real'}
def get_config() -> dict:
	"""Read install config. Returns default if not found.

	Default config enables all modes (for pip install users).

	Returns:
		Parsed contents of ~/.browser-use/install-config.json, or the
		all-modes default when the file is missing, unreadable, or corrupt.
	"""
	if not CONFIG_PATH.exists():
		# No config file — e.g. installed via pip rather than install.sh.
		return {
			'installed_modes': ['chromium', 'real', 'remote'],
			'default_mode': 'chromium',
		}
	try:
		return json.loads(CONFIG_PATH.read_text())
	except (json.JSONDecodeError, OSError):
		# Config file corrupt, return default
		return {
			'installed_modes': ['chromium', 'real', 'remote'],
			'default_mode': 'chromium',
		}
def save_config(installed_modes: list[str], default_mode: str) -> None:
	"""Save install config.

	Args:
		installed_modes: Mode names to record (e.g. ['chromium', 'real']).
		default_mode: Mode to use when the caller specifies none.

	Writes pretty-printed JSON to ~/.browser-use/install-config.json,
	creating parent directories as needed.
	"""
	CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
	CONFIG_PATH.write_text(
		json.dumps(
			{
				'installed_modes': installed_modes,
				'default_mode': default_mode,
			},
			indent=2,
		)
	)
def is_mode_available(mode: str) -> bool:
	"""Check if a browser mode is available based on installation config.

	Local modes ('chromium', 'real') are treated as one family: installing
	either of them makes both available. 'remote' is checked individually.

	Args:
		mode: The browser mode to check ('chromium', 'real', or 'remote')

	Returns:
		True if the mode is available, False otherwise
	"""
	installed_modes = set(get_config().get('installed_modes', []))
	if mode in LOCAL_MODES:
		# Either local mode being installed unlocks the whole local family.
		return not LOCAL_MODES.isdisjoint(installed_modes)
	return mode in installed_modes
def get_default_mode() -> str:
	"""Return the configured default browser mode ('chromium' when unset)."""
	config = get_config()
	return config.get('default_mode', 'chromium')
def get_available_modes() -> list[str]:
	"""Return the list of installed browser modes (all modes when unset)."""
	config = get_config()
	return config.get('installed_modes', ['chromium', 'real', 'remote'])
def get_mode_unavailable_error(mode: str) -> str:
	"""Generate a helpful error message when a mode is not available.

	Args:
		mode: The unavailable mode that was requested

	Returns:
		A formatted error message with instructions for reinstalling
	"""
	available = get_available_modes()
	# All modes are restored by the same '--full' reinstall, so the flag is
	# identical for both families; only the human-readable description differs.
	# (The previous if/else assigned '--full' in both branches — dead branch.)
	install_flag = '--full'
	mode_desc = 'Local browser mode' if mode in LOCAL_MODES else 'Remote browser mode'
	return (
		f"Error: {mode_desc} '{mode}' not installed.\n"
		f'Available modes: {", ".join(available)}\n\n'
		f'To install all modes, reinstall with:\n'
		f' curl -fsSL https://browser-use.com/cli/install.sh | bash -s -- {install_flag}'
	)

View File

@@ -0,0 +1,510 @@
#!/usr/bin/env bash
# Browser-Use Lightweight CLI Installer
#
# Installs only the minimal dependencies needed for the CLI (~10 packages
# instead of ~50). Use this if you only need the browser-use CLI commands
# and don't need the Python library (Agent, LLM integrations, etc.).
#
# Usage:
# curl -fsSL <url>/install_lite.sh | bash
#
# For development testing:
# curl -fsSL <raw-url> | BROWSER_USE_BRANCH=<branch-name> bash
#
# To install the full library instead, use install.sh.
#
# =============================================================================
set -e
# =============================================================================
# Prerequisites
# =============================================================================
if ! command -v curl &> /dev/null; then
echo "Error: curl is required but not installed."
echo "Install it and try again."
exit 1
fi
# =============================================================================
# Configuration
# =============================================================================
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m' # No Color
# =============================================================================
# Logging functions
# =============================================================================
# Logging helpers: one-line colored status messages.
# NOTE(review): each echo emits a color code immediately followed by the
# reset code (${BLUE}${NC}) — a status glyph between them may have been
# lost in transit; confirm the icons render as intended.

# Informational message.
log_info() {
    echo -e "${BLUE}${NC} $1"
}

# Success message.
log_success() {
    echo -e "${GREEN}${NC} $1"
}

# Warning message.
log_warn() {
    echo -e "${YELLOW}${NC} $1"
}

# Error message.
log_error() {
    echo -e "${RED}${NC} $1"
}
# =============================================================================
# Argument parsing
# =============================================================================
# Parse command-line flags.
# Only --help/-h is recognized; unknown arguments are warned about and
# skipped so "curl ... | bash -s -- <stale-flag>" doesn't abort the install.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case $1 in
            --help|-h)
                echo "Browser-Use Lightweight CLI Installer"
                echo ""
                echo "Usage: install_lite.sh [OPTIONS]"
                echo ""
                echo "Options:"
                echo " --help, -h Show this help"
                echo ""
                echo "Installs Python 3.11+ (if needed), uv, browser-use CLI (minimal deps), and Chromium."
                exit 0
                ;;
            *)
                log_warn "Unknown argument: $1 (ignored)"
                shift
                ;;
        esac
    done
}
# =============================================================================
# Platform detection
# =============================================================================
# Detect the host OS and set the global PLATFORM to linux/macos/windows.
# Exits with an error for unsupported systems.
detect_platform() {
    local os=$(uname -s | tr '[:upper:]' '[:lower:]')
    local arch=$(uname -m)
    case "$os" in
        linux*)
            PLATFORM="linux"
            ;;
        darwin*)
            PLATFORM="macos"
            ;;
        msys*|mingw*|cygwin*)
            # Git Bash / MSYS2 / Cygwin all indicate a Windows host.
            PLATFORM="windows"
            ;;
        *)
            log_error "Unsupported OS: $os"
            exit 1
            ;;
    esac
    log_info "Detected platform: $PLATFORM ($arch)"
}
# =============================================================================
# Virtual environment helpers
# =============================================================================
# Get the correct venv bin directory (Scripts on Windows, bin on Unix)
# Echo the venv's executable directory: Scripts/ on Windows, bin/ elsewhere.
get_venv_bin_dir() {
    case "$PLATFORM" in
        windows) echo "$HOME/.browser-use-env/Scripts" ;;
        *)       echo "$HOME/.browser-use-env/bin" ;;
    esac
}
# Activate the virtual environment (handles Windows vs Unix paths)
# Activate the virtual environment (handles Windows vs Unix layouts).
# Exits with an error if the venv hasn't been created yet.
activate_venv() {
    local venv_bin=$(get_venv_bin_dir)
    if [ -f "$venv_bin/activate" ]; then
        source "$venv_bin/activate"
    else
        log_error "Virtual environment not found at $venv_bin"
        exit 1
    fi
}
# =============================================================================
# Python management
# =============================================================================
# Probe for a Python >= 3.11 interpreter and record its command in the
# global PYTHON_CMD. Returns 0 when one is found, 1 otherwise.
check_python() {
    log_info "Checking Python installation..."

    # Check versioned python commands first (python3.13, python3.12, python3.11)
    # This handles Ubuntu/Debian where python3 may point to older version
    # Also check common install locations directly in case PATH isn't updated
    local py_candidates="python3.13 python3.12 python3.11 python3 python"
    local py_paths="/usr/bin/python3.11 /usr/local/bin/python3.11"

    for py_cmd in $py_candidates; do
        if command -v "$py_cmd" &> /dev/null; then
            local version=$($py_cmd --version 2>&1 | awk '{print $2}')
            if python_version_ok "$(echo $version | cut -d. -f1)" "$(echo $version | cut -d. -f2)"; then
                PYTHON_CMD="$py_cmd"
                log_success "Python $version found ($py_cmd)"
                return 0
            fi
        fi
    done

    # Also check common paths directly (in case command -v doesn't find them)
    for py_path in $py_paths; do
        if [ -x "$py_path" ]; then
            local version=$($py_path --version 2>&1 | awk '{print $2}')
            if python_version_ok "$(echo $version | cut -d. -f1)" "$(echo $version | cut -d. -f2)"; then
                PYTHON_CMD="$py_path"
                log_success "Python $version found ($py_path)"
                return 0
            fi
        fi
    done

    # No suitable Python found
    if command -v python3 &> /dev/null; then
        local version=$(python3 --version 2>&1 | awk '{print $2}')
        log_warn "Python $version found, but 3.11+ required"
    else
        log_warn "Python not found"
    fi
    return 1
}

# Return success when major.minor satisfies the >= 3.11 requirement.
# Fixes the previous "[ major -ge 3 ] && [ minor -ge 11 ]" check, which
# wrongly rejected any future Python 4.x release (minor 0 < 11).
python_version_ok() {
    local major="$1" minor="$2"
    [ "$major" -gt 3 ] || { [ "$major" -eq 3 ] && [ "$minor" -ge 11 ]; }
}
# Install Python 3.11 with the platform's package manager.
# macOS: Homebrew; Linux: apt-get or yum; Windows: manual install only.
# Exits on unsupported setups or if the install doesn't yield Python 3.11+.
install_python() {
    log_info "Installing Python 3.11+..."

    # Use sudo only if not root and sudo is available
    SUDO=""
    if [ "$(id -u)" -ne 0 ] && command -v sudo &> /dev/null; then
        SUDO="sudo"
    fi

    case "$PLATFORM" in
        macos)
            if command -v brew &> /dev/null; then
                brew install python@3.11
            else
                log_error "Homebrew not found. Install from: https://brew.sh"
                exit 1
            fi
            ;;
        linux)
            if command -v apt-get &> /dev/null; then
                $SUDO apt-get update
                $SUDO apt-get install -y python3.11 python3.11-venv python3-pip
            elif command -v yum &> /dev/null; then
                $SUDO yum install -y python311 python311-pip
            else
                log_error "Unsupported package manager. Install Python 3.11+ manually."
                exit 1
            fi
            ;;
        windows)
            log_error "Please install Python 3.11+ from: https://www.python.org/downloads/"
            exit 1
            ;;
    esac

    # Verify installation (also sets PYTHON_CMD on success)
    if check_python; then
        log_success "Python installed successfully"
    else
        log_error "Python installation failed"
        exit 1
    fi
}
# =============================================================================
# uv package manager
# =============================================================================
# Install the uv package manager via the official installer, if missing.
# Prepends common uv install locations to PATH so an already-installed or
# just-installed uv is found without restarting the shell.
install_uv() {
    log_info "Installing uv package manager..."

    # Add common uv install locations to PATH for current session
    # (covers both curl-based and Homebrew installs)
    export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"

    if command -v uv &> /dev/null; then
        log_success "uv already installed"
        return 0
    fi

    # Use official uv installer
    if ! command -v curl &> /dev/null; then
        log_error "curl is required but not found. Install curl and try again."
        exit 1
    fi
    curl -LsSf https://astral.sh/uv/install.sh | sh

    if command -v uv &> /dev/null; then
        log_success "uv installed successfully"
    else
        log_error "uv installation failed. Try restarting your shell and run the installer again."
        exit 1
    fi
}
# =============================================================================
# Browser-Use installation (lightweight - CLI deps only)
# =============================================================================
# Install the browser-use package with only its minimal CLI dependencies
# into the ~/.browser-use-env virtualenv (created on first run).
install_browser_use() {
    log_info "Installing browser-use (lightweight CLI)..."

    # git is needed for the shallow clone below; fail early with a clear
    # message instead of a confusing "git: command not found" mid-install.
    # (The script's prerequisites section only checks for curl.)
    if ! command -v git &> /dev/null; then
        log_error "git is required but not installed. Install git and try again."
        exit 1
    fi

    # Create or use existing virtual environment
    if [ ! -d "$HOME/.browser-use-env" ]; then
        # Use discovered Python command (e.g., python3.11) or fall back to version spec
        if [ -n "$PYTHON_CMD" ]; then
            uv venv "$HOME/.browser-use-env" --python "$PYTHON_CMD"
        else
            uv venv "$HOME/.browser-use-env" --python 3.11
        fi
    fi

    # Activate venv and install
    activate_venv

    # Install from GitHub (main branch by default, or custom branch for testing)
    BROWSER_USE_BRANCH="${BROWSER_USE_BRANCH:-main}"
    BROWSER_USE_REPO="${BROWSER_USE_REPO:-browser-use/browser-use}"
    log_info "Installing from GitHub: $BROWSER_USE_REPO@$BROWSER_USE_BRANCH"

    # Clone and install the package without its declared dependencies,
    # then install only the minimal deps the CLI actually needs at runtime.
    # This avoids pulling ~50 packages (LLM clients, PDF tools, etc.) that
    # the CLI never imports.
    local tmp_dir=$(mktemp -d)
    git clone --depth 1 --branch "$BROWSER_USE_BRANCH" "https://github.com/$BROWSER_USE_REPO.git" "$tmp_dir"
    uv pip install "$tmp_dir" --no-deps

    # Install only the dependencies the CLI actually needs (~10 packages).
    # The list lives in requirements-cli.txt so it's discoverable and testable.
    # Transitive deps (e.g. websockets via cdp-use) are resolved automatically.
    log_info "Installing minimal CLI dependencies..."
    uv pip install -r "$tmp_dir/browser_use/skill_cli/requirements-cli.txt"
    rm -rf "$tmp_dir"

    log_success "browser-use CLI installed (lightweight)"
}
# Download Chromium via Playwright into the active virtualenv.
install_chromium() {
    log_info "Installing Chromium browser..."
    activate_venv

    # Build the command as an array instead of a string + eval: avoids
    # word-splitting/quoting pitfalls and accidental re-evaluation.
    # --with-deps only works on Linux (it fails on Windows/macOS).
    local cmd=(uvx playwright install chromium)
    if [ "$PLATFORM" = "linux" ]; then
        cmd+=(--with-deps)
    fi
    cmd+=(--no-shell)

    "${cmd[@]}"
    log_success "Chromium installed"
}
# Download the profile-use helper binary into ~/.browser-use/bin via the
# official install script (pinned to v1.0.2). Failure is non-fatal: the
# CLI re-downloads the binary on first use.
install_profile_use() {
    log_info "Installing profile-use..."
    mkdir -p "$HOME/.browser-use/bin"
    curl -fsSL https://browser-use.com/profile/cli/install.sh | PROFILE_USE_VERSION=v1.0.2 INSTALL_DIR="$HOME/.browser-use/bin" sh
    if [ -x "$HOME/.browser-use/bin/profile-use" ]; then
        log_success "profile-use installed"
    else
        log_warn "profile-use installation failed (will auto-download on first use)"
    fi
}
# =============================================================================
# PATH configuration
# =============================================================================
# Echo the rc file for the user's login shell. Uses $SHELL rather than
# BASH_VERSION/ZSH_VERSION: this installer typically runs via
# "curl ... | bash", where BASH_VERSION is always set regardless of the
# user's actual shell (same fix already applied in install.sh).
detect_shell_rc() {
    case "$(basename "$SHELL")" in
        zsh) echo "$HOME/.zshrc" ;;
        bash) echo "$HOME/.bashrc" ;;
        *) echo "$HOME/.profile" ;;
    esac
}

# Add the venv bin dir and ~/.local/bin to PATH in the user's shell rc.
configure_path() {
    local shell_rc=$(detect_shell_rc)
    local bin_path=$(get_venv_bin_dir)
    local local_bin="$HOME/.local/bin"

    # Check if already in PATH (browser-use-env matches both /bin and /Scripts)
    if grep -q "browser-use-env" "$shell_rc" 2>/dev/null; then
        log_info "PATH already configured in $shell_rc"
    else
        # Add to shell config (includes ~/.local/bin for tools)
        echo "" >> "$shell_rc"
        echo "# Browser-Use" >> "$shell_rc"
        echo "export PATH=\"$bin_path:$local_bin:\$PATH\"" >> "$shell_rc"
        log_success "Added to PATH in $shell_rc"
    fi

    # On Windows, also configure PowerShell profile
    if [ "$PLATFORM" = "windows" ]; then
        configure_powershell_path
    fi
}
# Persist the PATH additions on Windows by editing the user PATH in the
# registry through PowerShell.
configure_powershell_path() {
    # Use PowerShell to modify user PATH in registry (no execution policy needed)
    # This persists across sessions without requiring profile script execution
    local scripts_path='\\.browser-use-env\\Scripts'
    local local_bin='\\.local\\bin'

    # Check if already in user PATH
    local current_path=$(powershell.exe -Command "[Environment]::GetEnvironmentVariable('Path', 'User')" 2>/dev/null | tr -d '\r')
    if echo "$current_path" | grep -q "browser-use-env"; then
        log_info "PATH already configured"
        return 0
    fi

    # Append to user PATH via registry (safe, no truncation, no execution policy needed)
    powershell.exe -Command "[Environment]::SetEnvironmentVariable('Path', [Environment]::GetEnvironmentVariable('Path', 'User') + ';' + \$env:USERPROFILE + '$scripts_path;' + \$env:USERPROFILE + '$local_bin', 'User')" 2>/dev/null

    if [ $? -eq 0 ]; then
        log_success "Added to Windows PATH: %USERPROFILE%\\.browser-use-env\\Scripts"
    else
        log_warn "Could not update PATH automatically. Add manually:"
        log_warn " \$env:PATH += \";\$env:USERPROFILE\\.browser-use-env\\Scripts\""
    fi
}
# =============================================================================
# Validation
# =============================================================================
# Run "browser-use doctor" inside the venv to sanity-check the install.
# Returns non-zero when any doctor check fails (caller treats as non-fatal).
validate() {
    log_info "Validating installation..."
    activate_venv
    if browser-use doctor; then
        log_success "Installation validated successfully!"
        return 0
    else
        log_warn "Some checks failed. Run 'browser-use doctor' for details."
        return 1
    fi
}
# =============================================================================
# Print completion message
# =============================================================================
# Print the post-install summary and follow-up instructions.
print_next_steps() {
    # Detect shell for source command. Must match configure_path: use the
    # user's login shell ($SHELL), not the shell running this script, since
    # "curl | bash" always sets BASH_VERSION regardless of the user's shell.
    local shell_rc
    case "$(basename "$SHELL")" in
        zsh) shell_rc=".zshrc" ;;
        bash) shell_rc=".bashrc" ;;
        *) shell_rc=".profile" ;;
    esac
    echo ""
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo ""
    log_success "Browser-Use CLI installed successfully! (lightweight)"
    echo ""
    echo "Next steps:"
    if [ "$PLATFORM" = "windows" ]; then
        echo " 1. Restart PowerShell (PATH is now configured automatically)"
    else
        echo " 1. Restart your shell or run: source ~/$shell_rc"
    fi
    echo " 2. Try: browser-use open https://example.com"
    echo ""
    echo "To install the full library (Agent, LLMs, etc.):"
    echo " uv pip install browser-use"
    echo ""
    echo "Documentation: https://docs.browser-use.com"
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo ""
}
# =============================================================================
# Main installation flow
# =============================================================================
# Orchestrate the full install flow. Steps are ordered so each one only
# depends on what earlier steps provided (platform -> python -> uv ->
# package -> chromium -> profile-use -> config -> PATH -> validate).
main() {
    echo ""
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo " Browser-Use Lightweight CLI Installer"
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo ""

    # Parse command-line flags
    parse_args "$@"

    # Step 1: Detect platform
    detect_platform

    # Step 2: Check/install Python
    if ! check_python; then
        # In CI or non-interactive mode (no tty), auto-install Python
        if [ ! -t 0 ]; then
            log_info "Python 3.11+ not found. Installing automatically..."
            install_python
        else
            # Interactive: confirm before touching system packages.
            # Reads from /dev/tty because stdin is the piped script.
            read -p "Python 3.11+ not found. Install now? [y/N] " -n 1 -r < /dev/tty
            echo
            if [[ $REPLY =~ ^[Yy]$ ]]; then
                install_python
            else
                log_error "Python 3.11+ required. Exiting."
                exit 1
            fi
        fi
    fi

    # Step 3: Install uv
    install_uv

    # Step 4: Install browser-use package (minimal deps only)
    install_browser_use

    # Step 5: Install Chromium
    install_chromium

    # Step 6: Install profile-use
    install_profile_use

    # Step 6.5: Create config.json if it doesn't exist
    config_file="$HOME/.browser-use/config.json"
    if [ ! -f "$config_file" ]; then
        echo '{}' > "$config_file"
        # Restrict to owner read/write — the file may later hold an API key.
        chmod 600 "$config_file"
    fi

    # Step 7: Configure PATH
    configure_path

    # Step 8: Validate (non-fatal — warnings shouldn't block next-step instructions)
    validate || true

    # Step 9: Print next steps
    print_next_steps
}
# Run main function with all arguments
main "$@"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,104 @@
"""Profile-use Go binary management.
Downloads, locates, and invokes the profile-use Go binary as a managed
subcommand of `browser-use profile`. The binary is always managed at
~/.browser-use/bin/profile-use — standalone installs on $PATH are independent.
"""
import os
import shutil
import subprocess
import sys
from pathlib import Path
def get_profile_use_binary() -> Path | None:
	"""Return path to managed profile-use binary, or None if not installed."""
	from browser_use.skill_cli.utils import get_bin_dir

	if sys.platform == 'win32':
		binary_name = 'profile-use.exe'
	else:
		binary_name = 'profile-use'
	candidate = get_bin_dir() / binary_name
	# Must be a regular file AND executable by the current user.
	is_usable = candidate.is_file() and os.access(str(candidate), os.X_OK)
	return candidate if is_usable else None
def download_profile_use() -> Path:
	"""Download profile-use binary via the official install script.

	Runs: curl -fsSL https://browser-use.com/profile/cli/install.sh | sh
	with INSTALL_DIR set to ~/.browser-use/bin/

	Raises RuntimeError if download fails.
	"""
	from browser_use.skill_cli.utils import get_bin_dir

	# curl is a hard prerequisite for the piped install script.
	if not shutil.which('curl'):
		raise RuntimeError(
			'curl is required to download profile-use.\n'
			'Install curl and try again, or install profile-use manually:\n'
			' curl -fsSL https://browser-use.com/profile/cli/install.sh | sh'
		)

	bin_dir = get_bin_dir()
	# INSTALL_DIR tells the install script where to place the binary.
	env = {**os.environ, 'INSTALL_DIR': str(bin_dir)}
	# NOTE(review): runs via 'sh', which may be unavailable on native
	# Windows — confirm this path is only reached on POSIX-like hosts.
	result = subprocess.run(
		['sh', '-c', 'curl -fsSL https://browser-use.com/profile/cli/install.sh | sh'],
		env=env,
	)
	if result.returncode != 0:
		raise RuntimeError(
			'Failed to download profile-use. Try installing manually:\n curl -fsSL https://browser-use.com/profile/cli/install.sh | sh'
		)

	# Re-check through the standard lookup so executability is validated too.
	binary = get_profile_use_binary()
	if binary is None:
		raise RuntimeError('Download appeared to succeed but binary not found at expected location.')
	return binary
def ensure_profile_use() -> Path:
	"""Return path to profile-use binary, downloading if not present."""
	existing = get_profile_use_binary()
	if existing is None:
		# Progress note goes to stderr so stdout stays clean for callers.
		print('profile-use not found, downloading...', file=sys.stderr)
		existing = download_profile_use()
	return existing
def run_profile_use(args: list[str]) -> int:
	"""Execute profile-use with the given arguments.

	Handles the 'update' subcommand specially by re-running the install script.
	Passes BROWSER_USE_CONFIG_DIR so profile-use shares config with browser-use.

	Args:
		args: CLI arguments to forward to the profile-use binary.

	Returns:
		The subprocess exit code, or 1 when download/setup fails.
	"""
	# Handle 'update' subcommand — re-download latest binary
	if args and args[0] == 'update':
		try:
			download_profile_use()
			print('profile-use updated successfully')
			return 0
		except RuntimeError as e:
			print(f'Error: {e}', file=sys.stderr)
			return 1

	try:
		binary = ensure_profile_use()
	except RuntimeError as e:
		print(f'Error: {e}', file=sys.stderr)
		return 1

	from browser_use.skill_cli.utils import get_home_dir

	# Share the browser-use config directory with the Go binary.
	env = {**os.environ, 'BROWSER_USE_CONFIG_DIR': str(get_home_dir())}

	# Forward API key from config.json for profile-use binary
	from browser_use.skill_cli.config import get_config_value

	api_key = get_config_value('api_key')
	if api_key:
		env['BROWSER_USE_API_KEY'] = str(api_key)

	return subprocess.call([str(binary)] + args, env=env)

View File

@@ -1,54 +0,0 @@
"""Wire protocol for CLI↔Server communication.
Uses JSON over Unix sockets (or TCP on Windows) with newline-delimited messages.
"""
import json
from dataclasses import asdict, dataclass, field
from typing import Any
@dataclass
class Request:
	"""Command request from CLI to server.

	Wire format: one JSON object per message with keys id/action/session/params.
	"""

	id: str
	action: str
	session: str
	params: dict[str, Any] = field(default_factory=dict)

	def to_json(self) -> str:
		"""Serialize to a compact JSON string."""
		return json.dumps({'id': self.id, 'action': self.action, 'session': self.session, 'params': self.params})

	@classmethod
	def from_json(cls, data: str) -> 'Request':
		"""Parse a JSON string; a missing 'params' key defaults to {}."""
		payload = json.loads(data)
		return cls(
			id=payload['id'],
			action=payload['action'],
			session=payload['session'],
			params=payload.get('params', {}),
		)
@dataclass
class Response:
"""Response from server to CLI."""
id: str
success: bool
data: Any = None
error: str | None = None
def to_json(self) -> str:
return json.dumps(asdict(self))
@classmethod
def from_json(cls, data: str) -> 'Response':
d = json.loads(data)
return cls(
id=d['id'],
success=d['success'],
data=d.get('data'),
error=d.get('error'),
)

View File

@@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, Literal
if TYPE_CHECKING:
from browser_use.browser.session import BrowserSession
from browser_use.skill_cli.actions import ActionHandler
@dataclass
@@ -48,7 +49,11 @@ class PythonSession:
)
def execute(
self, code: str, browser_session: 'BrowserSession', loop: asyncio.AbstractEventLoop | None = None
self,
code: str,
browser_session: 'BrowserSession',
loop: asyncio.AbstractEventLoop | None = None,
actions: 'ActionHandler | None' = None,
) -> ExecutionResult:
"""Execute code in persistent namespace.
@@ -59,10 +64,11 @@ class PythonSession:
code: Python code to execute
browser_session: The browser session for browser operations
loop: The event loop for async operations (required for browser access)
actions: Optional ActionHandler for direct execution (no event bus)
"""
# Inject browser wrapper with the event loop for async operations
if loop is not None:
self.namespace['browser'] = BrowserWrapper(browser_session, loop)
if loop is not None and actions is not None:
self.namespace['browser'] = BrowserWrapper(browser_session, loop, actions)
self.execution_count += 1
stdout = io.StringIO()
@@ -115,9 +121,10 @@ class BrowserWrapper:
Runs coroutines on the server's event loop using run_coroutine_threadsafe.
"""
def __init__(self, session: 'BrowserSession', loop: asyncio.AbstractEventLoop) -> None:
def __init__(self, session: 'BrowserSession', loop: asyncio.AbstractEventLoop, actions: 'ActionHandler') -> None:
self._session = session
self._loop = loop
self._actions = actions
def _run(self, coro: Any) -> Any:
"""Run coroutine on the server's event loop."""
@@ -147,21 +154,17 @@ class BrowserWrapper:
self._run(self._goto_async(url))
async def _goto_async(self, url: str) -> None:
from browser_use.browser.events import NavigateToUrlEvent
await self._session.event_bus.dispatch(NavigateToUrlEvent(url=url))
await self._actions.navigate(url)
def click(self, index: int) -> None:
"""Click element by index."""
self._run(self._click_async(index))
async def _click_async(self, index: int) -> None:
from browser_use.browser.events import ClickElementEvent
node = await self._session.get_element_by_index(index)
if node is None:
raise ValueError(f'Element index {index} not found')
await self._session.event_bus.dispatch(ClickElementEvent(node=node))
await self._actions.click_element(node)
def type(self, text: str) -> None:
"""Type text into focused element."""
@@ -181,22 +184,44 @@ class BrowserWrapper:
self._run(self._input_async(index, text))
async def _input_async(self, index: int, text: str) -> None:
from browser_use.browser.events import ClickElementEvent, TypeTextEvent
node = await self._session.get_element_by_index(index)
if node is None:
raise ValueError(f'Element index {index} not found')
await self._actions.click_element(node)
await self._actions.type_text(node, text)
def upload(self, index: int, path: str) -> None:
"""Upload a file to a file input element."""
self._run(self._upload_async(index, path))
async def _upload_async(self, index: int, path: str) -> None:
from pathlib import Path as P
file_path = str(P(path).expanduser().resolve())
p = P(file_path)
if not p.exists():
raise FileNotFoundError(f'File not found: {file_path}')
if not p.is_file():
raise ValueError(f'Not a file: {file_path}')
if p.stat().st_size == 0:
raise ValueError(f'File is empty (0 bytes): {file_path}')
node = await self._session.get_element_by_index(index)
if node is None:
raise ValueError(f'Element index {index} not found')
await self._session.event_bus.dispatch(ClickElementEvent(node=node))
await self._session.event_bus.dispatch(TypeTextEvent(node=node, text=text))
file_input_node = self._session.find_file_input_near_element(node)
if file_input_node is None:
raise ValueError(f'Element {index} is not a file input and no file input found nearby')
await self._actions.upload_file(file_input_node, file_path)
def scroll(self, direction: Literal['up', 'down', 'left', 'right'] = 'down', amount: int = 500) -> None:
"""Scroll the page."""
self._run(self._scroll_async(direction, amount))
async def _scroll_async(self, direction: Literal['up', 'down', 'left', 'right'], amount: int) -> None:
from browser_use.browser.events import ScrollEvent
await self._session.event_bus.dispatch(ScrollEvent(direction=direction, amount=amount))
await self._actions.scroll(direction, amount)
def screenshot(self, path: str | None = None) -> bytes:
"""Take screenshot, optionally save to file."""
@@ -233,18 +258,14 @@ class BrowserWrapper:
self._run(self._keys_async(keys))
async def _keys_async(self, keys: str) -> None:
from browser_use.browser.events import SendKeysEvent
await self._session.event_bus.dispatch(SendKeysEvent(keys=keys))
await self._actions.send_keys(keys)
def back(self) -> None:
"""Go back in history."""
self._run(self._back_async())
async def _back_async(self) -> None:
from browser_use.browser.events import GoBackEvent
await self._session.event_bus.dispatch(GoBackEvent())
await self._actions.go_back()
def wait(self, seconds: float) -> None:
"""Wait for specified seconds."""

View File

@@ -0,0 +1,12 @@
# Minimal dependencies for the browser-use CLI.
# Used by install_lite.sh — update this file if the CLI's import chain changes.
aiohttp==3.13.4
bubus==1.5.6
cdp-use==1.4.5
httpx==0.28.1
psutil==7.2.2
pydantic==2.12.5
pydantic-settings==2.12.0
python-dotenv==1.2.1
typing-extensions==4.15.0
uuid7==0.1.0

Some files were not shown because too many files have changed in this diff Show More