diff --git a/.github/workflows/cloud_evals.yml b/.github/workflows/cloud_evals.yml index 33d5f75c3..9dd97f482 100644 --- a/.github/workflows/cloud_evals.yml +++ b/.github/workflows/cloud_evals.yml @@ -16,6 +16,8 @@ on: description: Commit hash of the library to build the Cloud eval image for required: false +permissions: {} + jobs: trigger_cloud_eval_image_build: runs-on: ubuntu-latest diff --git a/.github/workflows/install-script.yml b/.github/workflows/install-script.yml index ccc3316fa..3ab8ed047 100644 --- a/.github/workflows/install-script.yml +++ b/.github/workflows/install-script.yml @@ -13,6 +13,9 @@ on: - '.github/workflows/install-script.yml' workflow_dispatch: +permissions: + contents: read + # Cancel in-progress runs when a new commit is pushed concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -26,16 +29,15 @@ env: jobs: # =========================================================================== - # Test install.sh with different modes on all platforms + # Test install.sh on all platforms # =========================================================================== test-install-sh-linux: - name: install.sh ${{ matrix.mode }} (Linux ${{ matrix.os }}) + name: install.sh (Linux ${{ matrix.os }}) strategy: fail-fast: false matrix: os: [ubuntu-latest, ubuntu-22.04] - mode: [--remote-only, --local-only, --full] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 @@ -45,8 +47,8 @@ jobs: with: python-version: '3.11' - - name: Run install.sh ${{ matrix.mode }} - run: bash browser_use/skill_cli/install.sh ${{ matrix.mode }} + - name: Run install.sh + run: bash browser_use/skill_cli/install.sh - name: Add to PATH run: | @@ -58,65 +60,31 @@ jobs: source ~/.browser-use-env/bin/activate browser-use --help - - name: Verify install-config.json - run: | - cat ~/.browser-use/install-config.json - # Verify expected modes based on install flag - if [[ "${{ matrix.mode }}" == "--remote-only" ]]; then - grep -q 
'"remote"' ~/.browser-use/install-config.json - grep -q '"default_mode": "remote"' ~/.browser-use/install-config.json - elif [[ "${{ matrix.mode }}" == "--local-only" ]]; then - grep -q '"chromium"' ~/.browser-use/install-config.json - grep -q '"default_mode": "chromium"' ~/.browser-use/install-config.json - elif [[ "${{ matrix.mode }}" == "--full" ]]; then - grep -q '"chromium"' ~/.browser-use/install-config.json - grep -q '"remote"' ~/.browser-use/install-config.json - fi - - - name: Verify Chromium installed (local/full only) - if: matrix.mode != '--remote-only' + - name: Verify Chromium installed run: | source ~/.browser-use-env/bin/activate - # Check playwright browsers are installed - uvx playwright install --dry-run chromium 2>&1 | grep -i "chromium" || true # Verify chromium binary exists in playwright cache ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chrome 2>/dev/null || \ ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \ echo "Chromium binary check completed" - - name: Verify cloudflared installed (remote/full only) - if: matrix.mode != '--local-only' - run: | - which cloudflared || ls ~/.local/bin/cloudflared - cloudflared --version - - - name: Verify cloudflared NOT installed (local-only) - if: matrix.mode == '--local-only' - run: | - if command -v cloudflared &> /dev/null; then - echo "ERROR: cloudflared should not be installed in local-only mode" - exit 1 - fi - echo "Confirmed: cloudflared not installed (expected for local-only)" - - name: Run browser-use doctor run: | source ~/.browser-use-env/bin/activate browser-use doctor test-install-sh-macos: - name: install.sh ${{ matrix.mode }} (macOS ${{ matrix.os }}) + name: install.sh (macOS ${{ matrix.os }}) strategy: fail-fast: false matrix: os: [macos-latest, macos-14] - mode: [--remote-only, --local-only, --full] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - - name: Run install.sh ${{ matrix.mode }} - run: bash browser_use/skill_cli/install.sh ${{ 
matrix.mode }} + - name: Run install.sh + run: bash browser_use/skill_cli/install.sh - name: Add to PATH run: | @@ -128,22 +96,7 @@ jobs: source ~/.browser-use-env/bin/activate browser-use --help - - name: Verify install-config.json - run: | - cat ~/.browser-use/install-config.json - if [[ "${{ matrix.mode }}" == "--remote-only" ]]; then - grep -q '"remote"' ~/.browser-use/install-config.json - grep -q '"default_mode": "remote"' ~/.browser-use/install-config.json - elif [[ "${{ matrix.mode }}" == "--local-only" ]]; then - grep -q '"chromium"' ~/.browser-use/install-config.json - grep -q '"default_mode": "chromium"' ~/.browser-use/install-config.json - elif [[ "${{ matrix.mode }}" == "--full" ]]; then - grep -q '"chromium"' ~/.browser-use/install-config.json - grep -q '"remote"' ~/.browser-use/install-config.json - fi - - - name: Verify Chromium installed (local/full only) - if: matrix.mode != '--remote-only' + - name: Verify Chromium installed run: | source ~/.browser-use-env/bin/activate # Check playwright cache for chromium @@ -151,32 +104,13 @@ jobs: ls ~/Library/Caches/ms-playwright/chromium-*/Chromium.app 2>/dev/null || \ echo "Chromium binary check completed" - - name: Verify cloudflared installed (remote/full only) - if: matrix.mode != '--local-only' - run: | - which cloudflared || ls ~/.local/bin/cloudflared - cloudflared --version - - - name: Verify cloudflared NOT installed (local-only) - if: matrix.mode == '--local-only' - run: | - if command -v cloudflared &> /dev/null; then - echo "ERROR: cloudflared should not be installed in local-only mode" - exit 1 - fi - echo "Confirmed: cloudflared not installed (expected for local-only)" - - name: Run browser-use doctor run: | source ~/.browser-use-env/bin/activate browser-use doctor test-install-sh-windows: - name: install.sh ${{ matrix.mode }} (Windows) - strategy: - fail-fast: false - matrix: - mode: [--remote-only, --local-only, --full] + name: install.sh (Windows) runs-on: windows-latest defaults: run: @@ 
-192,8 +126,8 @@ jobs: with: python-version: '3.11' - - name: Run install.sh ${{ matrix.mode }} - run: bash browser_use/skill_cli/install.sh ${{ matrix.mode }} + - name: Run install.sh + run: bash browser_use/skill_cli/install.sh - name: Add to PATH run: | @@ -205,18 +139,6 @@ jobs: source ~/.browser-use-env/Scripts/activate browser-use --help - - name: Verify install-config.json - run: | - cat ~/.browser-use/install-config.json - if [[ "${{ matrix.mode }}" == "--remote-only" ]]; then - grep -q '"remote"' ~/.browser-use/install-config.json - elif [[ "${{ matrix.mode }}" == "--local-only" ]]; then - grep -q '"chromium"' ~/.browser-use/install-config.json - elif [[ "${{ matrix.mode }}" == "--full" ]]; then - grep -q '"chromium"' ~/.browser-use/install-config.json - grep -q '"remote"' ~/.browser-use/install-config.json - fi - - name: Run browser-use doctor run: | source ~/.browser-use-env/Scripts/activate @@ -245,7 +167,7 @@ jobs: # Install from current branch uv pip install . - - name: Run browser-use install (installs Chromium only, not cloudflared) + - name: Run browser-use install (installs Chromium) run: | source .venv/bin/activate browser-use install @@ -262,9 +184,6 @@ jobs: ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \ echo "Chromium check completed" - # Note: browser-use install only installs Chromium, not cloudflared - # Users should install cloudflared separately if needed for tunneling - - name: Run browser-use doctor run: | source .venv/bin/activate @@ -295,7 +214,6 @@ jobs: - name: Test uvx with local wheel run: | - # Install the wheel we just built WHEEL=$(ls dist/*.whl) uvx --from "$WHEEL" browser-use --help @@ -310,8 +228,6 @@ jobs: ls ~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \ echo "Chromium check completed" - # Note: browser-use install only installs Chromium, not cloudflared - - name: Test uvx browser-use doctor run: | WHEEL=$(ls dist/*.whl) @@ -345,7 +261,5 @@ jobs: ls 
~/.cache/ms-playwright/chromium-*/chrome-linux/chromium 2>/dev/null || \ echo "Chromium check completed" - # Note: browser-use install only installs Chromium, not cloudflared - - name: Test uvx browser-use doctor run: uvx "browser-use[cli]" doctor diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c40046dee..af70548a8 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -16,6 +16,9 @@ on: pull_request: workflow_dispatch: +permissions: + contents: read + jobs: lint-syntax: name: syntax-errors @@ -35,7 +38,8 @@ jobs: - uses: astral-sh/setup-uv@v5 with: enable-cache: true - - run: uv sync --dev --all-extras # install extras for examples to avoid pyright missing imports errors + - run: uv python install 3.11 + - run: uv sync --dev --all-extras --python 3.11 - run: uv run --no-sync pre-commit run --all-files --show-diff-on-failure lint-typecheck: diff --git a/.github/workflows/package.yaml b/.github/workflows/package.yaml index 981d783f9..cd9eb91af 100644 --- a/.github/workflows/package.yaml +++ b/.github/workflows/package.yaml @@ -15,6 +15,9 @@ on: - '*' workflow_dispatch: +permissions: + contents: read + jobs: build: name: pip-build diff --git a/.github/workflows/stale-bot.yml b/.github/workflows/stale-bot.yml index 779080e0e..ac943c73b 100644 --- a/.github/workflows/stale-bot.yml +++ b/.github/workflows/stale-bot.yml @@ -12,7 +12,7 @@ jobs: stale: runs-on: ubuntu-latest steps: - - uses: actions/stale@v9 + - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9 with: # General settings repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d3bb348bc..597c4344d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,6 @@ +default_language_version: + python: python3.11 + repos: - repo: https://github.com/asottile/yesqa rev: v1.5.0 diff --git a/AGENTS.md b/AGENTS.md index 4e370fa53..1d71f5d2e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ 
-36,7 +36,7 @@ uv sync To get started with Browser Use you need to install the package and create an `.env` file with your API key. - `ChatBrowserUse` offers the [fastest and most cost-effective models](https://browser-use.com/posts/speed-matters/), completing tasks 3-5x faster. Get started with \$10 of [free LLM credits](https://cloud.browser-use.com/new-api-key). + `ChatBrowserUse` offers the [fastest and most cost-effective models](https://browser-use.com/posts/speed-matters/), completing tasks 3-5x faster. Get your API key at [cloud.browser-use.com](https://cloud.browser-use.com/new-api-key). ## 1. Installing Browser-Use @@ -61,7 +61,7 @@ uvx browser-use install Create a `.env` file and add your API key. - We recommend using ChatBrowserUse which is optimized for browser automation tasks (highest accuracy + fastest speed + lowest token cost). Don't have one? We give you **\$10** to try it out [here](https://cloud.browser-use.com/new-api-key). + We recommend using ChatBrowserUse which is optimized for browser automation tasks (highest accuracy + fastest speed + lowest token cost). Get your API key [here](https://cloud.browser-use.com/new-api-key). ```bash .env theme={null} @@ -76,7 +76,7 @@ Then add your API key to the file. ```bash Browser Use theme={null} # add your key to .env file BROWSER_USE_API_KEY= - # Get 10$ of free credits at https://cloud.browser-use.com/new-api-key + # Get your API key at https://cloud.browser-use.com/new-api-key ``` ```bash Google theme={null} @@ -256,7 +256,7 @@ Your cloud browser is already logged in! *** -For more sandbox parameters and events, see [Sandbox Quickstart](https://docs.browser-use.com/customize/sandbox/quickstart). +For more sandbox parameters and events, see [Sandbox Quickstart](https://docs.browser-use.com/legacy/sandbox/quickstart). 
# Agent Basics ```python theme={null} @@ -538,7 +538,7 @@ async def main(): > Complete reference for all browser configuration options - The `Browser` instance also provides all [Actor](https://docs.browser-use.com/customize/actor/all-parameters) methods for direct browser control (page management, element interactions, etc.). + The `Browser` instance also provides all [Actor](https://docs.browser-use.com/legacy/actor/all-parameters) methods for direct browser control (page management, element interactions, etc.). ## Core Settings @@ -798,7 +798,7 @@ The agent injects parameters by name matching, so using the wrong name will caus - Use `browser_session` parameter in tools for deterministic [Actor](https://docs.browser-use.com/customize/actor/basics) actions. + Use `browser_session` parameter in tools for deterministic [Actor](https://docs.browser-use.com/legacy/actor/basics) actions. diff --git a/CLOUD.md b/CLOUD.md index e4e3dac81..9e8ee0487 100644 --- a/CLOUD.md +++ b/CLOUD.md @@ -19,7 +19,7 @@ The key product of Browser Use Cloud is the completion of user tasks. - Profile Sync is the best way to handle authentication for tasks. This feature allows users to upload their local browser cookies (where the user is already logged into the services they need authentication for) to a Browser Profile that can be used for tasks on the cloud. To initiate a Profile Sync, a user must run `export BROWSER_USE_API_KEY= && curl -fsSL https://browser-use.com/profile.sh | sh` and follow the steps in the interactive terminal. ## Quickstart -To get started, direct the user to first must create an account, purchase credits (or simply claim the free starter credits given on account creation), and generate an API key on the Browser Use online platform: https://cloud.browser-use.com/. These are the only steps that can only be done on the platform. 
+To get started, direct the user to first create an account, purchase credits (or simply claim the five free tasks given on account creation), and generate an API key on the Browser Use online platform: https://cloud.browser-use.com/. These are the only steps that can only be done on the platform. Avoid giving the user all of the following steps at once as it may seem overwhelming. Instead present one step at a time and only continue when asked. Do as much for the user as you are able to. diff --git a/README.md b/README.md index 0c64a69be..0b2da4b6f 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@
-Browser-Use Package Download Statistics +Browser-Use Package Download Statistics
--- @@ -33,12 +33,12 @@ Discord -Browser-Use Cloud +Browser-Use Cloud
-🌤️ Want to skip the setup? Use our [cloud](https://cloud.browser-use.com) for faster, scalable, stealth-enabled browser automation! +🌤️ Want to skip the setup? Use our [cloud](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-skip-setup) for faster, scalable, stealth-enabled browser automation! # 🤖 LLM Quickstart @@ -49,77 +49,99 @@ # 👋 Human Quickstart -**1. Create environment with [uv](https://docs.astral.sh/uv/) (Python>=3.11):** +**1. Create environment and install Browser-Use with [uv](https://docs.astral.sh/uv/) (Python>=3.11):** ```bash -uv init +uv init && uv add browser-use && uv sync +# uvx browser-use install # Run if you don't have Chromium installed ``` -**2. Install Browser-Use package:** -```bash -# We ship every day - use the latest version! -uv add browser-use -uv sync -``` - -**3. Get your API key from [Browser Use Cloud](https://cloud.browser-use.com/new-api-key) and add it to your `.env` file (new signups get $10 free credits):** +**2. [Optional] Get your API key from [Browser Use Cloud](https://cloud.browser-use.com/new-api-key?utm_source=github&utm_medium=readme-quickstart-api-key):** ``` # .env BROWSER_USE_API_KEY=your-key +# GOOGLE_API_KEY=your-key +# ANTHROPIC_API_KEY=your-key ``` -**4. Install Chromium browser:** -```bash -uvx browser-use install -``` - -**5. Run your first agent:** +**3. 
Run your first agent:** ```python from browser_use import Agent, Browser, ChatBrowserUse +# from browser_use import ChatGoogle # ChatGoogle(model='gemini-3-flash-preview') +# from browser_use import ChatAnthropic # ChatAnthropic(model='claude-sonnet-4-6') import asyncio -async def example(): +async def main(): browser = Browser( - # use_cloud=True, # Uncomment to use a stealth browser on Browser Use Cloud + # use_cloud=True, # Use a stealth browser on Browser Use Cloud ) - llm = ChatBrowserUse() - agent = Agent( task="Find the number of stars of the browser-use repo", - llm=llm, + llm=ChatBrowserUse(), + # llm=ChatGoogle(model='gemini-3-flash-preview'), + # llm=ChatAnthropic(model='claude-sonnet-4-6'), browser=browser, ) - - history = await agent.run() - return history + await agent.run() if __name__ == "__main__": - history = asyncio.run(example()) + asyncio.run(main()) ``` -Check out the [library docs](https://docs.browser-use.com) and the [cloud docs](https://docs.cloud.browser-use.com) for more! +Check out the [library docs](https://docs.browser-use.com/open-source/introduction) and the [cloud docs](https://docs.cloud.browser-use.com?utm_source=github&utm_medium=readme-cloud-docs) for more!
-# 🔥 Deploy on Sandboxes +# Open Source vs Cloud -We handle agents, browsers, persistence, auth, cookies, and LLMs. The agent runs right next to the browser for minimal latency. + + + + BU Bench V1 - LLM Success Rates + -```python -from browser_use import Browser, sandbox, ChatBrowserUse -from browser_use.agent.service import Agent -import asyncio +We benchmark Browser Use across 100 real-world browser tasks. Full benchmark is open source: **[browser-use/benchmark](https://github.com/browser-use/benchmark)**. -@sandbox() -async def my_task(browser: Browser): - agent = Agent(task="Find the top HN post", browser=browser, llm=ChatBrowserUse()) - await agent.run() +**Use the Open-Source Agent** +- You need [custom tools](https://docs.browser-use.com/customize/tools/basics) or deep code-level integration +- We recommend pairing with our [cloud browsers](https://docs.browser-use.com/open-source/customize/browser/remote) for leading stealth, proxy rotation, and scaling +- Or self-host the open-source agent fully on your own machines -# Just call it like any async function -asyncio.run(my_task()) -``` +**Use the [Fully-Hosted Cloud Agent](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-hosted-agent) (recommended)** +- Much more powerful agent for complex tasks (see plot above) +- Easiest way to start and scale +- Best stealth with proxy rotation and captcha solving +- 1000+ integrations (Gmail, Slack, Notion, and more) +- Persistent filesystem and memory -See [Going to Production](https://docs.browser-use.com/production) for more details. +
+ +# Demos + + +### 📋 Form-Filling +#### Task = "Fill in this job application with my resume and information." +![Job Application Demo](https://github.com/user-attachments/assets/57865ee6-6004-49d5-b2c2-6dff39ec2ba9) +[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/apply_to_job.py) + + +### 🍎 Grocery-Shopping +#### Task = "Put this list of items into my instacart." + +https://github.com/user-attachments/assets/a6813fa7-4a7c-40a6-b4aa-382bf88b1850 + +[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/buy_groceries.py) + + +### 💻 Personal-Assistant. +#### Task = "Help me find parts for a custom PC." + +https://github.com/user-attachments/assets/ac34f75c-057a-43ef-ad06-5b2c9d42bf06 + +[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/pcpartpicker.py) + + +### 💡See [more examples here ↗](https://docs.browser-use.com/examples) and give us a star!
@@ -170,35 +192,6 @@ curl -o ~/.claude/skills/browser-use/SKILL.md \
-# Demos - - -### 📋 Form-Filling -#### Task = "Fill in this job application with my resume and information." -![Job Application Demo](https://github.com/user-attachments/assets/57865ee6-6004-49d5-b2c2-6dff39ec2ba9) -[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/apply_to_job.py) - - -### 🍎 Grocery-Shopping -#### Task = "Put this list of items into my instacart." - -https://github.com/user-attachments/assets/a6813fa7-4a7c-40a6-b4aa-382bf88b1850 - -[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/buy_groceries.py) - - -### 💻 Personal-Assistant. -#### Task = "Help me find parts for a custom PC." - -https://github.com/user-attachments/assets/ac34f75c-057a-43ef-ad06-5b2c9d42bf06 - -[Example code ↗](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/pcpartpicker.py) - - -### 💡See [more examples here ↗](https://docs.browser-use.com/examples) and give us a star! - -
- ## Integrations, hosting, custom tools, MCP, and more on our [Docs ↗](https://docs.browser-use.com)
@@ -218,6 +211,15 @@ We optimized **ChatBrowserUse()** specifically for browser automation tasks. On For other LLM providers, see our [supported models documentation](https://docs.browser-use.com/supported-models). +
+Should I use the Browser Use system prompt with the open-source preview model? + +Yes. If you use `ChatBrowserUse(model='browser-use/bu-30b-a3b-preview')` with a normal `Agent(...)`, Browser Use still sends its default agent system prompt for you. + +You do **not** need to add a separate custom "Browser Use system message" just because you switched to the open-source preview model. Only use `extend_system_message` or `override_system_message` when you intentionally want to customize the default behavior for your task. + +If you want the best default speed/accuracy, we still recommend the newer hosted `bu-*` models. If you want the open-source preview model, the setup stays the same apart from the `model=` value. +
Can I use custom tools with the agent? @@ -249,6 +251,12 @@ agent = Agent( Yes! Browser-Use is open source and free to use. You only need to choose an LLM provider (like OpenAI, Google, ChatBrowserUse, or run local models with Ollama).
+
+Terms of Service + +This open-source library is licensed under the MIT License. For Browser Use services & data policy, see our [Terms of Service](https://browser-use.com/legal/terms-of-service) and [Privacy Policy](https://browser-use.com/privacy/). +
+
How do I handle authentication? @@ -263,7 +271,7 @@ These examples show how to maintain sessions and handle authentication seamlessl
How do I solve CAPTCHAs? -For CAPTCHA handling, you need better browser fingerprinting and proxies. Use [Browser Use Cloud](https://cloud.browser-use.com) which provides stealth browsers designed to avoid detection and CAPTCHA challenges. +For CAPTCHA handling, you need better browser fingerprinting and proxies. Use [Browser Use Cloud](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-faq-captcha) which provides stealth browsers designed to avoid detection and CAPTCHA challenges.
@@ -271,7 +279,7 @@ For CAPTCHA handling, you need better browser fingerprinting and proxies. Use [B Chrome can consume a lot of memory, and running many agents in parallel can be tricky to manage. -For production use cases, use our [Browser Use Cloud API](https://cloud.browser-use.com) which handles: +For production use cases, use our [Browser Use Cloud API](https://cloud.browser-use.com?utm_source=github&utm_medium=readme-faq-production) which handles: - Scalable browser infrastructure - Memory management - Proxy rotation diff --git a/browser_use/__init__.py b/browser_use/__init__.py index d275a4f16..946ceba12 100644 --- a/browser_use/__init__.py +++ b/browser_use/__init__.py @@ -52,7 +52,6 @@ if TYPE_CHECKING: from browser_use.agent.views import ActionModel, ActionResult, AgentHistoryList from browser_use.browser import BrowserProfile, BrowserSession from browser_use.browser import BrowserSession as Browser - from browser_use.code_use.service import CodeAgent from browser_use.dom.service import DomService from browser_use.llm import models from browser_use.llm.anthropic.chat import ChatAnthropic @@ -60,6 +59,7 @@ if TYPE_CHECKING: from browser_use.llm.browser_use.chat import ChatBrowserUse from browser_use.llm.google.chat import ChatGoogle from browser_use.llm.groq.chat import ChatGroq + from browser_use.llm.litellm.chat import ChatLiteLLM from browser_use.llm.mistral.chat import ChatMistral from browser_use.llm.oci_raw.chat import ChatOCIRaw from browser_use.llm.ollama.chat import ChatOllama @@ -72,8 +72,6 @@ if TYPE_CHECKING: _LAZY_IMPORTS = { # Agent service (heavy due to dependencies) # 'Agent': ('browser_use.agent.service', 'Agent'), - # Code-use agent (Jupyter notebook-like execution) - 'CodeAgent': ('browser_use.code_use.service', 'CodeAgent'), 'Agent': ('browser_use.agent.service', 'Agent'), # System prompt (moderate weight due to agent.views imports) 'SystemPrompt': ('browser_use.agent.prompts', 'SystemPrompt'), @@ -95,6 +93,7 @@ _LAZY_IMPORTS = { 
'ChatAnthropic': ('browser_use.llm.anthropic.chat', 'ChatAnthropic'), 'ChatBrowserUse': ('browser_use.llm.browser_use.chat', 'ChatBrowserUse'), 'ChatGroq': ('browser_use.llm.groq.chat', 'ChatGroq'), + 'ChatLiteLLM': ('browser_use.llm.litellm.chat', 'ChatLiteLLM'), 'ChatMistral': ('browser_use.llm.mistral.chat', 'ChatMistral'), 'ChatAzureOpenAI': ('browser_use.llm.azure.chat', 'ChatAzureOpenAI'), 'ChatOCIRaw': ('browser_use.llm.oci_raw.chat', 'ChatOCIRaw'), @@ -131,8 +130,6 @@ def __getattr__(name: str): __all__ = [ 'Agent', - 'CodeAgent', - # 'CodeAgent', 'BrowserSession', 'Browser', # Alias for BrowserSession 'BrowserProfile', @@ -148,6 +145,7 @@ __all__ = [ 'ChatAnthropic', 'ChatBrowserUse', 'ChatGroq', + 'ChatLiteLLM', 'ChatMistral', 'ChatAzureOpenAI', 'ChatOCIRaw', diff --git a/browser_use/agent/cloud_events.py b/browser_use/agent/cloud_events.py index ed7b3c4b3..43142f8b1 100644 --- a/browser_use/agent/cloud_events.py +++ b/browser_use/agent/cloud_events.py @@ -8,7 +8,7 @@ from bubus import BaseEvent from pydantic import Field, field_validator from uuid_extensions import uuid7str -MAX_STRING_LENGTH = 100000 # 100K chars ~ 25k tokens should be enough +MAX_STRING_LENGTH = 500000 # 500K chars ~ 125k tokens should be enough MAX_URL_LENGTH = 100000 MAX_TASK_LENGTH = 100000 MAX_COMMENT_LENGTH = 2000 @@ -38,6 +38,8 @@ class UpdateAgentTaskEvent(BaseEvent): raise ValueError('Agent must have _task_start_time attribute') done_output = agent.history.final_result() if agent.history else None + if done_output and len(done_output) > MAX_STRING_LENGTH: + done_output = done_output[:MAX_STRING_LENGTH] return cls( id=str(agent.task_id), user_id='', # To be filled by cloud handler diff --git a/browser_use/agent/gif.py b/browser_use/agent/gif.py index 6bbf0b86f..eaf5b091c 100644 --- a/browser_use/agent/gif.py +++ b/browser_use/agent/gif.py @@ -108,7 +108,6 @@ def create_history_gif( font_name = os.path.join(CONFIG.WIN_FONT_DIR, font_name + '.ttf') regular_font = 
ImageFont.truetype(font_name, font_size) title_font = ImageFont.truetype(font_name, title_font_size) - goal_font = ImageFont.truetype(font_name, goal_font_size) font_loaded = True break except OSError: @@ -121,8 +120,6 @@ def create_history_gif( regular_font = ImageFont.load_default() title_font = ImageFont.load_default() - goal_font = regular_font - # Load logo if requested logo = None if show_logo: @@ -236,8 +233,6 @@ def _create_task_frame( # Start with base font size (regular + 16) base_font_size = regular_font.size + 16 min_font_size = max(regular_font.size - 10, 16) # Don't go below 16pt - max_font_size = base_font_size # Cap at the base font size - # Calculate dynamic font size based on text length and complexity # Longer texts get progressively smaller fonts text_length = len(task) diff --git a/browser_use/agent/judge.py b/browser_use/agent/judge.py index 3d840ef44..d17232721 100644 --- a/browser_use/agent/judge.py +++ b/browser_use/agent/judge.py @@ -88,6 +88,8 @@ def construct_judge_messages( ) ) + current_date = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC') + # System prompt for judge - conditionally add ground truth section ground_truth_section = '' if ground_truth: @@ -168,7 +170,7 @@ Set `reached_captcha` to true if: - **evaluate for action** - For each key step of the trace, double check whether the action that the agent tried to performed actually happened. If the required action did not actually occur, the verdict should be false. - **screenshot is not entire content** - The agent has the entire DOM content, but the screenshot is only part of the content. If the agent extracts information from the page, but you do not see it in the screenshot, you can assume this information is there. - **Penalize poor tool usage** - Wrong tools, inefficient approaches, ignoring available information. 
-- **ignore unexpected dates and times** - These agent traces are from varying dates, you can assume the dates the agent uses for search or filtering are correct. +- **current date/time is {current_date}** - content with recent dates is real, not fabricated. - **IMPORTANT**: be very picky about the user's request - Have very high standard for the agent completing the task exactly to the user's request. - **IMPORTANT**: be initially doubtful of the agent's self reported success, be sure to verify that its methods are valid and fulfill the user's desires to a tee. @@ -221,54 +223,3 @@ Evaluate this agent execution given the criteria and respond with the exact JSON SystemMessage(content=system_prompt), UserMessage(content=content_parts), ] - - -def construct_simple_judge_messages( - task: str, - final_result: str, -) -> list[BaseMessage]: - """Construct lightweight judge messages to validate agent success claims. - - Always runs regardless of use_judge setting. Text-only — no screenshots, - no trajectory. Just task + final result. - """ - task_truncated = _truncate_text(task, 20000) - final_result_truncated = _truncate_text(final_result, 20000) - - current_date = datetime.now(timezone.utc).strftime('%Y-%m-%d') - - system_prompt = f"""You are a strict verifier checking whether a browser automation agent actually completed its task. - -Today's date is {current_date}. The agent ran recently — dates near today are expected and NOT fabricated. - -Given the task and the agent's final response, determine if the response genuinely satisfies ALL requirements. - -Check for these common failure patterns: -1. **Incorrect data**: Wrong number of items, missing filters/criteria, wrong format -2. **Unverified actions**: Agent claims to have submitted a form, posted a comment, or saved a file but there's no evidence -3. **Incomplete results**: Some requirements from the task are not addressed in the response -4. 
**Fabricated content**: Data that looks plausible but wasn't actually extracted from any page. NOTE: dates and times close to today's date ({current_date}) are NOT fabricated — the agent browses live websites and extracts real-time content. -5. **Partial completion reported as success**: Response acknowledges failure or blockers (captcha, access denied, etc.) but still claims success - -Respond with EXACTLY this JSON structure: -{{ - "is_correct": true or false, - "reason": "Brief explanation if not correct, empty string if correct" -}} - -Be strict: if the response doesn't clearly satisfy every requirement, set is_correct to false.""" - - user_prompt = f""" -{task_truncated or 'No task provided'} - - - -{final_result_truncated or 'No response provided'} - - -Does the agent's response fully satisfy all requirements of the task? Respond with the JSON structure.""" - - return [ - SystemMessage(content=system_prompt), - UserMessage(content=user_prompt), - ] diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index 5e231c942..6c7cae11a 100644 --- a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -25,7 +25,12 @@ from browser_use.llm.messages import ( UserMessage, ) from browser_use.observability import observe_debug -from browser_use.utils import match_url_with_domain_pattern, time_execution_sync +from browser_use.utils import ( + collect_sensitive_data_values, + match_url_with_domain_pattern, + redact_sensitive_string, + time_execution_sync, +) logger = logging.getLogger(__name__) @@ -114,6 +119,7 @@ class MessageManager: include_recent_events: bool = False, sample_images: list[ContentPartTextParam | ContentPartImageParam] | None = None, llm_screenshot_size: tuple[int, int] | None = None, + max_clickable_elements_length: int = 40000, ): self.task = task self.state = state @@ -127,6 +133,7 @@ class MessageManager: self.include_recent_events = include_recent_events 
self.sample_images = sample_images self.llm_screenshot_size = llm_screenshot_size + self.max_clickable_elements_length = max_clickable_elements_length assert max_history_items is None or max_history_items > 5, 'max_history_items must be None or greater than 5' @@ -144,7 +151,13 @@ class MessageManager: """Build agent history description from list of items, respecting max_history_items limit""" compacted_prefix = '' if self.state.compacted_memory: - compacted_prefix = f'\n{self.state.compacted_memory}\n\n' + compacted_prefix = ( + '\n' + '\n' + f'{self.state.compacted_memory}\n' + '\n' + ) if self.max_history_items is None: # Include all items @@ -247,6 +260,9 @@ class MessageManager: 'You are summarizing an agent run for prompt compaction.\n' 'Capture task requirements, key facts, decisions, partial progress, errors, and next steps.\n' 'Preserve important entities, values, URLs, and file paths.\n' + 'CRITICAL: Only mark a step as completed if you see explicit success confirmation in the history. ' + 'If a step was started but not explicitly confirmed complete, mark it as "IN-PROGRESS". ' + 'Never infer completion from context — only report what was confirmed.\n' 'Return plain text only. Do not include tool calls or JSON.' 
) if settings.summary_max_chars: @@ -298,7 +314,6 @@ class MessageManager: self.state.read_state_images = [] # Clear images from previous step action_results = '' - result_len = len(result) read_state_idx = 0 for idx, action_result in enumerate(result): @@ -470,6 +485,7 @@ class MessageManager: include_attributes=self.include_attributes, step_info=step_info, page_filtered_actions=page_filtered_actions, + max_clickable_elements_length=self.max_clickable_elements_length, sensitive_data=self.sensitive_data_description, available_file_paths=available_file_paths, screenshots=screenshots, @@ -562,30 +578,14 @@ class MessageManager: if not self.sensitive_data: return value - # Collect all sensitive values, immediately converting old format to new format - sensitive_values: dict[str, str] = {} - - # Process all sensitive data entries - for key_or_domain, content in self.sensitive_data.items(): - if isinstance(content, dict): - # Already in new format: {domain: {key: value}} - for key, val in content.items(): - if val: # Skip empty values - sensitive_values[key] = val - elif content: # Old format: {key: value} - convert to new format internally - # We treat this as if it was {'http*://*': {key_or_domain: content}} - sensitive_values[key_or_domain] = content + sensitive_values = collect_sensitive_data_values(self.sensitive_data) # If there are no valid sensitive data entries, just return the original value if not sensitive_values: logger.warning('No valid entries found in sensitive_data dictionary') return value - # Replace all valid sensitive data values with their placeholder tags - for key, val in sensitive_values.items(): - value = value.replace(val, f'{key}') - - return value + return redact_sensitive_string(value, sensitive_values) if isinstance(message.content, str): message.content = replace_sensitive(message.content) diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py index 7ce050ac5..803b3547d 100644 --- a/browser_use/agent/prompts.py +++ 
b/browser_use/agent/prompts.py @@ -157,6 +157,7 @@ class AgentMessagePrompt: 'images': 0, 'interactive_elements': 0, 'total_elements': 0, + 'text_chars': 0, } if not self.browser_state.dom_state or not self.browser_state.dom_state._root: @@ -203,6 +204,9 @@ class AgentMessagePrompt: else: stats['shadow_open'] += 1 + elif original.node_type == NodeType.TEXT_NODE: + stats['text_chars'] += len(original.node_value.strip()) + elif original.node_type == NodeType.DOCUMENT_FRAGMENT_NODE: # Shadow DOM fragment - these are the actual shadow roots # But don't double-count since we count them at the host level above @@ -224,6 +228,9 @@ class AgentMessagePrompt: stats_text = '' if page_stats['total_elements'] < 10: stats_text += 'Page appears empty (SPA not loaded?) - ' + # Skeleton screen: many elements but almost no text = loading placeholders + elif page_stats['total_elements'] > 20 and page_stats['text_chars'] < page_stats['total_elements'] * 5: + stats_text += 'Page appears to show skeleton/placeholder content (still loading?) 
- ' stats_text += f'{page_stats["links"]} links, {page_stats["interactive_elements"]} interactive, ' stats_text += f'{page_stats["iframes"]} iframes' if page_stats['shadow_open'] > 0 or page_stats['shadow_closed'] > 0: @@ -252,14 +259,11 @@ class AgentMessagePrompt: pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0 has_content_above = pages_above > 0 has_content_below = pages_below > 0 - total_pages = pi.page_height / pi.viewport_height if pi.viewport_height > 0 else 0 - current_page_position = pi.scroll_y / max(pi.page_height - pi.viewport_height, 1) page_info_text = '' - page_info_text += f'{pages_above:.1f} above, ' - page_info_text += f'{pages_below:.1f} below ' - + page_info_text += f'{pages_above:.1f} pages above, {pages_below:.1f} pages below' + if pages_below > 0.2: + page_info_text += ' — scroll down to reveal more content' page_info_text += '\n' - # , at {current_page_position:.0%} of page if elements_text != '': if not has_content_above: elements_text = f'[Start of page]\n{elements_text}' diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 6bc7757df..9b2fff1f2 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -36,7 +36,7 @@ from pydantic import BaseModel, ValidationError from uuid_extensions import uuid7str from browser_use import Browser, BrowserProfile, BrowserSession -from browser_use.agent.judge import construct_judge_messages, construct_simple_judge_messages +from browser_use.agent.judge import construct_judge_messages # Lazy import for gif to avoid heavy agent.views import at startup # from browser_use.agent.gif import create_history_gif @@ -59,7 +59,6 @@ from browser_use.agent.views import ( JudgementResult, MessageCompactionSettings, PlanItem, - SimpleJudgeResult, StepMetadata, ) from browser_use.browser.events import _get_timeout @@ -188,6 +187,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): file_system_path: str | None = None, task_id: str | 
None = None, calculate_cost: bool = False, + pricing_url: str | None = None, display_files_in_done_text: bool = True, include_tool_call_examples: bool = False, vision_detail_level: Literal['auto', 'low', 'high'] = 'auto', @@ -204,7 +204,9 @@ class Agent(Generic[Context, AgentStructuredOutput]): loop_detection_enabled: bool = True, llm_screenshot_size: tuple[int, int] | None = None, message_compaction: MessageCompactionSettings | bool | None = True, + max_clickable_elements_length: int = 40000, _url_shortening_limit: int = 25, + enable_signal_handler: bool = True, **kwargs, ): # Validate llm_screenshot_size @@ -409,16 +411,20 @@ class Agent(Generic[Context, AgentStructuredOutput]): loop_detection_window=loop_detection_window, loop_detection_enabled=loop_detection_enabled, message_compaction=message_compaction, + max_clickable_elements_length=max_clickable_elements_length, ) # Token cost service - self.token_cost_service = TokenCost(include_cost=calculate_cost) + self.token_cost_service = TokenCost(include_cost=calculate_cost, pricing_url=pricing_url) self.token_cost_service.register_llm(llm) self.token_cost_service.register_llm(page_extraction_llm) self.token_cost_service.register_llm(judge_llm) if self.settings.message_compaction and self.settings.message_compaction.compaction_llm: self.token_cost_service.register_llm(self.settings.message_compaction.compaction_llm) + # Store signal handler setting (not part of AgentSettings as it's runtime behavior) + self.enable_signal_handler = enable_signal_handler + # Initialize state self.state = injected_agent_state or AgentState() @@ -514,6 +520,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): include_recent_events=self.include_recent_events, sample_images=self.sample_images, llm_screenshot_size=llm_screenshot_size, + max_clickable_elements_length=self.settings.max_clickable_elements_length, ) if self.sensitive_data: @@ -1022,9 +1029,35 @@ class Agent(Generic[Context, AgentStructuredOutput]): browser_state_summary 
= None try: + if self.browser_session: + try: + captcha_wait = await self.browser_session.wait_if_captcha_solving() + if captcha_wait and captcha_wait.waited: + # Reset step timing to exclude the captcha wait from step duration metrics + self.step_start_time = time.time() + duration_s = captcha_wait.duration_ms / 1000 + outcome = captcha_wait.result # 'success' | 'failed' | 'timeout' + msg = f'Waited {duration_s:.1f}s for {captcha_wait.vendor} CAPTCHA to be solved. Result: {outcome}.' + self.logger.info(f'🔒 {msg}') + # Inject the outcome so the LLM sees what happened + captcha_result = ActionResult(long_term_memory=msg) + if self.state.last_result: + self.state.last_result.append(captcha_result) + else: + self.state.last_result = [captcha_result] + except Exception as e: + self.logger.warning(f'Phase 0 captcha wait failed (non-fatal): {e}') + # Phase 1: Prepare context and timing browser_state_summary = await self._prepare_context(step_info) + # Clear previous step state after context preparation (which needs + # them for the "previous action result" prompt) but before the LLM + # call, so a timeout during _get_next_action or _execute_actions + # won't leave stale data from the previous step. 
+ self.state.last_model_output = None + self.state.last_result = None + # Phase 2: Get model output and execute actions await self._get_next_action(browser_state_summary) await self._execute_actions() @@ -1220,12 +1253,31 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.warning(f'{error_msg}') return - # Handle browser closed/disconnected errors - stop immediately instead of retrying - if self._is_browser_closed_error(error): - self.logger.warning(f'🛑 Browser closed or disconnected: {error}') - self.state.stopped = True - self._external_pause_event.set() - return + # Handle browser closed/disconnected errors + if self._is_connection_like_error(error): + # If reconnection is in progress, wait for it instead of stopping + if self.browser_session.is_reconnecting: + wait_timeout = self.browser_session.RECONNECT_WAIT_TIMEOUT + self.logger.warning( + f'🔄 Connection error during reconnection, waiting up to {wait_timeout}s for reconnect: {error}' + ) + try: + await asyncio.wait_for(self.browser_session._reconnect_event.wait(), timeout=wait_timeout) + except TimeoutError: + pass + + # Check if reconnection succeeded + if self.browser_session.is_cdp_connected: + self.logger.info('🔄 Reconnection succeeded, retrying step...') + self.state.last_result = [ActionResult(error=f'Connection lost and recovered: {error}')] + return + + # Not reconnecting or reconnection failed — check if truly terminal + if self._is_browser_closed_error(error): + self.logger.warning(f'🛑 Browser closed or disconnected: {error}') + self.state.stopped = True + self._external_pause_event.set() + return # Handle all other exceptions include_trace = self.logger.isEnabledFor(logging.DEBUG) @@ -1249,14 +1301,35 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.state.last_result = [ActionResult(error=error_msg)] return None + def _is_connection_like_error(self, error: Exception) -> bool: + """Check if the error looks like a CDP/WebSocket connection failure. 
+ + Unlike _is_browser_closed_error(), this does NOT check if the CDP client is None + or if reconnection is in progress — it purely looks at the error signature. + """ + error_str = str(error).lower() + return ( + isinstance(error, ConnectionError) + or 'websocket connection closed' in error_str + or 'connection closed' in error_str + or 'browser has been closed' in error_str + or 'browser closed' in error_str + or 'no browser' in error_str + ) + def _is_browser_closed_error(self, error: Exception) -> bool: """Check if the browser has been closed or disconnected. Only returns True when the error itself is a CDP/WebSocket connection failure - AND the CDP client is gone. Avoids false positives on unrelated errors - (element not found, timeouts, parse errors) that happen to coincide with - a transient None state during reconnects or resets. + AND the CDP client is gone AND we're not actively reconnecting. + Avoids false positives on unrelated errors (element not found, timeouts, + parse errors) that happen to coincide with a transient None state during + reconnects or resets. """ + # During reconnection, don't treat connection errors as terminal + if self.browser_session.is_reconnecting: + return False + error_str = str(error).lower() is_connection_error = ( isinstance(error, ConnectionError) @@ -1504,46 +1577,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): self._message_manager._add_context_message(UserMessage(content=msg)) self.AgentOutput = self.DoneAgentOutput - async def _run_simple_judge(self) -> None: - """Lightweight always-on judge that overrides agent success when it overclaims. - - Runs regardless of use_judge setting. Only checks tasks where the agent - claimed success — if the agent already reports failure, there's nothing to correct. 
- """ - last_result = self.history.history[-1].result[-1] - if not last_result.is_done or not last_result.success: - return - - task = self.task - final_result = self.history.final_result() or '' - - messages = construct_simple_judge_messages( - task=task, - final_result=final_result, - ) - - try: - response = await self.llm.ainvoke(messages, output_format=SimpleJudgeResult) - result: SimpleJudgeResult = response.completion # type: ignore[assignment] - if not result.is_correct: - reason = result.reason or 'Task requirements not fully met' - self.logger.info(f'⚠️ Simple judge overriding success to failure: {reason}') - last_result.success = False - note = f'[Simple judge: {reason}]' - # When structured output is expected, don't append judge text to extracted_content - # as it would corrupt the JSON and break end-user parsers - if self.output_model_schema is not None: - if last_result.metadata is None: - last_result.metadata = {} - last_result.metadata['simple_judge'] = note - elif last_result.extracted_content: - last_result.extracted_content += f'\n\n{note}' - else: - last_result.extracted_content = note - except Exception as e: - self.logger.warning(f'Simple judge failed with error: {e}') - # Don't override on error — keep the agent's self-report - @observe(ignore_input=True, ignore_output=False) async def _judge_trace(self) -> JudgementResult | None: """Judge the trace of the agent""" @@ -1614,8 +1647,10 @@ class Agent(Generic[Context, AgentStructuredOutput]): if judgement.failure_reason: judge_log += f' Failure Reason: {judgement.failure_reason}\n' if judgement.reached_captcha: - judge_log += ' 🤖 Captcha Detected: Agent encountered captcha challenges\n' - judge_log += ' 👉 🥷 Use Browser Use Cloud for the most stealth browser infra: https://docs.browser-use.com/customize/browser/remote\n' + self.logger.warning( + 'Agent was blocked by a captcha. 
Cloud browsers include stealth fingerprinting and proxy rotation to avoid this.\n' + ' Try: Browser(use_cloud=True) | Get an API key: https://cloud.browser-use.com?utm_source=oss&utm_medium=captcha_nudge' + ) judge_log += f' {judgement.reasoning}\n' self.logger.info(judge_log) @@ -2023,8 +2058,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): if not (self.logger.isEnabledFor(logging.DEBUG) and parsed.action): return - action_count = len(parsed.action) - # Collect action details action_details = [] for i, action in enumerate(parsed.action): @@ -2129,11 +2162,10 @@ class Agent(Generic[Context, AgentStructuredOutput]): has_captcha_issue = any(keyword in final_result_str for keyword in captcha_keywords) if has_captcha_issue: - # Suggest use_cloud=True for captcha/cloudflare issues - task_preview = self.task[:10] if len(self.task) > 10 else self.task - self.logger.info('') - self.logger.info('Failed because of CAPTCHA? For better browser stealth, try:') - self.logger.info(f' agent = Agent(task="{task_preview}...", browser=Browser(use_cloud=True))') + self.logger.warning( + 'Agent was blocked by a captcha. 
Cloud browsers include stealth fingerprinting and proxy rotation to avoid this.\n' + ' Try: Browser(use_cloud=True) | Get an API key: https://cloud.browser-use.com?utm_source=oss&utm_medium=captcha_nudge' + ) # General failure message self.logger.info('') @@ -2225,9 +2257,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): await self.step(step_info) if self.history.is_done(): - # Always run simple judge to align agent success with reality - await self._run_simple_judge() - await self.log_completion() # Run full judge before done callback if enabled @@ -2424,14 +2453,15 @@ class Agent(Generic[Context, AgentStructuredOutput]): await self._demo_mode_log(error_msg, 'error', {'step': step + 1}) self.state.consecutive_failures += 1 self.state.last_result = [ActionResult(error=error_msg)] + # Ensure step counter advances on timeout — _finalize() may have + # been skipped or returned early due to the cancellation. + if self.state.n_steps == step + 1: + self.state.n_steps += 1 if on_step_end is not None: await on_step_end(self) if self.history.is_done(): - # Always run simple judge to align agent success with reality - await self._run_simple_judge() - await self.log_completion() # Run full judge before done callback if enabled @@ -2480,6 +2510,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): resume_callback=self.resume, custom_exit_callback=on_force_exit_log_telemetry, # Pass the new telemetrycallback exit_on_second_int=True, + disabled=not self.enable_signal_handler, ) signal_handler.register() @@ -2672,7 +2703,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): to pre-action values. Any change aborts the remaining queue. 
""" results: list[ActionResult] = [] - time_elapsed = 0 total_actions = len(actions) assert self.browser_session is not None, 'BrowserSession is not set up' @@ -2682,19 +2712,20 @@ class Agent(Generic[Context, AgentStructuredOutput]): and self.browser_session._cached_browser_state_summary.dom_state is not None ): cached_selector_map = dict(self.browser_session._cached_browser_state_summary.dom_state.selector_map) - cached_element_hashes = {e.parent_branch_hash() for e in cached_selector_map.values()} else: cached_selector_map = {} - cached_element_hashes = set() except Exception as e: self.logger.error(f'Error getting cached selector map: {e}') cached_selector_map = {} - cached_element_hashes = set() for i, action in enumerate(actions): + # Get action name from the action model BEFORE try block to ensure it's always available in except + action_data = action.model_dump(exclude_unset=True) + action_name = next(iter(action_data.keys())) if action_data else 'unknown' + if i > 0: # ONLY ALLOW TO CALL `done` IF IT IS A SINGLE ACTION - if action.model_dump(exclude_unset=True).get('done') is not None: + if action_data.get('done') is not None: msg = f'Done action is allowed only as a single action - stopped after action {i} / {total_actions}.' 
self.logger.debug(msg) break @@ -2706,9 +2737,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): try: await self._check_stop_or_pause() - # Get action name from the action model - action_data = action.model_dump(exclude_unset=True) - action_name = next(iter(action_data.keys())) if action_data else 'unknown' # Log action before execution await self._log_action(action, action_name, i + 1, total_actions) @@ -2717,8 +2745,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): pre_action_url = await self.browser_session.get_current_page_url() pre_action_focus = self.browser_session.agent_focus_target_id - time_start = time.time() - result = await self.tools.act( action=action, browser_session=self.browser_session, @@ -2729,9 +2755,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): extraction_schema=self.extraction_schema, ) - time_end = time.time() - time_elapsed = time_end - time_start - if result.error: await self._demo_mode_log( f'Action "{action_name}" failed: {result.error}', @@ -3429,7 +3452,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): hist_node = historical_elem.node_name.lower() if historical_elem else '' similar_elements = [] if historical_elem and historical_elem.attributes: - hist_aria = historical_elem.attributes.get('aria-label', '') for idx, elem in selector_map.items(): if elem.node_name.lower() == hist_node and elem.attributes: elem_aria = elem.attributes.get('aria-label', '') @@ -3911,6 +3933,17 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Kill the browser session - this dispatches BrowserStopEvent, # stops the EventBus with clear=True, and recreates a fresh EventBus await self.browser_session.kill() + else: + # keep_alive=True sessions shouldn't keep the event loop alive after agent.run() + await self.browser_session.event_bus.stop( + clear=False, + timeout=_get_timeout('TIMEOUT_BrowserSessionEventBusStopOnAgentClose', 1.0), + ) + try: + self.browser_session.event_bus.event_queue = None + 
self.browser_session.event_bus._on_idle = None + except Exception: + pass # Close skill service if configured if self.skill_service is not None: diff --git a/browser_use/agent/system_prompts/system_prompt.md b/browser_use/agent/system_prompts/system_prompt.md index 9af905048..82cb2ca32 100644 --- a/browser_use/agent/system_prompts/system_prompt.md +++ b/browser_use/agent/system_prompts/system_prompt.md @@ -40,18 +40,25 @@ USER REQUEST: This is your ultimate objective and always remains visible. 1. Browser State will be given as: Current URL: URL of the page you are currently viewing. Open Tabs: Open tabs with their ids. -Interactive Elements: All interactive elements will be provided in format as [index]text where -- index: Numeric identifier for interaction -- type: HTML element type (button, input, etc.) -- text: Element description +Interactive Elements: All interactive elements will be provided in a tree-style XML format: +- Format: `[index]` for interactive elements +- Text content appears as child nodes on separate lines (not inside tags) +- Indentation with tabs shows parent/child relationships Examples: -[33]
User form
-\t*[35] +[33]
+ User form + [35] + *[38]