Merge branch 'main' into mert/hotfix

This commit is contained in:
Mert Unsal
2025-06-24 00:21:58 +02:00
committed by GitHub
113 changed files with 4388 additions and 3428 deletions

View File

@@ -25,6 +25,19 @@ jobs:
LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_PROJECT_API_KEY }}
steps:
- name: System Info and Resource Check
run: |
echo "=== SYSTEM INFORMATION ==="
echo "Runner OS: $(uname -a)"
echo "CPU Info: $(nproc) cores"
echo "Memory Info:"
free -h
echo "Disk Space:"
df -h
echo "Load Average:"
uptime
echo "=========================="
- name: Determine ref to checkout
id: determine_ref
run: |
@@ -47,7 +60,10 @@ jobs:
activate-environment: true
- name: Install dependencies
run: uv sync --extra eval
run: |
echo "Installing dependencies..."
uv sync --extra eval
echo "Dependencies installed successfully"
- name: Detect installed Playwright version
id: playwright_version
@@ -62,13 +78,31 @@ jobs:
${{ runner.os }}-playwright-
- name: Install Playwright browser dependencies
run: playwright install --no-shell chromium
run: |
echo "Installing Playwright browsers..."
playwright install --no-shell chromium
echo "Playwright browsers installed successfully"
- name: Install Xvfb for headed mode
if: github.event.client_payload.script_args.headless == 'false'
run: |
echo "Installing Xvfb for headed mode..."
sudo apt-get update
sudo apt-get install -y xvfb
echo "Xvfb installed successfully"
- name: Pre-execution Resource Check
run: |
echo "=== PRE-EXECUTION RESOURCE CHECK ==="
echo "Memory usage:"
free -h
echo "CPU load:"
uptime
echo "Disk usage:"
df -h
echo "Process count:"
ps aux | wc -l
echo "================================="
- name: Construct eval command
id: eval_command
@@ -82,7 +116,7 @@ jobs:
DEFAULT_END_INDEX="100"
DEFAULT_EVAL_GROUP="PRTests"
DEFAULT_HEADLESS="true"
DEFAULT_FRESH_START="true"
DEFAULT_MEMORY_INTERVAL="10"
DEFAULT_MAX_ACTIONS_PER_STEP="10"
DEFAULT_PLANNER_INTERVAL="1"
@@ -113,8 +147,7 @@ jobs:
HEADLESS="${{ github.event.client_payload.script_args.headless }}"
HEADLESS="${HEADLESS:-$DEFAULT_HEADLESS}"
FRESH_START="${{ github.event.client_payload.script_args.fresh_start }}"
FRESH_START="${FRESH_START:-$DEFAULT_FRESH_START}"
MEMORY_INTERVAL="${{ github.event.client_payload.script_args.memory_interval }}"
MEMORY_INTERVAL="${MEMORY_INTERVAL:-$DEFAULT_MEMORY_INTERVAL}"
@@ -133,6 +166,7 @@ jobs:
DEVELOPER_ID="${{ github.event.client_payload.script_args.developer_id }}"
PLANNER_MODEL="${{ github.event.client_payload.script_args.planner_model }}"
RUN_ID="${{ github.event.client_payload.script_args.run_id }}"
LAMINAR_EVAL_ID="${{ github.event.client_payload.script_args.laminar_eval_id }}"
# Build command using array for cleaner construction
CMD_ARGS=(
@@ -143,7 +177,7 @@ jobs:
"--max-steps" "$MAX_STEPS"
"--start" "$START_INDEX"
"--end" "$END_INDEX"
"--fresh-start" "$FRESH_START"
"--eval-group" "$EVAL_GROUP"
"--memory-interval" "$MEMORY_INTERVAL"
"--max-actions-per-step" "$MAX_ACTIONS_PER_STEP"
@@ -158,12 +192,15 @@ jobs:
[[ "${{ github.event.client_payload.script_args.enable_memory }}" == "true" ]] && CMD_ARGS+=("--enable-memory")
[[ "${{ github.event.client_payload.script_args.validate_output }}" == "true" ]] && CMD_ARGS+=("--validate-output")
[[ "${{ github.event.client_payload.script_args.include_result }}" == "true" ]] && CMD_ARGS+=("--include-result")
[[ "${{ github.event.client_payload.script_args.highlight_elements }}" == "false" ]] && CMD_ARGS+=("--no-highlight-elements")
[[ "${{ github.event.client_payload.script_args.use_mind2web_judge }}" == "true" ]] && CMD_ARGS+=("--use-mind2web-judge")
# Add optional string parameters
[[ -n "$USER_MESSAGE" ]] && CMD_ARGS+=("--user-message" "$USER_MESSAGE")
[[ -n "$DEVELOPER_ID" ]] && CMD_ARGS+=("--developer-id" "$DEVELOPER_ID")
[[ -n "$PLANNER_MODEL" ]] && CMD_ARGS+=("--planner-model" "$PLANNER_MODEL")
[[ -n "$RUN_ID" ]] && CMD_ARGS+=("--run-id" "$RUN_ID")
[[ -n "$LAMINAR_EVAL_ID" ]] && CMD_ARGS+=("--laminar-eval-id" "$LAMINAR_EVAL_ID")
# Convert array to command string with proper escaping
printf -v CMD_STRING '%q ' "${CMD_ARGS[@]}"
@@ -176,5 +213,179 @@ jobs:
echo "FULL_COMMAND=$CMD_STRING" >> $GITHUB_OUTPUT
echo "::notice title=Eval Command::$CMD_STRING"
- name: Start Resource Monitoring
run: |
echo "Starting background resource monitoring..."
# Create a background script that monitors resources every 30 seconds
cat > monitor_resources.sh << 'EOF'
#!/bin/bash
while true; do
echo "=== RESOURCE MONITOR $(date) ==="
echo "Memory:"
free -h
echo "CPU Load:"
uptime
echo "Top processes by CPU:"
ps aux --sort=-%cpu | head -10
echo "Top processes by Memory:"
ps aux --sort=-%mem | head -10
echo "Chrome/Chromium processes:"
ps aux | grep -i chrome | grep -v grep || echo "No Chrome processes found"
echo "Python processes:"
ps aux | grep python | grep -v grep || echo "No Python processes found"
echo "=================================="
sleep 30
done
EOF
chmod +x monitor_resources.sh
# Start the monitor in background and save PID
nohup ./monitor_resources.sh > resource_monitor.log 2>&1 &
echo $! > monitor_pid.txt
echo "Resource monitoring started with PID: $(cat monitor_pid.txt)"
- name: Run evaluation script
run: ${{ steps.eval_command.outputs.FULL_COMMAND }}
id: run_eval
run: |
echo "=== STARTING EVALUATION ==="
echo "Command: ${{ steps.eval_command.outputs.FULL_COMMAND }}"
echo "Starting time: $(date)"
echo "Use Mind2Web Judge: ${{ github.event.client_payload.script_args.use_mind2web_judge }}"
echo "============================"
# Set up signal handlers and run the command
set -e
trap 'echo "=== EVALUATION INTERRUPTED ==="; echo "Time: $(date)"; echo "Last 50 lines of output:"; tail -50 eval_output.log; exit 130' INT TERM
# Run the evaluation with output capture and better error handling
set +e # Don't exit on errors, we want to capture them
${{ steps.eval_command.outputs.FULL_COMMAND }} 2>&1 | tee eval_output.log
EVAL_EXIT_CODE=${PIPESTATUS[0]}
set -e # Re-enable exit on error
echo "=== EVALUATION COMPLETED ==="
echo "Exit code: $EVAL_EXIT_CODE"
echo "Completion time: $(date)"
echo "============================"
# Show last part of log for context
if [ $EVAL_EXIT_CODE -ne 0 ]; then
echo "=== EVALUATION FAILED - LAST 100 LINES OF OUTPUT ==="
tail -100 eval_output.log
echo "=================================================="
fi
exit $EVAL_EXIT_CODE
- name: Post-execution Resource Check
if: always()
run: |
echo "=== POST-EXECUTION RESOURCE CHECK ==="
echo "Memory usage:"
free -h
echo "CPU load:"
uptime
echo "Disk usage:"
df -h
echo "Process count:"
ps aux | wc -l
echo "Chrome/Chromium processes still running:"
ps aux | grep -i chrome | grep -v grep || echo "No Chrome processes found"
echo "Python processes still running:"
ps aux | grep python | grep -v grep || echo "No Python processes found"
echo "==================================="
- name: Stop Resource Monitoring and Collect Logs
if: always()
run: |
echo "Stopping resource monitoring..."
if [ -f monitor_pid.txt ]; then
MONITOR_PID=$(cat monitor_pid.txt)
if kill -0 $MONITOR_PID 2>/dev/null; then
kill $MONITOR_PID
echo "Resource monitor stopped"
else
echo "Resource monitor was already stopped"
fi
fi
echo "=== RESOURCE MONITORING LOG ==="
if [ -f resource_monitor.log ]; then
tail -100 resource_monitor.log
else
echo "No resource monitor log found"
fi
echo "==============================="
- name: Collect Debug Information
if: always()
run: |
echo "=== COLLECTING DEBUG INFORMATION ==="
# System information
echo "Final system state:"
uptime
free -h
df -h
# Process information
echo "All running processes:"
ps aux --sort=-%cpu | head -20
# Check for core dumps
echo "Checking for core dumps:"
find . -name "core*" -type f 2>/dev/null || echo "No core dumps found"
# Check for any crash logs
echo "Checking for crash logs:"
find . -name "*crash*" -type f 2>/dev/null || echo "No crash logs found"
# Check kernel messages for OOM kills
echo "Checking for OOM kills in kernel messages:"
sudo dmesg | grep -i "killed process" | tail -10 || echo "No OOM kills found"
# Check evaluation output
echo "Last 100 lines of evaluation output:"
if [ -f eval_output.log ]; then
tail -100 eval_output.log
else
echo "No evaluation output log found"
fi
# Check for saved trajectories
echo "Saved trajectories directory:"
if [ -d saved_trajectories ]; then
find saved_trajectories -type f -name "*.json" | head -10
echo "Total trajectory files: $(find saved_trajectories -type f -name "*.json" | wc -l)"
else
echo "No saved_trajectories directory found"
fi
echo "===================================="
- name: Upload Debug Artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: debug-logs-${{ github.run_id }}
path: |
eval_output.log
resource_monitor.log
saved_trajectories/
retention-days: 7
- name: Final Status Summary
if: always()
run: |
echo "=== FINAL STATUS SUMMARY ==="
echo "Workflow run ID: ${{ github.run_id }}"
echo "Job completion time: $(date)"
echo "Evaluation step status: ${{ steps.run_eval.outcome }}"
if [ "${{ steps.run_eval.outcome }}" != "success" ]; then
echo "❌ Evaluation failed or was interrupted"
echo "Check the debug artifacts and logs above for more information"
else
echo "✅ Evaluation completed successfully"
fi
echo "==========================="

View File

@@ -39,4 +39,5 @@ jobs:
- uses: astral-sh/setup-uv@v6
with:
enable-cache: true
- run: uv sync --dev --all-extras # install extras for examples to avoid pyright missing imports errors
- run: uv run pyright

View File

@@ -5,20 +5,22 @@ We want our library APIs to be ergonomic, intuitive, and hard to get wrong.
- Use async python
- Use tabs for indentation in all python code, not spaces
- Use the modern python >3.12 typing style, e.g. use `str | None` instead of `Optional[str]`, and `list[str]` instead of `List[str]`
- Use the modern python >3.12 typing style, e.g. use `str | None` instead of `Optional[str]`, and `list[str]` instead of `List[str]`, `dict[str, Any]` instead of `Dict[str, Any]`
- Try to keep all console logging logic in separate methods all prefixed with `_log_...`, e.g. `def _log_pretty_path(path: Path) -> str` so as not to clutter up the main logic.
- Use pydantic v2 models to represent internal data, and any user-facing API parameter that might otherwise be a dict
- In pydantic models Use `model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True, ...)` etc. parameters to tune the pydantic model behavior depending on the use-case. Use `Annotated[..., AfterValidator(...)]` to encode as much validation logic as possible instead of helper methods on the model.
- We keep the main code for each sub-component in a `service.py` file usually, and we keep most pydantic models in `views.py` files unless they are long enough to deserve their own file
- Use runtime assertions at the start and end of functions to enforce constraints and assumptions
- Prefer `from uuid_extensions import uuid7str` + `id: str = Field(default_factory=uuid7str)` for all new id fields
- Run tests using `uv run pytest -vxs tests/ci`
- Run the type checker using `uv run pyright`
## Keep Examples & Tests Up-To-Date
- Make sure to read relevant examples in the `examples/` directory for context and keep them up-to-date when making changes.
- Make sure to read the relevant tests in the `tests/` directory (especially `tests/ci/*.py`) and keep them up-to-date as well.
- Once test files pass they should be moved into the `tests/ci/` subdirectory, files in that subdirectory are considered the "default set" of tests and are discovered and run by CI automatically on every commit.
- Try to almost never use mocks in tests, instead use pytest fixtures to set up real objects
- Never use mocks in tests other than for the llm, instead use pytest fixtures to set up real objects and pytest-httpserver
- Never use real remote URLs in tests (e.g. `https://google.com` or `https://example.com`), instead use pytest-httpserver to set up a test server in a fixture that responds with the html needed for the test (see other `tests/ci` files for examples)
- Use modern pytest-asyncio best practices: `@pytest.mark.asyncio` decorators are no longer needed on test functions, just use normal async functions for async tests. Use `loop = asyncio.get_event_loop()` inside tests that need it instead of passing `event_loop` as a function argument. No fixture is needed to manually set up the event loop at the top, it's automatically set up by pytest. Fixture functions (even async ones) only need a simple `@pytest.fixture` decorator with no arguments.

View File

@@ -9,4 +9,5 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd "$SCRIPT_DIR/.." || exit 1
exec uv run pre-commit run --all-files
uv run pre-commit run --all-files
exec pyright

View File

@@ -5,8 +5,8 @@ from pydantic import Field, field_validator
from uuid_extensions import uuid7str
MAX_STRING_LENGTH = 100000 # 100K chars ~ 25k tokens should be enough
MAX_URL_LENGTH = 10000
MAX_TASK_LENGTH = 10000
MAX_URL_LENGTH = 100000
MAX_TASK_LENGTH = 100000
MAX_COMMENT_LENGTH = 2000
MAX_FILE_CONTENT_SIZE = 50 * 1024 * 1024 # 50MB
@@ -41,6 +41,9 @@ class UpdateAgentTaskEvent(BaseEvent):
done_output=done_output,
finished_at=datetime.now(timezone.utc) if agent.state.history and agent.state.history.is_done() else None,
agent_state=agent.state.model_dump() if hasattr(agent.state, 'model_dump') else {},
user_feedback_type=None,
user_comment=None,
gif_url=None,
# user_feedback_type and user_comment would be set by the API/frontend
# gif_url would be set after GIF generation if needed
)
@@ -192,6 +195,9 @@ class CreateAgentTaskEvent(BaseEvent):
done_output=None,
started_at=datetime.fromtimestamp(agent._task_start_time, tz=timezone.utc),
finished_at=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)

View File

@@ -8,6 +8,7 @@ import platform
from typing import TYPE_CHECKING
from browser_use.agent.views import AgentHistoryList
from browser_use.config import CONFIG
if TYPE_CHECKING:
from PIL import Image, ImageFont
@@ -80,7 +81,7 @@ def create_history_gif(
try:
if platform.system() == 'Windows':
# Need to specify the abs font path on Windows
font_name = os.path.join(os.getenv('WIN_FONT_DIR', 'C:\\Windows\\Fonts'), font_name + '.ttf')
font_name = os.path.join(CONFIG.WIN_FONT_DIR, font_name + '.ttf')
regular_font = ImageFont.truetype(font_name, font_size)
title_font = ImageFont.truetype(font_name, title_font_size)
goal_font = ImageFont.truetype(font_name, goal_font_size)

View File

@@ -15,6 +15,7 @@ from langchain_core.messages.utils import convert_to_openai_messages
from browser_use.agent.memory.views import MemoryConfig
from browser_use.agent.message_manager.service import MessageManager
from browser_use.agent.message_manager.views import ManagedMessage, MessageMetadata
from browser_use.config import CONFIG
from browser_use.utils import time_execution_sync
@@ -65,7 +66,7 @@ class Memory:
# Check for required packages
try:
# also disable mem0's telemetry when ANONYMIZED_TELEMETRY=False
if os.getenv('ANONYMIZED_TELEMETRY', 'true').lower()[0] in 'fn0':
if not CONFIG.ANONYMIZED_TELEMETRY:
os.environ['MEM0_TELEMETRY'] = 'False'
from mem0 import Memory as Mem0Memory
except ImportError:

View File

@@ -80,7 +80,7 @@ class MemoryConfig(BaseModel):
Returns the vector store configuration dictionary for Mem0,
tailored to the selected provider.
"""
provider_specific_config = {'embedding_model_dims': self.embedder_dims}
provider_specific_config: dict[str, Any] = {'embedding_model_dims': self.embedder_dims}
# --- Default collection_name handling ---
if self.vector_store_collection_name:
@@ -167,7 +167,7 @@ class MemoryConfig(BaseModel):
}
@property
def full_config_dict(self) -> dict[str, dict[str, Any]]:
def full_config_dict(self) -> dict[str, Any]:
"""Returns the complete configuration dictionary for Mem0."""
return {
'embedder': self.embedder_config_dict,

View File

@@ -13,7 +13,7 @@ from langchain_core.messages import (
)
from pydantic import BaseModel
from browser_use.agent.message_manager.views import MessageMetadata
from browser_use.agent.message_manager.views import ManagedMessage, MessageMetadata
from browser_use.agent.prompts import AgentMessagePrompt
from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo, MessageManagerState
from browser_use.browser.views import BrowserStateSummary
@@ -109,7 +109,7 @@ def _log_extract_message_content(message: BaseMessage, is_last_message: bool, me
cleaned_content = _log_clean_whitespace(str(message.content))
# Handle AIMessages with tool calls
if hasattr(message, 'tool_calls') and message.tool_calls and not cleaned_content:
if isinstance(message, AIMessage) and hasattr(message, 'tool_calls') and message.tool_calls and not cleaned_content:
tool_call = message.tool_calls[0]
tool_name = tool_call.get('name', 'unknown')
@@ -117,7 +117,7 @@ def _log_extract_message_content(message: BaseMessage, is_last_message: bool, me
# Skip formatting for init example messages
if metadata and metadata.message_type == 'init':
return '[Example AgentOutput]'
content = _log_format_agent_output_content(tool_call)
content = _log_format_agent_output_content(dict(tool_call)) # Convert ToolCall to dict
else:
content = f'[TOOL: {tool_name}]'
else:
@@ -141,9 +141,12 @@ def _log_format_message_line(
lines = []
# Get emoji and token info
message_type = message_with_metadata.message.__class__.__name__
emoji = _log_get_message_emoji(message_type)
token_str = str(message_with_metadata.metadata.tokens).rjust(4)
if isinstance(message_with_metadata, ManagedMessage):
message_type = message_with_metadata.message.__class__.__name__
emoji = _log_get_message_emoji(message_type)
token_str = str(message_with_metadata.metadata.tokens).rjust(4)
else:
return ['❓[ ?]: [Invalid message format]']
prefix = f'{emoji}[{token_str}]: '
# Calculate available width (emoji=2 visual cols + [token]: =8 chars)
@@ -201,6 +204,7 @@ class MessageManager:
task: str,
system_message: SystemMessage,
file_system: FileSystem,
available_file_paths: list[str] | None = None,
settings: MessageManagerSettings = MessageManagerSettings(),
state: MessageManagerState = MessageManagerState(),
):
@@ -209,9 +213,10 @@ class MessageManager:
self.state = state
self.system_prompt = system_message
self.file_system = file_system
self.agent_history_description = 'Agent initialized.\n'
self.agent_history_description = '<system>Agent initialized</system>\n'
self.read_state_description = ''
self.sensitive_data_description = ''
self.available_file_paths = available_file_paths
# Only initialize messages if state is empty
if len(self.state.history.messages) == 0:
self._init_messages()
@@ -340,15 +345,9 @@ My next action is to click on the iPhone link at index [4] to navigate to Apple'
# self._add_message_with_tokens(example_tool_call_2, message_type='init')
# self.add_tool_message(content='Clicked on index [4]. </example_2>', message_type='init')
if self.settings.available_file_paths:
filepaths_msg = HumanMessage(
content=f'<available_file_paths>Here are file paths you can use: {self.settings.available_file_paths}</available_file_paths>'
)
self._add_message_with_tokens(filepaths_msg, message_type='init')
def add_new_task(self, new_task: str) -> None:
self.task = new_task
self.agent_history_description += f'\nUser updated USER REQUEST to: {new_task}\n'
self.agent_history_description += f'\n<system>User updated USER REQUEST to: {new_task}</system>\n'
def _update_agent_history_description(
self,
@@ -362,8 +361,7 @@ My next action is to click on the iPhone link at index [4] to navigate to Apple'
result = []
step_number = step_info.step_number if step_info else 'unknown'
self.read_state_initialization = 'This is displayed only **one time**, save this information if you need it later.\n'
self.read_state_description = self.read_state_initialization
self.read_state_description = ''
action_results = ''
result_len = len(result)
@@ -373,36 +371,36 @@ My next action is to click on the iPhone link at index [4] to navigate to Apple'
logger.debug(f'Added extracted_content to read_state_description: {action_result.extracted_content}')
if action_result.long_term_memory:
action_results += f'Action {idx + 1}/{result_len} response: {action_result.long_term_memory}\n'
action_results += f'Action {idx + 1}/{result_len}: {action_result.long_term_memory}\n'
logger.debug(f'Added long_term_memory to action_results: {action_result.long_term_memory}')
elif action_result.extracted_content and not action_result.include_extracted_content_only_once:
action_results += f'Action {idx + 1}/{result_len} response: {action_result.extracted_content}\n'
action_results += f'Action {idx + 1}/{result_len}: {action_result.extracted_content}\n'
logger.debug(f'Added extracted_content to action_results: {action_result.extracted_content}')
if action_result.error:
action_results += f'Action {idx + 1}/{result_len} response: {action_result.error[:200]}\n'
action_results += f'Action {idx + 1}/{result_len}: {action_result.error[:200]}\n'
logger.debug(f'Added error to action_results: {action_result.error[:200]}')
if action_results:
action_results = f'Action Results:\n{action_results}'
action_results = action_results.strip('\n')
# Handle case where model_output is None (e.g., parsing failed)
if model_output is None:
if step_number > 0:
self.agent_history_description += f"""## Step {step_number}
No model output (parsing failed)
{action_results}
if isinstance(step_number, int) and step_number > 0:
self.agent_history_description += f"""<step_{step_number}>
Agent failed to output in the right format.
</step_{step_number}>
"""
else:
self.agent_history_description += f"""## Step {step_number}
Step evaluation: {model_output.current_state.evaluation_previous_goal}
Step memory: {model_output.current_state.memory}
Step goal: {model_output.current_state.next_goal}
self.agent_history_description += f"""<step_{step_number}>
Evaluation of Previous Step: {model_output.current_state.evaluation_previous_goal}
Memory: {model_output.current_state.memory}
Next Goal: {model_output.current_state.next_goal}
{action_results}
</step_{step_number}>
"""
if self.read_state_description == self.read_state_initialization:
self.read_state_description = ''
else:
self.read_state_description += '\nMAKE SURE TO SAVE THIS INFORMATION INTO A FILE OR TO MEMORY IF YOU NEED IT LATER.'
def _get_sensitive_data_description(self, current_page_url) -> str:
sensitive_data = self.settings.sensitive_data
if not sensitive_data:
@@ -454,6 +452,7 @@ Step goal: {model_output.current_state.next_goal}
step_info=step_info,
page_filtered_actions=page_filtered_actions,
sensitive_data=self.sensitive_data_description,
available_file_paths=self.available_file_paths,
).get_user_message(use_vision)
self._add_message_with_tokens(state_message)

View File

@@ -1,246 +0,0 @@
import pytest
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
from browser_use.agent.views import ActionResult
from browser_use.browser.views import BrowserStateSummary, TabInfo
from browser_use.dom.views import DOMElementNode, DOMTextNode
from browser_use.filesystem.file_system import FileSystem
@pytest.fixture(
params=[
ChatOpenAI(model='gpt-4o-mini'),
AzureChatOpenAI(model='gpt-4o', api_version='2024-02-15-preview'),
ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=100, temperature=0.0, stop=None),
],
ids=['gpt-4o-mini', 'gpt-4o', 'claude-3-5-sonnet'],
)
def message_manager(request: pytest.FixtureRequest):
task = 'Test task'
action_descriptions = 'Test actions'
import os
import tempfile
import uuid
base_tmp = tempfile.gettempdir() # e.g., /tmp on Unix
file_system_path = os.path.join(base_tmp, str(uuid.uuid4()))
return MessageManager(
task=task,
system_message=SystemMessage(content=action_descriptions),
settings=MessageManagerSettings(
max_input_tokens=1000,
estimated_characters_per_token=3,
image_tokens=800,
),
file_system=FileSystem(file_system_path),
)
def test_initial_messages(message_manager: MessageManager):
"""Test that message manager initializes with system and task messages"""
messages = message_manager.get_messages()
assert len(messages) == 2
assert isinstance(messages[0], SystemMessage)
assert isinstance(messages[1], HumanMessage)
assert 'Test task' in messages[1].content
def test_add_state_message(message_manager: MessageManager):
"""Test adding browser state message"""
state = BrowserStateSummary(
url='https://test.com',
title='Test Page',
element_tree=DOMElementNode(
tag_name='div',
attributes={},
children=[],
is_visible=True,
parent=None,
xpath='//div',
),
selector_map={},
tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
)
message_manager.add_state_message(browser_state_summary=state)
messages = message_manager.get_messages()
assert len(messages) == 3
assert isinstance(messages[2], HumanMessage)
assert 'https://test.com' in messages[2].content
def test_add_state_with_memory_result(message_manager: MessageManager):
"""Test adding state with result that should be included in memory"""
state = BrowserStateSummary(
url='https://test.com',
title='Test Page',
element_tree=DOMElementNode(
tag_name='div',
attributes={},
children=[],
is_visible=True,
parent=None,
xpath='//div',
),
selector_map={},
tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
)
result = ActionResult(extracted_content='Important content', include_in_memory=True)
message_manager.add_state_message(browser_state_summary=state, result=[result])
messages = message_manager.get_messages()
# Should have system, task, extracted content, and state messages
assert len(messages) == 4
assert 'Important content' in messages[2].content
assert isinstance(messages[2], HumanMessage)
assert isinstance(messages[3], HumanMessage)
assert 'Important content' not in messages[3].content
def test_add_state_with_non_memory_result(message_manager: MessageManager):
"""Test adding state with result that should not be included in memory"""
state = BrowserStateSummary(
url='https://test.com',
title='Test Page',
element_tree=DOMElementNode(
tag_name='div',
attributes={},
children=[],
is_visible=True,
parent=None,
xpath='//div',
),
selector_map={},
tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
)
result = ActionResult(extracted_content='Temporary content', include_in_memory=False)
message_manager.add_state_message(browser_state_summary=state, result=[result])
messages = message_manager.get_messages()
# Should have system, task, and combined state+result message
assert len(messages) == 3
assert 'Temporary content' in messages[2].content
assert isinstance(messages[2], HumanMessage)
@pytest.mark.skip('not sure how to fix this')
@pytest.mark.parametrize('max_tokens', [100000, 10000, 5000])
def test_token_overflow_handling_with_real_flow(message_manager: MessageManager, max_tokens):
"""Test handling of token overflow in a realistic message flow"""
# Set more realistic token limit
message_manager.settings.max_input_tokens = max_tokens
# Create a long sequence of interactions
for i in range(200): # Simulate 40 steps of interaction
# Create state with varying content length
state = BrowserStateSummary(
url=f'https://test{i}.com',
title=f'Test Page {i}',
element_tree=DOMElementNode(
tag_name='div',
attributes={},
children=[
DOMTextNode(
text=f'Content {j} ' * (10 + i), # Increasing content length
is_visible=True,
parent=None,
)
for j in range(5) # Multiple DOM items
],
is_visible=True,
parent=None,
xpath='//div',
),
selector_map={j: f'//div[{j}]' for j in range(5)},
tabs=[TabInfo(page_id=1, url=f'https://test{i}.com', title=f'Test Page {i}')],
)
# Alternate between different types of results
result = None
if i % 2 == 0: # Every other iteration
result = ActionResult(
extracted_content=f'Important content from step {i}' * 5,
include_in_memory=i % 4 == 0, # Include in memory every 4th message
)
# Add state message
if result:
message_manager.add_state_message(browser_state_summary=state, result=[result])
else:
message_manager.add_state_message(browser_state_summary=state)
try:
messages = message_manager.get_messages()
except ValueError as e:
if 'Max token limit reached - history is too long' in str(e):
return # If error occurs, end the test
else:
raise e
assert message_manager.state.history.current_tokens <= message_manager.settings.max_input_tokens + 100
last_msg = messages[-1]
assert isinstance(last_msg, HumanMessage)
if i % 4 == 0:
assert isinstance(message_manager.state.history.messages[-2].message, HumanMessage)
if i % 2 == 0 and not i % 4 == 0:
if isinstance(last_msg.content, list):
assert 'Current url: https://test' in last_msg.content[0]['text']
else:
assert 'Current url: https://test' in last_msg.content
# Add model output every time
from browser_use.agent.views import AgentBrain, AgentOutput
from browser_use.controller.registry.views import ActionModel
output = AgentOutput(
current_state=AgentBrain(
evaluation_previous_goal=f'Success in step {i}',
memory=f'Memory from step {i}',
next_goal=f'Goal for step {i + 1}',
),
action=[ActionModel()],
)
message_manager._remove_last_state_message()
message_manager.add_model_output(output)
# Get messages and verify after each addition
messages = [m.message for m in message_manager.state.history.messages]
# Verify token limit is respected
# Verify essential messages are preserved
assert isinstance(messages[0], SystemMessage) # System prompt always first
assert isinstance(messages[1], HumanMessage) # Task always second
assert 'Test task' in messages[1].content
# Verify structure of latest messages
assert isinstance(messages[-1], AIMessage) # Last message should be model output
assert f'step {i}' in messages[-1].content # Should contain current step info
# Log token usage for debugging
token_usage = message_manager.state.history.current_tokens
token_limit = message_manager.settings.max_input_tokens
# print(f'Step {i}: Using {token_usage}/{token_limit} tokens')
# go through all messages and verify that the token count and total tokens is correct
total_tokens = 0
real_tokens = []
stored_tokens = []
for msg in message_manager.state.history.messages:
total_tokens += msg.metadata.tokens
stored_tokens.append(msg.metadata.tokens)
real_tokens.append(message_manager._count_tokens(msg.message))
assert total_tokens == sum(real_tokens)
assert stored_tokens == real_tokens
assert message_manager.state.history.current_tokens == total_tokens
# pytest -s browser_use/agent/message_manager/tests.py

View File

@@ -31,42 +31,50 @@ def is_model_without_tool_support(model_name: str) -> bool:
def extract_json_from_model_output(content: str | BaseMessage) -> dict:
"""Extract JSON from model output, handling both plain JSON and code-block-wrapped JSON."""
try:
# Extract string content from BaseMessage if needed
content_str: str
if isinstance(content, BaseMessage):
# for langchain_core.messages.BaseMessage
content = content.content
msg_content = content.content
if isinstance(msg_content, list):
content_str = str(msg_content[0]) if msg_content else ''
else:
content_str = msg_content
else:
content_str = content
# If content is wrapped in code blocks, extract just the JSON part
if '```' in content:
if '```' in content_str:
# Find the JSON content between code blocks
content = content.split('```')[1]
content_str = content_str.split('```')[1]
# Remove language identifier if present (e.g., 'json\n')
if '\n' in content:
content = content.split('\n', 1)[1]
if '\n' in content_str:
content_str = content_str.split('\n', 1)[1]
# remove html-like tags before the first { and after the last }
# This handles cases like <|header_start|>assistant<|header_end|> and <function=AgentOutput>
# Only remove content before { if content doesn't already start with {
if not content.strip().startswith('{'):
content = re.sub(r'^.*?(?=\{)', '', content, flags=re.DOTALL)
if not content_str.strip().startswith('{'):
content_str = re.sub(r'^.*?(?=\{)', '', content_str, flags=re.DOTALL)
# Remove common HTML-like tags and patterns at the end, but be more conservative
# Look for patterns like </function>, <|header_start|>, etc. after the JSON
content = re.sub(r'\}(\s*<[^>]*>.*?$)', '}', content, flags=re.DOTALL)
content = re.sub(r'\}(\s*<\|[^|]*\|>.*?$)', '}', content, flags=re.DOTALL)
content_str = re.sub(r'\}(\s*<[^>]*>.*?$)', '}', content_str, flags=re.DOTALL)
content_str = re.sub(r'\}(\s*<\|[^|]*\|>.*?$)', '}', content_str, flags=re.DOTALL)
# Handle extra characters after the JSON, including stray braces
# Find the position of the last } that would close the main JSON object
content = content.strip()
content_str = content_str.strip()
if content.endswith('}'):
if content_str.endswith('}'):
# Try to parse and see if we get valid JSON
try:
json.loads(content)
json.loads(content_str)
except json.JSONDecodeError:
# If parsing fails, try to find the correct end of the JSON
# by counting braces and removing anything after the balanced JSON
brace_count = 0
last_valid_pos = -1
for i, char in enumerate(content):
for i, char in enumerate(content_str):
if char == '{':
brace_count += 1
elif char == '}':
@@ -76,14 +84,14 @@ def extract_json_from_model_output(content: str | BaseMessage) -> dict:
break
if last_valid_pos > 0:
content = content[:last_valid_pos]
content_str = content_str[:last_valid_pos]
# Fix control characters in JSON strings before parsing
# This handles cases where literal control characters appear in JSON values
content = _fix_control_characters_in_json(content)
content_str = _fix_control_characters_in_json(content_str)
# Parse the cleaned content
result_dict = json.loads(content)
result_dict = json.loads(content_str)
# if the key "function" and parameter key like "params"/"args"/"kwargs"/"parameters" are present, the final result is the value of the parameter key
if 'function' in result_dict:

View File

@@ -36,7 +36,7 @@ class SystemPrompt:
"""Load the prompt template from the markdown file."""
try:
# This works both in development and when installed as a package
with importlib.resources.files('browser_use.agent').joinpath('system_prompt.md').open('r') as f:
with importlib.resources.files('browser_use.agent').joinpath('system_prompt.md').open('r', encoding='utf-8') as f:
self.prompt_template = f.read()
except Exception as e:
raise RuntimeError(f'Failed to load system prompt template: {e}')
@@ -73,9 +73,10 @@ class AgentMessagePrompt:
page_filtered_actions: str | None = None,
max_clickable_elements_length: int = 40000,
sensitive_data: str | None = None,
available_file_paths: list[str] | None = None,
):
self.browser_state: 'BrowserStateSummary' = browser_state_summary
self.file_system: 'FileSystem' | None = file_system
self.file_system: 'FileSystem | None' = file_system
self.agent_history_description: str | None = agent_history_description
self.read_state_description: str | None = read_state_description
self.task: str | None = task
@@ -84,6 +85,7 @@ class AgentMessagePrompt:
self.page_filtered_actions: str | None = page_filtered_actions
self.max_clickable_elements_length: int = max_clickable_elements_length
self.sensitive_data: str | None = sensitive_data
self.available_file_paths: list[str] | None = available_file_paths
assert self.browser_state
def _get_browser_state_description(self) -> str:
@@ -143,7 +145,7 @@ Interactive elements from top layer of the current page inside the viewport{trun
time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
step_info_description += f'Current date and time: {time_str}'
todo_contents = self.file_system.get_todo_contents()
todo_contents = self.file_system.get_todo_contents() if self.file_system else ''
if not len(todo_contents):
todo_contents = '[Current todo.md is empty, fill it with your plan when applicable]'
@@ -152,7 +154,7 @@ Interactive elements from top layer of the current page inside the viewport{trun
{self.task}
</user_request>
<file_system>
{self.file_system.describe()}
{self.file_system.describe() if self.file_system else 'No file system available'}
</file_system>
<todo_contents>
{todo_contents}
@@ -162,13 +164,23 @@ Interactive elements from top layer of the current page inside the viewport{trun
agent_state += f'<sensitive_data>\n{self.sensitive_data}\n</sensitive_data>\n'
agent_state += f'<step_info>\n{step_info_description}\n</step_info>\n'
if self.available_file_paths:
agent_state += '<available_file_paths>\n' + '\n'.join(self.available_file_paths) + '\n</available_file_paths>\n'
return agent_state
def get_user_message(self, use_vision: bool = True) -> HumanMessage:
state_description = '<agent_history>\n' + self.agent_history_description.strip('\n') + '\n</agent_history>\n'
state_description = (
'<agent_history>\n'
+ (self.agent_history_description.strip('\n') if self.agent_history_description else '')
+ '\n</agent_history>\n'
)
state_description += '<agent_state>\n' + self._get_agent_state_description().strip('\n') + '\n</agent_state>\n'
state_description += '<browser_state>\n' + self._get_browser_state_description().strip('\n') + '\n</browser_state>\n'
state_description += '<read_state>\n' + self.read_state_description.strip('\n') + '\n</read_state>\n'
state_description += (
'<read_state>\n'
+ (self.read_state_description.strip('\n') if self.read_state_description else '')
+ '\n</read_state>\n'
)
if self.page_filtered_actions:
state_description += 'For this page, these additional actions are available:\n'
state_description += self.page_filtered_actions + '\n'

View File

@@ -17,6 +17,15 @@ from typing import Any, Generic, TypeVar
from dotenv import load_dotenv
load_dotenv()
# from lmnr.sdk.decorators import observe
from bubus import EventBus
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
from pydantic import BaseModel, ValidationError
from uuid_extensions import uuid7str
from browser_use.agent.cloud_events import (
CreateAgentOutputFileEvent,
CreateAgentSessionEvent,
@@ -24,20 +33,6 @@ from browser_use.agent.cloud_events import (
CreateAgentTaskEvent,
UpdateAgentTaskEvent,
)
load_dotenv()
# from lmnr.sdk.decorators import observe
from bubus import EventBus
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import (
BaseMessage,
HumanMessage,
SystemMessage,
)
from pydantic import BaseModel, ValidationError
from uuid_extensions import uuid7str
from browser_use.agent.gif import create_history_gif
from browser_use.agent.memory import Memory, MemoryConfig
from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
@@ -65,11 +60,13 @@ from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.browser.session import DEFAULT_BROWSER_PROFILE
from browser_use.browser.types import Browser, BrowserContext, Page
from browser_use.browser.views import BrowserStateSummary
from browser_use.config import CONFIG
from browser_use.controller.registry.views import ActionModel
from browser_use.controller.service import Controller
from browser_use.dom.history_tree_processor.service import DOMHistoryElement, HistoryTreeProcessor
from browser_use.exceptions import LLMException
from browser_use.filesystem.file_system import FileSystem
from browser_use.sync import CloudSync
from browser_use.telemetry.service import ProductTelemetry
from browser_use.telemetry.views import AgentTelemetryEvent
from browser_use.utils import (
@@ -82,8 +79,6 @@ from browser_use.utils import (
logger = logging.getLogger(__name__)
SKIP_LLM_API_KEY_VERIFICATION = os.environ.get('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[0] in 'ty1'
def log_response(response: AgentOutput, registry=None, logger=None) -> None:
"""Utility function to log the model's response."""
@@ -184,6 +179,7 @@ class Agent(Generic[Context]):
source: str | None = None,
file_system_path: str | None = None,
task_id: str | None = None,
cloud_sync: CloudSync | None = None,
):
if page_extraction_llm is None:
page_extraction_llm = llm
@@ -304,6 +300,7 @@ class Agent(Generic[Context]):
sensitive_data=sensitive_data,
available_file_paths=self.settings.available_file_paths,
),
available_file_paths=self.settings.available_file_paths,
state=self.state.message_manager_state,
)
@@ -442,18 +439,14 @@ class Agent(Generic[Context]):
self.telemetry = ProductTelemetry()
# Event bus with WAL persistence
# Default to ~/.config/browseruse/events/{agent_task_id}.jsonl
from browser_use.utils import BROWSER_USE_CONFIG_DIR
wal_path = BROWSER_USE_CONFIG_DIR / 'events' / f'{self.task_id}.jsonl'
# Default to ~/.config/browseruse/events/{agent_session_id}.jsonl
wal_path = CONFIG.BROWSER_USE_CONFIG_DIR / 'events' / f'{self.session_id}.jsonl'
self.eventbus = EventBus(name='Agent', wal_path=wal_path)
# Cloud sync service
self.enable_cloud_sync = os.environ.get('BROWSERUSE_CLOUD_SYNC', 'true').lower()[0] in 'ty1'
if self.enable_cloud_sync:
from browser_use.sync import CloudSync
self.cloud_sync = CloudSync()
self.enable_cloud_sync = CONFIG.BROWSER_USE_CLOUD_SYNC
if self.enable_cloud_sync or cloud_sync is not None:
self.cloud_sync = cloud_sync or CloudSync()
# Register cloud sync handler
self.eventbus.on('*', self.cloud_sync.handle_event)
@@ -501,36 +494,6 @@ class Agent(Generic[Context]):
logger.info(f'💾 File system path: {self.file_system_path}')
# if file system is set, add actions to the controller
@self.controller.registry.action('Write content to file_name in file system, use only .md or .txt extensions.')
async def write_file(file_name: str, content: str):
result = await self.file_system.write_file(file_name, content)
logger.info(f'💾 {result}')
return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result)
@self.controller.registry.action('Append content to file_name in file system')
async def append_file(file_name: str, content: str):
result = await self.file_system.append_file(file_name, content)
logger.info(f'💾 {result}')
return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result)
@self.controller.registry.action('Read file_name from file system')
async def read_file(file_name: str):
result = await self.file_system.read_file(file_name)
max_len = 50
if len(result) > max_len:
display_result = result[:max_len] + '\n...'
else:
display_result = result
logger.info(f'💾 {display_result}')
memory = result.split('\n')[-1]
return ActionResult(
extracted_content=result,
include_in_memory=True,
long_term_memory=memory,
include_extracted_content_only_once=True,
)
def _set_message_context(self) -> str | None:
if self.tool_calling_method == 'raw':
# For raw tool calling, only include actions with no filters initially
@@ -819,7 +782,7 @@ class Agent(Generic[Context]):
# If a specific method is set, use it
if self.settings.tool_calling_method != 'auto':
# Skip test if already verified
if getattr(self.llm, '_verified_api_keys', None) is True or SKIP_LLM_API_KEY_VERIFICATION:
if getattr(self.llm, '_verified_api_keys', None) is True or CONFIG.SKIP_LLM_API_KEY_VERIFICATION:
setattr(self.llm, '_verified_api_keys', True)
setattr(self.llm, '_verified_tool_calling_method', self.settings.tool_calling_method)
return self.settings.tool_calling_method
@@ -847,7 +810,7 @@ class Agent(Generic[Context]):
known_method = self._get_known_tool_calling_method()
if known_method is not None:
# Trust known combinations without testing if verification is already done or skipped
if getattr(self.llm, '_verified_api_keys', None) is True or SKIP_LLM_API_KEY_VERIFICATION:
if getattr(self.llm, '_verified_api_keys', None) is True or CONFIG.SKIP_LLM_API_KEY_VERIFICATION:
setattr(self.llm, '_verified_api_keys', True)
setattr(self.llm, '_verified_tool_calling_method', known_method) # Cache on LLM instance
self.logger.debug(
@@ -1713,7 +1676,7 @@ class Agent(Generic[Context]):
assert browser_state_summary
content = AgentMessagePrompt(
browser_state_summary=browser_state_summary,
result=self.state.last_result,
file_system=self.file_system,
include_attributes=self.settings.include_attributes,
)
msg = [SystemMessage(content=system_msg), content.get_user_message(self.settings.use_vision)]
@@ -1960,7 +1923,7 @@ class Agent(Generic[Context]):
self.tool_calling_method = self._set_tool_calling_method()
# Skip verification if already done
if getattr(self.llm, '_verified_api_keys', None) is True or SKIP_LLM_API_KEY_VERIFICATION:
if getattr(self.llm, '_verified_api_keys', None) is True or CONFIG.SKIP_LLM_API_KEY_VERIFICATION:
setattr(self.llm, '_verified_api_keys', True)
return True

View File

@@ -1,4 +1,4 @@
You are a tool-using AI agent designed operating in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
<intro>
You excel at following tasks:
@@ -12,46 +12,38 @@ You excel at following tasks:
<language_settings>
- Default working language: **English**
- Use the language specified by user in messages as the working language in all messages and tool calls
- Use the language specified by user in messages as the working language
</language_settings>
<input>
At every step, you will be given a state with:
1. Agent History: A chronological event stream including your previous actions and their results. This may be partially omitted.
2. User Request: This is your ultimate objective and always remains visible.
3. Agent State: Current progress, and relevant contextual memory.
4. Browser State: Contains current URL, open tabs, interactive elements indexed for actions, visible page content, and (sometimes) screenshots.
4. Read State: If your previous action involved reading a file or extracting content (e.g., from a webpage), the full result will be included here. This data is **only shown in the current step** and will not appear in future Agent History. You are responsible for saving or interpreting the information appropriately during this step into your file system.
At every step, your input will consist of:
1. <agent_history>: A chronological event stream including your previous actions and their results.
2. <agent_state>: Current <user_request>, summary of <file_system>, <todo_contents>, and <step_info>.
3. <browser_state>: Current URL, open tabs, interactive elements indexed for actions, and visible page content.
4. <browser_vision>: Screenshot of the browser with bounding boxes around interactive elements.
5. <read_state> This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step.
</input>
<agent_history>
Agent history will be given as a list of step information as follows:
Step step_number:
<step_{{step_number}}>:
Evaluation of Previous Step: Assessment of last action
Memory: Agent generated memory of this step
Actions: Agent generated actions
Action Results: System generated result of those actions
Memory: Your memory of this step
Next Goal: Your goal for this step
Action Results: Your actions and their results
</step_{{step_number}}>
and system messages wrapped in <system> tag.
</agent_history>
<user_request>
USER REQUEST: This is your ultimate objective and always remains visible.
- This has the highest priority. Make the user happy.
- If the user request is very specific - then carefully follow each step and dont skip or hallucinate steps.
- If the task is open ended you can plan more yourself how to get it done.
- If the task is open ended you can plan yourself how to get it done.
</user_request>
<agent_state>
Agent State will be given as follows:
File System: A summary of your available files in the format:
- file_name — num_lines lines
Current Step: The step in the agent loop.
Timestamp: Current date.
</agent_state>
<browser_state>
1. Browser State will be given as:
@@ -74,14 +66,10 @@ Note that:
</browser_state>
<browser_vision>
When a screenshot is provided, analyse it to understand the interactive elements and try to understand what each interactive element is for. Bounding box labels correspond to element indexes.
You will be optionally provided with a screenshot of the browser with bounding boxes. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress.
Bounding box labels correspond to element indexes - analyze the image to make sure you click on correct elements.
</browser_vision>
<read_state>
1. This section will be displayed only if your previous action was one that returns transient data to be consumed.
2. You will see this information **only during this step** in your state. ALWAYS make sure to save this information if it will be needed later.
</read_state>
<browser_rules>
Strictly follow these rules while using the browser and navigating the web:
- Only interact with elements that have a numeric [index] assigned.
@@ -91,12 +79,13 @@ Strictly follow these rules while using the browser and navigating the web:
- By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page. The extract content action gets the full loaded page content.
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
- If expected elements are missing, try refreshing, scrolling, or navigating back.
- Use multiple actions where no page transition is expected (e.g., fill multiple fields then click submit).
- If the page is not fully loaded, use the wait action.
- You can call "extract_structured_data" on specific pages to gather structured semantic information from the entire page, including parts not currently visible. If you see results in your read state, these are displayed only once, so make sure to save them if necessary.
- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible. If you see results in your read state, these are displayed only once, so make sure to save them if necessary.
- Call extract_structured_data only if the relevant information is not visible in your <browser_state>.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the USER REQUEST includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient. Sometimes you need to scroll to see all filter options.
- The USER REQUEST is the ultimate goal. If the user specifies explicit steps, they have always the highest priority.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
- The <user_request> is the ultimate goal. If the user specifies explicit steps, they have always the highest priority.
- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion.
</browser_rules>
<file_system>
@@ -105,9 +94,11 @@ Strictly follow these rules while using the browser and navigating the web:
1. `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). The contents of this file will be also visible in your state. ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
2. `results.md`: Use this to accumulate extracted or generated results for the user. Append each new finding clearly and avoid duplication. This file serves as your output log.
- You can read, write, and append to files.
- Note that `write_file` rewrites the entire file, so make sure to repeat all the existing information if you use this action.
- Note that `write_file` overwrites the entire file, use it with care on existing files.
- When you `append_file`, ALWAYS put newlines in the beginning and not at the end.
- If the file is too large, you are only given a preview of your file. Use read_file to see the full content if necessary.
- Always use the file system as the source of truth. Do not rely on memory alone for tracking task state.
- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
</file_system>
<task_completion_rules>
@@ -129,11 +120,11 @@ The `done` action is your opportunity to terminate and share your findings with
- You are allowed to use a maximum of {max_actions} actions per step.
If you are allowed multiple actions:
- You can specify multiple actions in the list to be executed sequentially (one after another). But always specify only one action name per item.
- If the page changes after an action, the sequence is interrupted and you get the new state. You might have to repeat the same action again so that your changes are reflected in the new state.
- ONLY use multiple actions when actions should not change the page state significantly.
- You can specify multiple actions in the list to be executed sequentially (one after another).
- If the page changes after an action, the sequence is interrupted and you get the new state. You can see this in your agent history when this happens.
- At every step, use ONLY ONE action to interact with the browser. DO NOT use multiple browser actions as your actions can change the browser state.
If you are allowed 1 action, ALWAYS output only 1 most reasonable action per step. If you have something in your read_state, always prioritize saving the data first.
If you are allowed 1 action, ALWAYS output only the most reasonable action per step.
</action_rules>
<reasoning_rules>
@@ -147,8 +138,10 @@ Exhibit the following reasoning patterns to successfully achieve the <user_reque
- If todo.md is empty and the task is multi-step, generate a stepwise plan in todo.md using file tools.
- Analyze `todo.md` to guide and track your progress.
- If any todo.md items are finished, mark them as complete in the file.
- Analyze whether you are stuck in the same goal for a few steps. If so, try alternative methods.
- Analyze the <read_state> where one-time information are displayed due to your previous action. Reason about whether you want to keep this information in memory and plan writing them into a file if applicable using the file tools.
- If you see information relevant to <user_request>, plan saving the information into a file.
- Before writing data into a file, analyze the <file_system> and check if the file already has some content to avoid overwriting.
- Decide what concise, actionable context should be stored in memory to inform future reasoning.
- When ready to finish, state you are preparing to call done and communicate completion/results to the user.
- Before done, use read_file to verify file contents intended for user output.

View File

@@ -1,197 +0,0 @@
import pytest
from browser_use.agent.views import (
ActionResult,
AgentBrain,
AgentHistory,
AgentHistoryList,
AgentOutput,
)
from browser_use.browser.views import BrowserStateHistory, BrowserStateSummary, TabInfo
from browser_use.controller.registry.service import Registry
from browser_use.controller.views import ClickElementAction, DoneAction, ExtractPageContentAction
from browser_use.dom.views import DOMElementNode
@pytest.fixture
def sample_browser_state():
	"""A minimal single-tab browser state snapshot used by the history tests."""
	# Bare DOM root with no children — the tests never traverse the tree,
	# they only need a structurally valid element_tree.
	empty_root = DOMElementNode(
		tag_name='root',
		is_visible=True,
		parent=None,
		xpath='',
		attributes={},
		children=[],
	)
	return BrowserStateSummary(
		url='https://example.com',
		title='Example Page',
		tabs=[TabInfo(url='https://example.com', title='Example Page', page_id=1)],
		screenshot='screenshot1.png',
		element_tree=empty_root,
		selector_map={},
	)
@pytest.fixture
def action_registry():
	"""Build a dynamic ActionModel exposing the three actions the tests use.

	The registry derives each action's name from the decorated function's
	name (click_element / extract_page_content / done), so those names must
	stay in sync with the dict keys asserted in the tests below.
	"""
	registry = Registry()

	# Register the actions we need for testing
	@registry.action(description='Click an element', param_model=ClickElementAction)
	def click_element(params: ClickElementAction, browser=None):
		pass

	@registry.action(
		description='Extract page content',
		param_model=ExtractPageContentAction,
	)
	def extract_page_content(params: ExtractPageContentAction, browser=None):
		pass

	@registry.action(description='Mark task as done', param_model=DoneAction)
	def done(params: DoneAction):
		pass

	# Create the dynamic ActionModel with all registered actions
	return registry.create_action_model()
@pytest.fixture
def sample_history(action_registry):
	"""Three-step agent history: click a button, extract content, then done.

	Step 2 carries both extracted content and an error; step 3 is the
	terminal `done` step that marks the run complete.
	"""

	def make_step(evaluation, memory, goal, action, results, url, title, page_id, screenshot, xpath):
		# One AgentHistory entry: model output + action results + page state.
		return AgentHistory(
			model_output=AgentOutput(
				current_state=AgentBrain(
					evaluation_previous_goal=evaluation,
					memory=memory,
					next_goal=goal,
				),
				action=[action],
			),
			result=results,
			state=BrowserStateHistory(
				url=url,
				title=title,
				tabs=[TabInfo(url=url, title=title, page_id=page_id)],
				screenshot=screenshot,
				interacted_element=[{'xpath': xpath}],
			),
		)

	steps = [
		make_step(
			'None',
			'Started task',
			'Click button',
			action_registry(click_element={'index': 1}),
			[ActionResult(is_done=False)],
			'https://example.com',
			'Page 1',
			1,
			'screenshot1.png',
			'//button[1]',
		),
		make_step(
			'Clicked button',
			'Button clicked',
			'Extract content',
			action_registry(extract_page_content={'value': 'text'}),
			[
				ActionResult(
					is_done=False,
					extracted_content='Extracted text',
					error='Failed to extract completely',
				)
			],
			'https://example.com/page2',
			'Page 2',
			2,
			'screenshot2.png',
			'//div[1]',
		),
		make_step(
			'Extracted content',
			'Content extracted',
			'Finish task',
			action_registry(done={'text': 'Task completed'}),
			[ActionResult(is_done=True, extracted_content='Task completed', error=None)],
			'https://example.com/page2',
			'Page 2',
			2,
			'screenshot3.png',
			'//div[1]',
		),
	]
	return AgentHistoryList(history=steps)
def test_last_model_output(sample_history: AgentHistoryList):
	"""The last recorded action is the terminal `done` call with its payload."""
	# Removed leftover debug print; the assertion message is sufficient on failure.
	assert sample_history.last_action() == {'done': {'text': 'Task completed'}}
def test_get_errors(sample_history: AgentHistoryList):
	"""Exactly one step failed, and its error message is surfaced verbatim."""
	collected = sample_history.errors()
	assert len(collected) == 1
	assert collected[0] == 'Failed to extract completely'
def test_final_result(sample_history: AgentHistoryList):
	"""The extracted content of the terminal `done` step is the final result."""
	result = sample_history.final_result()
	assert result == 'Task completed'
def test_is_done(sample_history: AgentHistoryList):
	"""A history whose last step ran `done` reports completion."""
	finished = sample_history.is_done()
	assert finished is True
def test_urls(sample_history: AgentHistoryList):
	"""Both pages visited during the run appear in the URL list."""
	visited = sample_history.urls()
	for expected in ('https://example.com', 'https://example.com/page2'):
		assert expected in visited
def test_all_screenshots(sample_history: AgentHistoryList):
	"""One screenshot per step, returned in chronological order."""
	captured = sample_history.screenshots()
	# Equality against the full expected list also pins the length to 3.
	assert captured == ['screenshot1.png', 'screenshot2.png', 'screenshot3.png']
def test_all_model_outputs(sample_history: AgentHistoryList):
	"""model_actions() yields one action dict per step, in execution order."""
	# Removed leftover `print(f'DEBUG: ...')` and deduplicated the thrice-repeated
	# dict([next(iter(...))]) idiom into a single loop.
	outputs = sample_history.model_actions()
	assert len(outputs) == 3
	expected_first_pairs = [
		{'click_element': {'index': 1}},
		{'extract_page_content': {'value': 'text'}},
		{'done': {'text': 'Task completed'}},
	]
	# Each action dict may carry extra metadata keys; only its first
	# key/value pair identifies the action name and its params.
	for output, expected in zip(outputs, expected_first_pairs):
		key, value = next(iter(output.items()))
		assert {key: value} == expected
def test_all_model_outputs_filtered(sample_history: AgentHistoryList):
	"""Filtering by action name returns only the matching click action."""
	clicks = sample_history.model_actions_filtered(include=['click_element'])
	assert len(clicks) == 1
	assert clicks[0]['click_element']['index'] == 1
def test_empty_history():
	"""All accessors degrade gracefully (None/False/empty) on an empty run."""
	history = AgentHistoryList(history=[])
	assert history.last_action() is None
	assert history.final_result() is None
	assert history.is_done() is False
	assert len(history.urls()) == 0
# Add a test to verify action creation
def test_action_creation(action_registry):
	"""A dynamically built action serializes back to its nested params dict."""
	action = action_registry(click_element={'index': 1})
	dumped = action.model_dump(exclude_none=True)
	assert dumped == {'click_element': {'index': 1}}
# run this with:
# pytest browser_use/agent/tests.py

View File

@@ -1,4 +1,3 @@
import os
import sys
from collections.abc import Iterable
from enum import Enum
@@ -12,9 +11,9 @@ from pydantic import AfterValidator, AliasChoices, BaseModel, ConfigDict, Field,
from uuid_extensions import uuid7str
from browser_use.browser.types import ClientCertificate, Geolocation, HttpCredentials, ProxySettings, ViewportSize
from browser_use.config import CONFIG
from browser_use.utils import _log_pretty_path, logger
IN_DOCKER = os.environ.get('IN_DOCKER', 'false').lower()[0] in 'ty1'
CHROME_DEBUG_PORT = 9242 # use a non-default port to avoid conflicts with other tools / devs using 9222
CHROME_DISABLED_COMPONENTS = [
# Playwright defaults: https://github.com/microsoft/playwright/blob/41008eeddd020e2dee1c540f7c0cdfa337e99637/packages/playwright-core/src/server/chromium/chromiumSwitches.ts#L76
@@ -286,9 +285,7 @@ class BrowserChannel(str, Enum):
MSEDGE_CANARY = 'msedge-canary'
BROWSERUSE_CONFIG_DIR = Path('~/.config/browseruse').expanduser().resolve()
BROWSERUSE_PROFILES_DIR = BROWSERUSE_CONFIG_DIR / 'profiles'
BROWSERUSE_CHROMIUM_USER_DATA_DIR = BROWSERUSE_PROFILES_DIR / 'default'
# Using constants from central location in browser_use.config
BROWSERUSE_DEFAULT_CHANNEL = BrowserChannel.CHROMIUM
@@ -420,7 +417,7 @@ class BrowserLaunchArgs(BaseModel):
)
channel: BrowserChannel | None = None # https://playwright.dev/docs/browsers#chromium-headless-shell
chromium_sandbox: bool = Field(
default=not IN_DOCKER, description='Whether to enable Chromium sandboxing (recommended unless inside Docker).'
default=not CONFIG.IN_DOCKER, description='Whether to enable Chromium sandboxing (recommended unless inside Docker).'
)
devtools: bool = Field(
default=False, description='Whether to open DevTools panel automatically for every page, only works when headless=False.'
@@ -519,7 +516,7 @@ class BrowserLaunchPersistentContextArgs(BrowserLaunchArgs, BrowserContextArgs):
model_config = ConfigDict(extra='ignore', validate_assignment=False, revalidate_instances='always')
# Required parameter specific to launch_persistent_context, but can be None to use incognito temp dir
user_data_dir: str | Path | None = BROWSERUSE_CHROMIUM_USER_DATA_DIR
user_data_dir: str | Path | None = CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR
class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, BrowserLaunchArgs, BrowserNewContextArgs):
@@ -647,7 +644,7 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
"""
is_not_using_default_chromium = self.executable_path or self.channel not in (BROWSERUSE_DEFAULT_CHANNEL, None)
if self.user_data_dir == BROWSERUSE_CHROMIUM_USER_DATA_DIR and is_not_using_default_chromium:
if self.user_data_dir == CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR and is_not_using_default_chromium:
alternate_name = (
Path(self.executable_path).name.lower().replace(' ', '-')
if self.executable_path
@@ -658,7 +655,16 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
logger.warning(
f'⚠️ {self} Changing user_data_dir= {_log_pretty_path(self.user_data_dir)} ➡️ .../default-{alternate_name} to avoid {alternate_name.upper()} corruping default profile created by {BROWSERUSE_DEFAULT_CHANNEL.name}'
)
self.user_data_dir = BROWSERUSE_CHROMIUM_USER_DATA_DIR.parent / f'default-{alternate_name}'
self.user_data_dir = CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR.parent / f'default-{alternate_name}'
return self
@model_validator(mode='after')
def warn_deterministic_rendering_weirdness(self) -> Self:
if self.deterministic_rendering:
logger.warning(
'⚠️ BrowserSession(deterministic_rendering=True) is NOT RECOMMENDED. It breaks many sites and increases chances of getting blocked by anti-bot systems. '
'It hardcodes the JS random seed and forces browsers across Linux/Mac/Windows to use the same font rendering engine so that identical screenshots can be generated.'
)
return self
def get_args(self) -> list[str]:
@@ -676,7 +682,7 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
*default_args,
*self.args,
f'--profile-directory={self.profile_directory}',
*(CHROME_DOCKER_ARGS if IN_DOCKER else []),
*(CHROME_DOCKER_ARGS if CONFIG.IN_DOCKER else []),
*(CHROME_HEADLESS_ARGS if self.headless else []),
*(CHROME_DISABLE_SECURITY_ARGS if self.disable_security else []),
*(CHROME_DETERMINISTIC_RENDERING_ARGS if self.deterministic_rendering else []),

View File

@@ -1,11 +1,14 @@
from __future__ import annotations
import asyncio
import atexit
import base64
import json
import logging
import os
import re
import shutil
import tempfile
import time
from dataclasses import dataclass
from functools import wraps
@@ -13,6 +16,7 @@ from pathlib import Path
from typing import Any, Self
from urllib.parse import urlparse
from browser_use.config import CONFIG
from browser_use.utils import _log_pretty_path, _log_pretty_url
os.environ['PW_TEST_SCREENSHOT_NO_FONTS_READY'] = '1' # https://github.com/microsoft/playwright/issues/35972
@@ -45,10 +49,6 @@ from browser_use.dom.service import DomService
from browser_use.dom.views import DOMElementNode, SelectorMap
from browser_use.utils import match_url_with_domain_pattern, merge_dicts, time_execution_async, time_execution_sync
# Check if running in Docker
IN_DOCKER = os.environ.get('IN_DOCKER', 'false').lower()[0] in 'ty1'
_GLOB_WARNING_SHOWN = False # used inside _is_url_allowed to avoid spamming the logs with the same warning multiple times
GLOBAL_PLAYWRIGHT_API_OBJECT = None # never instantiate the playwright API object more than once per thread
@@ -83,7 +83,7 @@ def require_initialization(func):
if not self.agent_current_page or self.agent_current_page.is_closed():
self.agent_current_page = (
self.browser_context.pages[0] if (self.browser_context and self.browser_context.pages) else None
self.browser_context.pages[0] if (self.browser_context and len(self.browser_context.pages) > 0) else None
)
if not self.agent_current_page or self.agent_current_page.is_closed():
@@ -260,13 +260,13 @@ class BrowserSession(BaseModel):
"""
Starts the browser session by either connecting to an existing browser or launching a new one.
Precedence order for launching/connecting:
1. page=Page playwright object, will use its page.context as browser_context
2. browser_context=PlaywrightBrowserContext object, will use its browser
3. browser=PlaywrightBrowser object, will use its first available context
4. browser_pid=int, will connect to a local chromium-based browser via pid
5. wss_url=str, will connect to a remote playwright browser server via WSS
6. cdp_url=str, will connect to a remote chromium-based browser via CDP
7. playwright=Playwright object, will use its chromium instance to launch a new browser
1. page=Page playwright object, will use its page.context as browser_context
2. browser_context=PlaywrightBrowserContext object, will use its browser
3. browser=PlaywrightBrowser object, will use its first available context
4. browser_pid=int, will connect to a local chromium-based browser via pid
5. wss_url=str, will connect to a remote playwright browser server via WSS
6. cdp_url=str, will connect to a remote chromium-based browser via CDP
7. playwright=Playwright object, will use its chromium instance to launch a new browser
"""
# if we're already initialized and the connection is still valid, return the existing session state; otherwise tear down and start from scratch
@@ -276,7 +276,7 @@ class BrowserSession(BaseModel):
async with asyncio.timeout(60): # 60 second overall timeout for entire launching process to avoid deadlocks
async with self._start_lock: # prevent parallel calls to start() / stop() / save_storage_state() from clashing
if self.initialized:
if self.is_connected():
if await self.is_connected():
return self
else:
next_step = (
@@ -353,7 +353,7 @@ class BrowserSession(BaseModel):
async with self._start_lock:
# save cookies to disk if cookies_file or storage_state is configured
# but only if the browser context is still connected
if self.is_connected():
if await self.is_connected():
try:
await asyncio.wait_for(self.save_storage_state(), timeout=5)
except Exception as e:
@@ -414,9 +414,10 @@ class BrowserSession(BaseModel):
except TimeoutError:
self.logger.warning('⏱️ Timeout while closing browser/context, has it become unresponsive?')
except Exception as e:
self.logger.warning(
f'❌ Error closing playwright browser_context={self.browser_context}: {type(e).__name__}: {e}'
)
if 'browser has been closed' not in str(e):
self.logger.warning(
f'❌ Error closing playwright browser_context={self.browser_context}: {type(e).__name__}: {e}'
)
finally:
# Always clear references to ensure a fresh start next time
self.browser_context = None
@@ -426,7 +427,8 @@ class BrowserSession(BaseModel):
if self.browser_pid:
try:
proc = psutil.Process(pid=self.browser_pid)
executable_path = proc.cmdline()[0]
cmdline = proc.cmdline()
executable_path = cmdline[0] if cmdline else 'unknown'
self.logger.info(f' ↳ Killing browser_pid={self.browser_pid} {_log_pretty_path(executable_path)}')
# Add timeout for process termination
try:
@@ -440,12 +442,20 @@ class BrowserSession(BaseModel):
)
proc.kill() # Force kill if terminate didn't work
self.browser_pid = None
except psutil.NoSuchProcess:
self.browser_pid = None
except Exception as e:
if 'NoSuchProcess' not in type(e).__name__:
self.logger.debug(
f'❌ Error terminating subprocess with browser_pid={self.browser_pid}: {type(e).__name__}: {e}'
)
# if the user_data_dir is a temporary one, delete it
if self.browser_profile.user_data_dir and Path(self.browser_profile.user_data_dir).name.startswith(
'browseruse-tmp'
):
shutil.rmtree(self.browser_profile.user_data_dir, ignore_errors=True)
self._reset_connection_state()
# self.logger.debug('🛑 Shutdown complete.')
@@ -461,31 +471,8 @@ class BrowserSession(BaseModel):
self.browser_profile.keep_alive = False
await self.stop()
# Clean up playwright instance to prevent background tasks from running
if self.playwright:
try:
await self.playwright.stop()
# Give playwright tasks a moment to clean up properly
# This prevents "Task was destroyed but it is pending!" warnings
await asyncio.sleep(0.1)
# self.logger.debug('🎭 Stopped playwright node.js API worker')
except Exception as e:
self.logger.warning(f'❌ Error stopping playwright node.js API subprocess: {type(e).__name__}: {e}')
finally:
# Clear global references if they match this instance
global GLOBAL_PLAYWRIGHT_API_OBJECT, GLOBAL_PATCHRIGHT_API_OBJECT
global GLOBAL_PLAYWRIGHT_EVENT_LOOP, GLOBAL_PATCHRIGHT_EVENT_LOOP
if self.playwright == GLOBAL_PLAYWRIGHT_API_OBJECT:
GLOBAL_PLAYWRIGHT_API_OBJECT = None
GLOBAL_PLAYWRIGHT_EVENT_LOOP = None
# self.logger.debug('🧹 Cleared global playwright references')
elif self.playwright == GLOBAL_PATCHRIGHT_API_OBJECT:
GLOBAL_PATCHRIGHT_API_OBJECT = None
GLOBAL_PATCHRIGHT_EVENT_LOOP = None
# self.logger.debug('🧹 Cleared global patchright references')
self.playwright = None
# do not stop self.playwright here as its likely used by other parallel browser_sessions
# let it be cleaned up by the garbage collector when no refs use it anymore
async def new_context(self, **kwargs):
"""Deprecated: Provides backwards-compatibility with old class method Browser().new_context()."""
@@ -578,56 +565,6 @@ class BrowserSession(BaseModel):
GLOBAL_PLAYWRIGHT_EVENT_LOOP = current_loop
return GLOBAL_PLAYWRIGHT_API_OBJECT
def _kill_child_processes(self) -> None:
    """Kill any child processes that might be related to the browser.

    Attempts a graceful terminate() first so the browser can flush its
    user_data_dir changes to disk, waits up to 5 seconds, then force-kills
    the entire process tree (children first, then the main process).
    Only runs when keep_alive is disabled and a browser_pid is known.
    """
    if not self.browser_profile.keep_alive and self.browser_pid:
        try:
            browser_proc = psutil.Process(self.browser_pid)
            try:
                browser_proc.terminate()
                browser_proc.wait(
                    timeout=5
                )  # wait up to 5 seconds for the process to exit cleanly and commit its user_data_dir changes
            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.TimeoutExpired, TimeoutError):
                # BUGFIX: psutil.Process.wait() raises psutil.TimeoutExpired (a psutil.Error
                # subclass, NOT the builtin TimeoutError) when the timeout elapses. Catching
                # only TimeoutError let a slow shutdown escape this handler, hit the outer
                # `except Exception` below, and skip force-killing the process tree entirely.
                pass
            # Kill all child processes first (recursive) so no orphaned helpers survive
            for child in browser_proc.children(recursive=True):
                try:
                    # self.logger.debug(f'Force killing child process: {child.pid} ({child.name()})')
                    child.kill()
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    pass
            # Kill the main browser process
            # self.logger.debug(f'Force killing browser process: {self.browser_pid}')
            browser_proc.kill()
        except psutil.NoSuchProcess:
            # already gone — nothing to clean up
            pass
        except Exception as e:
            self.logger.warning(f'Error force-killing browser in BrowserSession.__del__: {type(e).__name__}: {e}')
@staticmethod
async def _start_global_playwright_subprocess(is_stealth: bool) -> PlaywrightOrPatchright:
    """Create and return a new playwright or patchright node.js subprocess / API connector.

    Also records the created API object and the event loop it was created on in the
    module-level globals so other sessions can reuse the same subprocess.
    """
    global GLOBAL_PLAYWRIGHT_API_OBJECT, GLOBAL_PATCHRIGHT_API_OBJECT
    global GLOBAL_PLAYWRIGHT_EVENT_LOOP, GLOBAL_PATCHRIGHT_EVENT_LOOP

    # remember which event loop the API object belongs to (None outside a loop)
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = None

    if not is_stealth:
        GLOBAL_PLAYWRIGHT_API_OBJECT = await async_playwright().start()
        GLOBAL_PLAYWRIGHT_EVENT_LOOP = loop
        return GLOBAL_PLAYWRIGHT_API_OBJECT

    # stealth mode uses patchright (a patched playwright fork) instead
    GLOBAL_PATCHRIGHT_API_OBJECT = await async_patchright().start()
    GLOBAL_PATCHRIGHT_EVENT_LOOP = loop
    return GLOBAL_PATCHRIGHT_API_OBJECT
async def setup_playwright(self) -> None:
"""
Set up playwright library client object: usually the result of (await async_playwright().start())
@@ -694,6 +631,22 @@ class BrowserSession(BaseModel):
if self.browser_profile.headless or not self.browser_profile.no_viewport:
self.logger.info(' 🪄 For maximum stealth, BrowserSession(...) should be passed headless=False & viewport=None')
# register a shutdown hook to stop the shared global playwright node.js client when the program exits (if an event loop is still running)
def shutdown_playwright():
    """atexit hook: best-effort schedule playwright.stop() on the running loop.

    Fixes typo in the original hook name ('shudown_playwright').
    Silently no-ops if no playwright client is set or no event loop is running.
    """
    if not self.playwright:
        return
    try:
        loop = asyncio.get_running_loop()
        self.logger.debug('🛑 Shutting down shared global playwright node.js client')
        task = loop.create_task(self.playwright.stop())
        # suppress the "Task was destroyed but it is pending!" warning at interpreter exit
        if hasattr(task, '_log_destroy_pending'):
            task._log_destroy_pending = False  # type: ignore
    except Exception:
        pass
    self.playwright = None

atexit.register(shutdown_playwright)
async def setup_browser_via_passed_objects(self) -> None:
"""Override to customize the set up of the connection to an existing browser"""
@@ -735,14 +688,36 @@ class BrowserSession(BaseModel):
if not self.browser_pid:
return # no browser_pid provided, nothing to do
chrome_process = psutil.Process(pid=self.browser_pid)
assert chrome_process.is_running(), 'Chrome process is not running'
args = chrome_process.cmdline()
# check that browser_pid process is running, otherwise we cannot connect to it
try:
chrome_process = psutil.Process(pid=self.browser_pid)
if not chrome_process.is_running():
self.logger.warning(f'Chrome process with pid={self.browser_pid} is not running')
return
args = chrome_process.cmdline()
except psutil.NoSuchProcess:
self.logger.warning(f'Chrome process with pid={self.browser_pid} not found')
return
except Exception as e:
self.browser_pid = None
self.logger.warning(f'Error accessing chrome process with pid={self.browser_pid}: {type(e).__name__}: {e}')
return
# check that browser_pid process is exposing a debug port we can connect to, otherwise we cannot connect to it
debug_port = next((arg for arg in args if arg.startswith('--remote-debugging-port=')), '').split('=')[-1].strip()
assert debug_port, (
f'Could not find --remote-debugging-port=... to connect to in browser launch args: browser_pid={self.browser_pid} {args}'
)
# we could automatically relaunch the browser process with that arg added here, but they may have tabs open they dont want to lose
if not debug_port:
# provided pid is unusable, it's either not running or doesnt have an open debug port we can connect to
if '--remote-debugging-pipe' in args:
self.logger.error(
f'❌ Found --remote-debugging-pipe in browser launch args for browser_pid={self.browser_pid} but it was started by a different BrowserSession, cannot connect to it'
)
else:
self.logger.error(
f'❌ Could not find --remote-debugging-port=... to connect to in browser launch args for browser_pid={self.browser_pid}: {" ".join(args)}'
)
self.browser_pid = None
return
self.cdp_url = self.cdp_url or f'http://localhost:{debug_port}/'
self.logger.info(f'🌎 Connecting to existing local browser process: browser_pid={self.browser_pid} on {self.cdp_url}')
assert self.playwright is not None, 'playwright instance is None'
@@ -815,112 +790,86 @@ class BrowserSession(BaseModel):
f'{str(type(self.playwright).__module__).split(".")[0]}:{self.browser_profile.channel.name.lower()} keep_alive={self.browser_profile.keep_alive or False} '
f'user_data_dir= {_log_pretty_path(self.browser_profile.user_data_dir) or "<incognito>"}'
)
# if no user_data_dir is provided, generate a unique one for this temporary browser_context (will be used to uniquely identify the browser_pid later)
if not self.browser_profile.user_data_dir:
# self.logger.debug('🌎 Launching local browser in incognito mode')
# if no user_data_dir is provided, launch an incognito context with no persistent user_data_dir
try:
assert self.playwright is not None, 'playwright instance is None'
async with asyncio.timeout(10): # Reduced timeout from 30s to 10s
self.browser = self.browser or await self.playwright.chromium.launch(
**self.browser_profile.kwargs_for_launch().model_dump()
)
# self.logger.debug('🌎 Launching new incognito context in browser')
async with asyncio.timeout(10): # Reduced timeout from 30s to 10s
self.browser_context = await self.browser.new_context(
**self.browser_profile.kwargs_for_new_context().model_dump(mode='json')
)
except TimeoutError:
self.logger.warning(
'Browser operation timed out. This may indicate the playwright instance is invalid due to event loop changes. '
'Recreating playwright instance and retrying...'
)
# Force recreation of the playwright object
self.playwright = await self._start_global_playwright_subprocess(is_stealth=self.browser_profile.stealth)
# Retry the operation with the new playwright instance
assert self.playwright is not None, 'playwright instance is None'
async with asyncio.timeout(10):
self.browser = await self.playwright.chromium.launch(
**self.browser_profile.kwargs_for_launch().model_dump()
)
async with asyncio.timeout(10):
self.browser_context = await self.browser.new_context(
**self.browser_profile.kwargs_for_new_context().model_dump()
)
# self.logger.debug('🌎 Created new incognito context in browser')
else:
# user data dir was provided, prepare it for use
self.prepare_user_data_dir()
# if no user_data_dir is provided, generate a unique one for this temporary browser_context (will be used to uniquely identify the browser_pid later)
self.browser_profile.user_data_dir = self.browser_profile.user_data_dir or Path(
tempfile.mkdtemp(prefix='browseruse-tmp-')
)
# search for potentially conflicting local processes running on the same user_data_dir
for proc in psutil.process_iter(['pid', 'cmdline']):
if f'--user-data-dir={self.browser_profile.user_data_dir}' in (proc.info['cmdline'] or []):
self.logger.error(
f'🚨 Found potentially conflicting browser process browser_pid={proc.info["pid"]} '
f'already running with the same user_data_dir= {_log_pretty_path(self.browser_profile.user_data_dir)}'
)
break
# user data dir was provided, prepare it for use
self.prepare_user_data_dir()
# if a user_data_dir is provided, launch a persistent context with that user_data_dir
try:
async with asyncio.timeout(10): # Reduced timeout from 30s to 10s
try:
assert self.playwright is not None, 'playwright instance is None'
self.browser_context = await self.playwright.chromium.launch_persistent_context(
**self.browser_profile.kwargs_for_launch_persistent_context().model_dump(mode='json')
)
except Exception as e:
# Re-raise if not a timeout
if not isinstance(e, asyncio.TimeoutError):
raise
except TimeoutError:
self.logger.warning(
'Browser operation timed out. This may indicate the playwright instance is invalid due to event loop changes. '
'Recreating playwright instance and retrying...'
# search for potentially conflicting local processes running on the same user_data_dir
for proc in psutil.process_iter(['pid', 'cmdline']):
if f'--user-data-dir={self.browser_profile.user_data_dir}' in (proc.info['cmdline'] or []):
self.logger.error(
f'🚨 Found potentially conflicting browser process browser_pid={proc.info["pid"]} '
f'already running with the same user_data_dir= {_log_pretty_path(self.browser_profile.user_data_dir)}'
)
# Force recreation of the playwright object
self.playwright = await self._start_global_playwright_subprocess(is_stealth=self.browser_profile.stealth)
# Retry the operation with the new playwright instance
async with asyncio.timeout(10):
break
# if a user_data_dir is provided, launch a persistent context with that user_data_dir
try:
async with asyncio.timeout(self.browser_profile.timeout / 1000):
try:
assert self.playwright is not None, 'playwright instance is None'
self.browser_context = await self.playwright.chromium.launch_persistent_context(
**self.browser_profile.kwargs_for_launch_persistent_context().model_dump()
**self.browser_profile.kwargs_for_launch_persistent_context().model_dump(mode='json')
)
except Exception as e:
# show a nice logger hint explaining what went wrong with the user_data_dir
# calculate the version of the browser that the user_data_dir is for, and the version of the browser we are running with
user_data_dir_chrome_version = '???'
test_browser_version = '???'
try:
# user_data_dir is corrupted or unreadable because it was migrated to a newer version of chrome than we are running with
user_data_dir_chrome_version = (
(Path(self.browser_profile.user_data_dir) / 'Last Version').read_text().strip()
)
except Exception:
pass # let the logger below handle it
try:
assert self.playwright is not None, 'playwright instance is None'
test_browser = await self.playwright.chromium.launch(headless=True)
test_browser_version = test_browser.version
await test_browser.close()
except Exception:
pass
except Exception as e:
# Re-raise if not a timeout
if not isinstance(e, asyncio.TimeoutError):
raise
except TimeoutError:
self.logger.warning(
'Browser operation timed out. This may indicate the playwright instance is invalid due to event loop changes. '
'Recreating playwright instance and retrying...'
)
# Force recreation of the playwright object
self.playwright = await self._start_global_playwright_subprocess(is_stealth=self.browser_profile.stealth)
# Retry the operation with the new playwright instance
async with asyncio.timeout(self.browser_profile.timeout / 1000):
assert self.playwright is not None, 'playwright instance is None'
self.browser_context = await self.playwright.chromium.launch_persistent_context(
**self.browser_profile.kwargs_for_launch_persistent_context().model_dump()
)
except Exception as e:
# show a nice logger hint explaining what went wrong with the user_data_dir
# calculate the version of the browser that the user_data_dir is for, and the version of the browser we are running with
user_data_dir_chrome_version = '???'
test_browser_version = '???'
try:
# user_data_dir is corrupted or unreadable because it was migrated to a newer version of chrome than we are running with
user_data_dir_chrome_version = (Path(self.browser_profile.user_data_dir) / 'Last Version').read_text().strip()
except Exception:
pass # let the logger below handle it
try:
assert self.playwright is not None, 'playwright instance is None'
test_browser = await self.playwright.chromium.launch(headless=True)
test_browser_version = test_browser.version
await test_browser.close()
except Exception:
pass
# failed to parse extensions == most common error text when user_data_dir is corrupted / has an unusable schema
reason = 'due to bad' if 'Failed parsing extensions' in str(e) else 'for unknown reason with'
driver = str(type(self.playwright).__module__).split('.')[0].lower()
browser_channel = (
Path(self.browser_profile.executable_path).name.replace(' ', '-').replace('.exe', '').lower()
if self.browser_profile.executable_path
else (self.browser_profile.channel or BROWSERUSE_DEFAULT_CHANNEL).name.lower()
)
self.logger.error(
f'❌ Launching new local browser {driver}:{browser_channel} (v{test_browser_version}) failed!'
f'\n\tFailed {reason} user_data_dir= {_log_pretty_path(self.browser_profile.user_data_dir)} (created with v{user_data_dir_chrome_version})'
'\n\tTry using a different browser version/channel or delete the user_data_dir to start over with a fresh profile.'
'\n\t(can happen if different versions of Chrome/Chromium/Brave/etc. tried to share one dir)'
f'\n\n{type(e).__name__} {e}'
)
raise
# failed to parse extensions == most common error text when user_data_dir is corrupted / has an unusable schema
reason = 'due to bad' if 'Failed parsing extensions' in str(e) else 'for unknown reason with'
driver = str(type(self.playwright).__module__).split('.')[0].lower()
browser_channel = (
Path(self.browser_profile.executable_path).name.replace(' ', '-').replace('.exe', '').lower()
if self.browser_profile.executable_path
else (self.browser_profile.channel or BROWSERUSE_DEFAULT_CHANNEL).name.lower()
)
self.logger.error(
f'❌ Launching new local browser {driver}:{browser_channel} (v{test_browser_version}) failed!'
f'\n\tFailed {reason} user_data_dir= {_log_pretty_path(self.browser_profile.user_data_dir)} (created with v{user_data_dir_chrome_version})'
'\n\tTry using a different browser version/channel or delete the user_data_dir to start over with a fresh profile.'
'\n\t(can happen if different versions of Chrome/Chromium/Brave/etc. tried to share one dir)'
f'\n\n{type(e).__name__} {e}'
)
raise
# Only restore browser from context if it's connected, otherwise keep it None to force new launch
browser_from_context = self.browser_context and self.browser_context.browser
@@ -930,22 +879,59 @@ class BrowserSession(BaseModel):
# playwright does not give us a browser object at all when we use launch_persistent_context()!
# Detect any new child chrome processes that we might have launched above
try:
child_pids_after_launch = {child.pid for child in current_process.children(recursive=True)}
new_child_pids = child_pids_after_launch - child_pids_before_launch
new_child_procs = [psutil.Process(pid) for pid in new_child_pids]
new_chrome_procs = [proc for proc in new_child_procs if 'Helper' not in proc.name() and proc.status() == 'running']
except Exception as e:
self.logger.debug(
f'❌ Error trying to find child chrome processes after launching new browser: {type(e).__name__}: {e}'
)
new_chrome_procs = []
def is_our_chrome_proc(pid: int) -> psutil.Process | None:
    """Return the psutil.Process for pid if it looks like the browser *we* launched, else None.

    Filters out: helper/utility subprocesses, non-running processes, processes started
    from a different executable than the configured one, and processes launched with a
    different --user-data-dir than ours. Any psutil access error counts as "not ours".
    """
    try:
        proc = psutil.Process(pid)
        cmdline = proc.cmdline()
        # skip chromium helper subprocesses (e.g. "Chromium Helper (Renderer)")
        if 'Helper' in proc.name():
            return None
        if proc.status() != 'running':
            return None
        # when a specific executable was configured, the process must have been started from it
        # (paths normalized so symlinks/~ expansion don't cause false mismatches)
        if (
            self.browser_profile.executable_path
            and Path(cmdline[0]).expanduser().resolve()
            != Path(self.browser_profile.executable_path).expanduser().resolve()
        ):
            # self.logger.debug(f'❌ Found new child chrome process that does not match our executable: {str(cmdline)[:50]}')
            return None
        # the exact --user-data-dir launch arg uniquely identifies the browser instance we launched
        if (
            self.browser_profile.user_data_dir
            and f'--user-data-dir={Path(self.browser_profile.user_data_dir).expanduser().resolve()}' in cmdline
        ):
            # self.logger.debug(f'✅ Found new child chrome process that matches our user_data_dir: {str(cmdline)[:50]}')
            return proc
        else:
            # self.logger.debug(f'❌ Found new child chrome process that does not match our user_data_dir: {[arg for arg in cmdline if "--user-data-dir=" in arg]}')
            return None
    except Exception:
        # process exited or is inaccessible (NoSuchProcess/AccessDenied) — treat as "not ours"
        pass
    return None
if new_chrome_procs and not self.browser_pid:
self.browser_pid = new_chrome_procs[0].pid
self.logger.info(f' ↳ Spawned browser_pid={self.browser_pid} {_log_pretty_path(new_chrome_procs[0].cmdline()[0])}')
self.logger.debug(' '.join(new_chrome_procs[0].cmdline())) # print the entire launch command for debugging
self._set_browser_keep_alive(False) # close the browser at the end because we launched it
child_pids_after_launch = {child.pid for child in current_process.children(recursive=True)}
new_child_pids = child_pids_after_launch - child_pids_before_launch
new_child_procs = list(filter(bool, (is_our_chrome_proc(pid) for pid in new_child_pids)))
if not new_child_procs:
self.logger.debug(f'❌ Failed to find any new child chrome processes after launching new browser: {new_child_pids}')
new_chrome_proc = None
elif len(new_child_procs) > 1:
self.logger.debug(f'❌ Found multiple new child chrome processes after launching new browser: {new_child_procs}')
new_chrome_proc = None
else:
new_chrome_proc = new_child_procs[0]
if new_chrome_proc and not self.browser_pid:
# look through the discovered new chrome processes to uniquely identify the one that *we* launched,
# match using unique user_data_dir
try:
self.browser_pid = new_chrome_proc.pid
cmdline = new_chrome_proc.cmdline()
executable_path = cmdline[0] if cmdline else 'unknown'
self.logger.info(f' ↳ Spawned browser_pid={self.browser_pid} {_log_pretty_path(executable_path)}')
if cmdline:
self.logger.debug(' '.join(cmdline)) # print the entire launch command for debugging
self._set_browser_keep_alive(False) # close the browser at the end because we launched it
except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
self.logger.warning(f'Browser process {self.browser_pid} died immediately after launch: {type(e).__name__}')
if self.browser:
assert self.browser.is_connected(), (
@@ -1071,7 +1057,7 @@ class BrowserSession(BaseModel):
if pages:
foreground_page = pages[0]
self.logger.debug(
f'👁️‍🗨️ Found {len(pages)} existing tabs in browser, agent session {self.id[-4:]}.{str(id(self.agent_current_page))[-2:]} will start focused on Tab [{pages.index(foreground_page)}]: {foreground_page.url}'
f'👁️‍🗨️ Found {len(pages)} existing tabs in browser, agent session {self.id[-4:]}.{str(id(self.agent_current_page))[-2:]} will start focused on Tab [{pages.index(foreground_page)}]: {foreground_page.url}' # type: ignore
)
else:
foreground_page = await self.browser_context.new_page()
@@ -1090,15 +1076,15 @@ class BrowserSession(BaseModel):
old_foreground = self.human_current_page
assert self.browser_context is not None, 'BrowserContext object is not set'
assert old_foreground is not None, 'Old foreground page is not set'
old_tab_idx = self.browser_context.pages.index(old_foreground)
old_tab_idx = self.browser_context.pages.index(old_foreground) # type: ignore
self.human_current_page = new_page
new_tab_idx = self.browser_context.pages.index(new_page)
new_tab_idx = self.browser_context.pages.index(new_page) # type: ignore
# Log before and after for debugging
old_url = old_foreground and old_foreground.url or 'about:blank'
new_url = new_page and new_page.url or 'about:blank'
agent_url = self.agent_current_page and self.agent_current_page.url or 'about:blank'
agent_tab_idx = self.browser_context.pages.index(self.agent_current_page)
agent_tab_idx = self.browser_context.pages.index(self.agent_current_page) # type: ignore
if old_url != new_url:
self.logger.info(
f'👁️ Foregound tab changed by human from [{old_tab_idx}]{_log_pretty_url(old_url)} '
@@ -1167,7 +1153,7 @@ class BrowserSession(BaseModel):
await page.evaluate(update_tab_focus_script)
# self.logger.debug(f'👁️ Added visibility listener to existing tab: {page.url}')
except Exception as e:
page_idx = self.browser_context.pages.index(page)
page_idx = self.browser_context.pages.index(page) # type: ignore
self.logger.debug(
f'⚠️ Failed to add visibility listener to existing tab, is it crashed or ignoring CDP commands?: [{page_idx}]{page.url}: {type(e).__name__}: {e}'
)
@@ -1258,7 +1244,7 @@ class BrowserSession(BaseModel):
# cdp api: https://chromedevtools.github.io/devtools-protocol/tot/Browser/#method-setWindowBounds
try:
cdp_session = await page.context.new_cdp_session(page)
cdp_session = await page.context.new_cdp_session(page) # type: ignore
window_id_result = await cdp_session.send('Browser.getWindowForTarget')
await cdp_session.send(
'Browser.setWindowBounds',
@@ -1277,7 +1263,7 @@ class BrowserSession(BaseModel):
# fallback to javascript resize if cdp setWindowBounds fails
await page.evaluate(
"""(width, height) => {window.resizeTo(width, height)}""",
**self.browser_profile.window_size,
[self.browser_profile.window_size['width'], self.browser_profile.window_size['height']],
)
return
except Exception:
@@ -1292,7 +1278,7 @@ class BrowserSession(BaseModel):
if self.browser_profile.keep_alive is None:
self.browser_profile.keep_alive = keep_alive
def is_connected(self) -> bool:
async def is_connected(self) -> bool:
"""
Check if the browser session has valid, connected browser and context objects.
Returns False if any of the following conditions are met:
@@ -1309,8 +1295,14 @@ class BrowserSession(BaseModel):
# Check if the browser_context itself is closed/unusable
try:
_ = self.browser_context.pages
return True
# TODO: figure out a better synchronous test for whether browser_context is usable
# this is a hacky workaround for the fact that playwright's browser_context has no is_connected() method
# and browser_context.browser is None when we launch with a persistent context (basically always)
if self.browser_context.pages:
return True
else:
await self.create_new_tab()
return True
except Exception:
return False
@@ -1435,7 +1427,7 @@ class BrowserSession(BaseModel):
self.agent_current_page = first_available_tab
self.human_current_page = first_available_tab
else:
# if all tabs are closed, open a new one
# if all tabs are closed, open a new one, never allow a context with 0 tabs
new_tab = await self.create_new_tab()
self.agent_current_page = new_tab
self.human_current_page = new_tab
@@ -1480,30 +1472,25 @@ class BrowserSession(BaseModel):
"""
page = await self.get_current_page()
try:
script = """
try {
// Remove the highlight container and all its contents
const container = document.getElementById('playwright-highlight-container');
if (container) {
container.remove();
}
// Remove highlight attributes from elements
const highlightedElements = document.querySelectorAll('[browser-user-highlight-id^="playwright-highlight-"]');
highlightedElements.forEach(el => {
el.removeAttribute('browser-user-highlight-id');
});
} catch (e) {
console.error('Failed to remove highlights:', e);
}
"""
await page.evaluate(script)
for iframe in page.frames:
if iframe.url and iframe.url != page.url and not iframe.url.startswith('data:'):
await iframe.evaluate(script)
await page.evaluate(
"""
try {
// Remove the highlight container and all its contents
const container = document.getElementById('playwright-highlight-container');
if (container) {
container.remove();
}
// Remove highlight attributes from elements
const highlightedElements = document.querySelectorAll('[browser-user-highlight-id^="playwright-highlight-"]');
highlightedElements.forEach(el => {
el.removeAttribute('browser-user-highlight-id');
});
} catch (e) {
console.error('Failed to remove highlights:', e);
}
"""
)
except Exception as e:
self.logger.debug(f'⚠️ Failed to remove highlights (this is usually ok): {type(e).__name__}: {e}')
# Don't raise the error since this is not critical functionality
@@ -1650,6 +1637,8 @@ class BrowserSession(BaseModel):
page = await self.get_current_page()
else:
# otherwise close the tab at the given index
if tab_index >= len(pages) or tab_index < 0:
raise IndexError(f'Tab index {tab_index} out of range. Available tabs: {len(pages)}')
page = pages[tab_index]
await page.close()
@@ -2336,9 +2325,9 @@ class BrowserSession(BaseModel):
Parameters:
-----------
cache_clickable_elements_hashes: bool
If True, cache the clickable elements hashes for the current state.
This is used to calculate which elements are new to the LLM since the last message,
which helps reduce token usage.
If True, cache the clickable elements hashes for the current state.
This is used to calculate which elements are new to the LLM since the last message,
which helps reduce token usage.
"""
await self._wait_for_page_and_frames_load()
updated_state = await self._get_updated_state()
@@ -2621,10 +2610,10 @@ class BrowserSession(BaseModel):
Creates a CSS selector for a DOM element, handling various edge cases and special characters.
Args:
element: The DOM element to create a selector for
element: The DOM element to create a selector for
Returns:
A valid CSS selector string
A valid CSS selector string
"""
try:
# Get base selector from XPath
@@ -2919,10 +2908,6 @@ class BrowserSession(BaseModel):
Handles different types of input fields and ensures proper element state before input.
"""
try:
# Highlight before typing
# if element_node.highlight_index is not None:
# await self._update_state(focus_element=element_node.highlight_index)
element_handle = await self.get_locate_element(element_node)
if element_handle is None:
@@ -2937,6 +2922,18 @@ class BrowserSession(BaseModel):
except Exception:
pass
# let's first try to click and type
try:
await element_handle.evaluate('el => {el.textContent = ""; el.value = "";}')
await element_handle.click()
await asyncio.sleep(0.1) # Increased sleep time
page = await self.get_current_page()
await page.keyboard.type(text)
return
except Exception as e:
self.logger.debug(f'Input text with click and type failed, trying element handle method: {e}')
pass
# Get element properties to determine input method
tag_handle = await element_handle.get_property('tagName')
tag_name = (await tag_handle.json_value()).lower()
@@ -2947,25 +2944,15 @@ class BrowserSession(BaseModel):
readonly = await readonly_handle.json_value() if readonly_handle else False
disabled = await disabled_handle.json_value() if disabled_handle else False
# always click the element first to make sure it's in the focus
await element_handle.click()
await asyncio.sleep(0.1)
try:
if (await is_contenteditable.json_value() or tag_name == 'input') and not (readonly or disabled):
await element_handle.evaluate('el => {el.textContent = ""; el.value = "";}')
await element_handle.type(text, delay=5)
else:
await element_handle.fill(text)
except Exception:
# last resort fallback, assume it's already focused after we clicked on it,
# just simulate keypresses on the entire page
try:
page = await self.get_current_page()
await page.keyboard.type(text)
except Exception as fallback_error:
# If we can't even get the current page, re-raise with a clear error
raise BrowserError(f'Failed to input text into element: {element_node.xpath}') from fallback_error
except Exception as e:
self.logger.error(f'Error during input text into element: {type(e).__name__}: {e}')
raise BrowserError(f'Failed to input text into element: {repr(element_node)}')
except Exception as e:
# Get current page URL safely for error message
@@ -3033,9 +3020,9 @@ class BrowserSession(BaseModel):
except Exception:
self.initialized = False
if not self.initialized or not self.is_connected():
if not self.initialized or not self.browser_context:
# If we were initialized but lost connection, reset state first to avoid infinite loops
if self.initialized and not self.is_connected():
if self.initialized and not self.browser_context:
self.logger.warning(
f'💔 Browser {self._connection_str} disconnected while trying to create a new tab, reconnecting...'
)
@@ -3068,7 +3055,7 @@ class BrowserSession(BaseModel):
await new_page.goto(url, wait_until='domcontentloaded')
await self._wait_for_page_and_frames_load(timeout_overwrite=1)
except Exception as e:
self.logger.error(f'❌ Error navigating to {url}: {type(e).__name__}: {e}')
self.logger.error(f'❌ Error navigating to {url}: {type(e).__name__}: {e} (proceeding anyway...)')
assert self.human_current_page is not None
assert self.agent_current_page is not None
@@ -3109,6 +3096,23 @@ class BrowserSession(BaseModel):
element_handle = await self.get_locate_element(selector_map[index])
return element_handle
async def is_file_input_by_index(self, index: int) -> bool:
try:
selector_map = await self.get_selector_map()
node = selector_map[index]
return self.is_file_input(node)
except Exception as e:
self.logger.debug(f'❌ Error in is_file_input(index={index}): {type(e).__name__}: {e}')
return False
@staticmethod
def is_file_input(node: DOMElementNode) -> bool:
return (
isinstance(node, DOMElementNode)
and getattr(node, 'tag_name', '').lower() == 'input'
and node.attributes.get('type', '').lower() == 'file'
)
@require_initialization
async def find_file_upload_element_by_index(
self, index: int, max_height: int = 3, max_descendant_depth: int = 3
@@ -3128,17 +3132,10 @@ class BrowserSession(BaseModel):
candidate_element = selector_map[index]
def is_file_input(node: DOMElementNode) -> bool:
return (
isinstance(node, DOMElementNode)
and getattr(node, 'tag_name', '').lower() == 'input'
and node.attributes.get('type', '').lower() == 'file'
)
def find_file_input_in_descendants(node: DOMElementNode, depth: int) -> DOMElementNode | None:
if depth < 0 or not isinstance(node, DOMElementNode):
return None
if is_file_input(node):
if self.is_file_input(node):
return node
for child in getattr(node, 'children', []):
result = find_file_input_in_descendants(child, depth - 1)
@@ -3149,7 +3146,7 @@ class BrowserSession(BaseModel):
current = candidate_element
for _ in range(max_height + 1): # include the candidate itself
# 1. Check the current node itself
if is_file_input(current):
if self.is_file_input(current):
return current
# 2. Check all descendants of the current node
result = find_file_input_in_descendants(current, max_descendant_depth)
@@ -3161,7 +3158,7 @@ class BrowserSession(BaseModel):
for sibling in getattr(parent, 'children', []):
if sibling is current:
continue
if is_file_input(sibling):
if self.is_file_input(sibling):
return sibling
result = find_file_input_in_descendants(sibling, max_descendant_depth)
if result:
@@ -3226,7 +3223,7 @@ class BrowserSession(BaseModel):
Injects a DVD screensaver-style bouncing logo loading animation overlay into the given Playwright Page.
This is used to visually indicate that the browser is setting up or waiting.
"""
if os.environ.get('IS_IN_EVALS', 'false').lower()[0] in 'ty1':
if CONFIG.IS_IN_EVALS:
# dont bother wasting CPU showing animations during evals
return

View File

@@ -1,3 +1,4 @@
# pyright: reportMissingImports=false
import asyncio
import json
import logging
@@ -39,20 +40,17 @@ os.environ['BROWSER_USE_LOGGING_LEVEL'] = 'result'
from browser_use import Agent, Controller
from browser_use.agent.views import AgentSettings
from browser_use.browser import BrowserSession
from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.config import CONFIG
from browser_use.logging_config import addLoggingLevel
# Paths
USER_CONFIG_DIR = Path.home() / '.config' / 'browseruse'
USER_CONFIG_FILE = USER_CONFIG_DIR / 'config.json'
CHROME_PROFILES_DIR = USER_CONFIG_DIR / 'profiles'
USER_DATA_DIR = CHROME_PROFILES_DIR / 'cli'
USER_DATA_DIR = CONFIG.BROWSER_USE_PROFILES_DIR / 'cli'
# Default User settings
MAX_HISTORY_LENGTH = 100
# Ensure directories exist
USER_CONFIG_FILE.parent.mkdir(parents=True, exist_ok=True)
CONFIG.BROWSER_USE_CONFIG_FILE.parent.mkdir(parents=True, exist_ok=True)
USER_DATA_DIR.mkdir(parents=True, exist_ok=True)
@@ -90,11 +88,11 @@ def get_default_config() -> dict[str, Any]:
'name': None,
'temperature': 0.0,
'api_keys': {
'OPENAI_API_KEY': os.getenv('OPENAI_API_KEY', ''),
'ANTHROPIC_API_KEY': os.getenv('ANTHROPIC_API_KEY', ''),
'GOOGLE_API_KEY': os.getenv('GOOGLE_API_KEY', ''),
'DEEPSEEK_API_KEY': os.getenv('DEEPSEEK_API_KEY', ''),
'GROK_API_KEY': os.getenv('GROK_API_KEY', ''),
'OPENAI_API_KEY': CONFIG.OPENAI_API_KEY,
'ANTHROPIC_API_KEY': CONFIG.ANTHROPIC_API_KEY,
'GOOGLE_API_KEY': CONFIG.GOOGLE_API_KEY,
'DEEPSEEK_API_KEY': CONFIG.DEEPSEEK_API_KEY,
'GROK_API_KEY': CONFIG.GROK_API_KEY,
},
},
'agent': {}, # AgentSettings will use defaults
@@ -109,14 +107,14 @@ def get_default_config() -> dict[str, Any]:
def load_user_config() -> dict[str, Any]:
"""Load user configuration from file."""
if not USER_CONFIG_FILE.exists():
if not CONFIG.BROWSER_USE_CONFIG_FILE.exists():
# Create default config
config = get_default_config()
save_user_config(config)
return config
try:
with open(USER_CONFIG_FILE) as f:
with open(CONFIG.BROWSER_USE_CONFIG_FILE) as f:
data = json.load(f)
# Ensure data is a dictionary, not a list
if isinstance(data, list):
@@ -137,7 +135,7 @@ def save_user_config(config: dict[str, Any]) -> None:
if len(config['command_history']) > MAX_HISTORY_LENGTH:
config['command_history'] = config['command_history'][-MAX_HISTORY_LENGTH:]
with open(USER_CONFIG_FILE, 'w') as f:
with open(CONFIG.BROWSER_USE_CONFIG_FILE, 'w') as f:
json.dump(config, f, indent=2)
@@ -186,36 +184,38 @@ def get_llm(config: dict[str, Any]):
temperature = config.get('model', {}).get('temperature', 0.0)
# Set environment variables if they're in the config but not in the environment
if api_keys.get('openai') and not os.getenv('OPENAI_API_KEY'):
if api_keys.get('openai') and not CONFIG.OPENAI_API_KEY:
os.environ['OPENAI_API_KEY'] = api_keys['openai']
if api_keys.get('anthropic') and not os.getenv('ANTHROPIC_API_KEY'):
if api_keys.get('anthropic') and not CONFIG.ANTHROPIC_API_KEY:
os.environ['ANTHROPIC_API_KEY'] = api_keys['anthropic']
if api_keys.get('google') and not os.getenv('GOOGLE_API_KEY'):
if api_keys.get('google') and not CONFIG.GOOGLE_API_KEY:
os.environ['GOOGLE_API_KEY'] = api_keys['google']
if model_name:
if model_name.startswith('gpt'):
if not os.getenv('OPENAI_API_KEY'):
if not CONFIG.OPENAI_API_KEY:
print('⚠️ OpenAI API key not found. Please update your config or set OPENAI_API_KEY environment variable.')
sys.exit(1)
return langchain_openai.ChatOpenAI(model=model_name, temperature=temperature)
elif model_name.startswith('claude'):
if not os.getenv('ANTHROPIC_API_KEY'):
if not CONFIG.ANTHROPIC_API_KEY:
print('⚠️ Anthropic API key not found. Please update your config or set ANTHROPIC_API_KEY environment variable.')
sys.exit(1)
return langchain_anthropic.ChatAnthropic(model=model_name, temperature=temperature)
return langchain_anthropic.ChatAnthropic(model_name=model_name, temperature=temperature, timeout=30, stop=None)
elif model_name.startswith('gemini'):
if not os.getenv('GOOGLE_API_KEY'):
if not CONFIG.GOOGLE_API_KEY:
print('⚠️ Google API key not found. Please update your config or set GOOGLE_API_KEY environment variable.')
sys.exit(1)
return langchain_google_genai.ChatGoogleGenerativeAI(model=model_name, temperature=temperature)
# Auto-detect based on available API keys
if os.getenv('OPENAI_API_KEY'):
if CONFIG.OPENAI_API_KEY:
return langchain_openai.ChatOpenAI(model='gpt-4o', temperature=temperature)
elif os.getenv('ANTHROPIC_API_KEY'):
return langchain_anthropic.ChatAnthropic(model='claude-3.5-sonnet-exp', temperature=temperature)
elif os.getenv('GOOGLE_API_KEY'):
elif CONFIG.ANTHROPIC_API_KEY:
return langchain_anthropic.ChatAnthropic(
model_name='claude-3.5-sonnet-exp', temperature=temperature, timeout=30, stop=None
)
elif CONFIG.GOOGLE_API_KEY:
return langchain_google_genai.ChatGoogleGenerativeAI(model='gemini-2.0-flash-lite', temperature=temperature)
else:
print(
@@ -420,10 +420,10 @@ class BrowserUseApp(App):
def __init__(self, config: dict[str, Any], *args, **kwargs):
super().__init__(*args, **kwargs)
self.config = config
self.browser_session = None # Will be set before app.run_async()
self.controller = None # Will be set before app.run_async()
self.agent = None
self.llm = None # Will be set before app.run_async()
self.browser_session: BrowserSession | None = None # Will be set before app.run_async()
self.controller: Controller | None = None # Will be set before app.run_async()
self.agent: Agent | None = None
self.llm: Any | None = None # Will be set before app.run_async()
self.task_history = config.get('command_history', [])
# Track current position in history for up/down navigation
self.history_index = len(self.task_history)
@@ -437,7 +437,7 @@ class BrowserUseApp(App):
pass # Level already exists, which is fine
# Get the RichLog widget
rich_log = self.query_one('#results-log')
rich_log = self.query_one('#results-log', RichLog)
# Create and set up the custom handler
log_handler = RichLogHandler(rich_log)
@@ -530,7 +530,7 @@ class BrowserUseApp(App):
# Step 3: Focus the input field
logger.debug('Focusing input field...')
try:
input_field = self.query_one('#task-input')
input_field = self.query_one('#task-input', Input)
input_field.focus()
logger.debug('Input field focused')
except Exception as e:
@@ -550,8 +550,9 @@ class BrowserUseApp(App):
def on_input_key_up(self, event: events.Key) -> None:
"""Handle up arrow key in the input field."""
# Check if event is from the input field
if event.sender.id != 'task-input':
# For textual key events, we need to check focus manually
input_field = self.query_one('#task-input', Input)
if not input_field.has_focus:
return
# Only process if we have history
@@ -561,9 +562,10 @@ class BrowserUseApp(App):
# Move back in history if possible
if self.history_index > 0:
self.history_index -= 1
self.query_one('#task-input').value = self.task_history[self.history_index]
task_input = self.query_one('#task-input', Input)
task_input.value = self.task_history[self.history_index]
# Move cursor to end of text
self.query_one('#task-input').cursor_position = len(self.query_one('#task-input').value)
task_input.cursor_position = len(task_input.value)
# Prevent default behavior (cursor movement)
event.prevent_default()
@@ -571,8 +573,9 @@ class BrowserUseApp(App):
def on_input_key_down(self, event: events.Key) -> None:
"""Handle down arrow key in the input field."""
# Check if event is from the input field
if event.sender.id != 'task-input':
# For textual key events, we need to check focus manually
input_field = self.query_one('#task-input', Input)
if not input_field.has_focus:
return
# Only process if we have history
@@ -582,13 +585,14 @@ class BrowserUseApp(App):
# Move forward in history or clear input if at the end
if self.history_index < len(self.task_history) - 1:
self.history_index += 1
self.query_one('#task-input').value = self.task_history[self.history_index]
task_input = self.query_one('#task-input', Input)
task_input.value = self.task_history[self.history_index]
# Move cursor to end of text
self.query_one('#task-input').cursor_position = len(self.query_one('#task-input').value)
task_input.cursor_position = len(task_input.value)
elif self.history_index == len(self.task_history) - 1:
# At the end of history, go to "new line" state
self.history_index += 1
self.query_one('#task-input').value = ''
self.query_one('#task-input', Input).value = ''
# Prevent default behavior (cursor movement)
event.prevent_default()
@@ -677,7 +681,7 @@ class BrowserUseApp(App):
def update_browser_panel(self) -> None:
"""Update browser information panel with details about the browser."""
browser_info = self.query_one('#browser-info')
browser_info = self.query_one('#browser-info', RichLog)
browser_info.clear()
# Try to use the agent's browser session if available
@@ -772,7 +776,7 @@ class BrowserUseApp(App):
def update_model_panel(self) -> None:
"""Update model information panel with details about the LLM."""
model_info = self.query_one('#model-info')
model_info = self.query_one('#model-info', RichLog)
model_info.clear()
if self.llm:
@@ -810,8 +814,12 @@ class BrowserUseApp(App):
# Get the last step metadata to show the most recent LLM response time
if num_steps > 0 and self.agent.state.history.history[-1].metadata:
last_step = self.agent.state.history.history[-1]
step_duration = last_step.metadata.duration_seconds
step_tokens = last_step.metadata.input_tokens
if last_step.metadata:
step_duration = last_step.metadata.duration_seconds
step_tokens = last_step.metadata.input_tokens
else:
step_duration = 0
step_tokens = 0
if step_tokens > 0:
tokens_per_second = step_tokens / step_duration if step_duration > 0 else 0
@@ -827,7 +835,7 @@ class BrowserUseApp(App):
# Add current state information
if hasattr(self.agent, 'running'):
if self.agent.running:
if getattr(self.agent, 'running', False):
model_info.write('[yellow]LLM is thinking[blink]...[/][/]')
elif hasattr(self.agent, 'state') and hasattr(self.agent.state, 'paused') and self.agent.state.paused:
model_info.write('[orange]LLM paused[/]')
@@ -836,7 +844,7 @@ class BrowserUseApp(App):
def update_tasks_panel(self) -> None:
"""Update tasks information panel with details about the tasks and steps hierarchy."""
tasks_info = self.query_one('#tasks-info')
tasks_info = self.query_one('#tasks-info', RichLog)
tasks_info.clear()
if self.agent:
@@ -942,7 +950,7 @@ class BrowserUseApp(App):
tasks_info.write('')
# If agent is actively running, show a status indicator
if hasattr(self.agent, 'running') and self.agent.running:
if hasattr(self.agent, 'running') and getattr(self.agent, 'running', False):
tasks_info.write('[yellow]Agent is actively working[blink]...[/][/]')
elif hasattr(self.agent, 'state') and hasattr(self.agent.state, 'paused') and self.agent.state.paused:
tasks_info.write('[orange]Agent is paused (press Enter to resume)[/]')
@@ -973,14 +981,16 @@ class BrowserUseApp(App):
self.update_info_panels()
# Clear the log to start fresh
rich_log = self.query_one('#results-log')
rich_log = self.query_one('#results-log', RichLog)
rich_log.clear()
if self.agent is None:
if not self.llm:
raise RuntimeError('LLM not initialized')
self.agent = Agent(
task=task,
llm=self.llm,
controller=self.controller,
controller=self.controller if self.controller else Controller(),
browser_session=self.browser_session,
source='cli',
**agent_settings.model_dump(),
@@ -996,19 +1006,22 @@ class BrowserUseApp(App):
logger.debug('\n🚀 Working on task: %s', task)
# Set flags to indicate the agent is running
self.agent.running = True
self.agent.last_response_time = 0
if self.agent:
self.agent.running = True # type: ignore
self.agent.last_response_time = 0 # type: ignore
# Panel updates are already happening via the timer in update_info_panels
try:
# Run the agent task, redirecting output to RichLog through our handler
await self.agent.run()
if self.agent:
await self.agent.run()
except Exception as e:
logger.error('\nError running agent: %s', str(e))
finally:
# Clear the running flag
self.agent.running = False
if self.agent:
self.agent.running = False # type: ignore
# No need to call update_info_panels() here as it's already updating via timer
@@ -1019,7 +1032,7 @@ class BrowserUseApp(App):
task_input_container.display = True
# Refocus the input field
input_field = self.query_one('#task-input')
input_field = self.query_one('#task-input', Input)
input_field.focus()
# Ensure the input is visible by scrolling to it
@@ -1031,7 +1044,7 @@ class BrowserUseApp(App):
def action_input_history_prev(self) -> None:
"""Navigate to the previous item in command history."""
# Only process if we have history and input is focused
input_field = self.query_one('#task-input')
input_field = self.query_one('#task-input', Input)
if not input_field.has_focus or not self.task_history:
return
@@ -1045,7 +1058,7 @@ class BrowserUseApp(App):
def action_input_history_next(self) -> None:
"""Navigate to the next item in command history or clear input."""
# Only process if we have history and input is focused
input_field = self.query_one('#task-input')
input_field = self.query_one('#task-input', Input)
if not input_field.has_focus or not self.task_history:
return
@@ -1131,7 +1144,7 @@ class BrowserUseApp(App):
# Paths panel
yield Static(
f' ⚙️ Settings & history saved to: {str(USER_CONFIG_FILE.resolve()).replace(str(Path.home()), "~")}\n'
f' ⚙️ Settings & history saved to: {str(CONFIG.BROWSER_USE_CONFIG_FILE.resolve()).replace(str(Path.home()), "~")}\n'
f' 📁 Outputs & recordings saved to: {str(Path(".").resolve()).replace(str(Path.home()), "~")}',
id='paths-panel',
markup=True,
@@ -1176,10 +1189,10 @@ async def run_prompt_mode(prompt: str, ctx: click.Context, debug: bool = False):
# Create browser session with config parameters
browser_config = config.get('browser', {})
# Create BrowserProfile with user_data_dir
profile = BrowserProfile(user_data_dir=str(USER_DATA_DIR), **browser_config)
browser_session = BrowserSession(
stealth=True,
user_data_dir=USER_DATA_DIR,
**browser_config,
browser_profile=profile,
)
# Create and run agent
@@ -1239,19 +1252,17 @@ async def textual_interface(config: dict[str, Any]):
logger.info('Browser mode: visible')
# Create BrowserSession directly with config parameters
# Create BrowserProfile with user_data_dir
profile = BrowserProfile(user_data_dir=str(USER_DATA_DIR), **browser_config)
browser_session = BrowserSession(
stealth=True,
user_data_dir=USER_DATA_DIR,
**browser_config,
browser_profile=profile,
)
logger.debug('BrowserSession initialized successfully')
# Log browser version if available
try:
if hasattr(browser_session, 'version') and browser_session.version:
logger.info(f'Browser version: {browser_session.version}')
elif hasattr(browser_session, 'playwright_browser') and browser_session.playwright_browser:
version = browser_session.playwright_browser.version
if hasattr(browser_session, 'browser') and browser_session.browser:
version = browser_session.browser.version
logger.info(f'Browser version: {version}')
except Exception as e:
logger.debug(f'Could not determine browser version: {e}')
@@ -1375,7 +1386,7 @@ def main(ctx: click.Context, debug: bool = False, **kwargs):
logger.debug('Loading user configuration...')
try:
config = load_user_config()
logger.debug(f'User configuration loaded from {USER_CONFIG_FILE}')
logger.debug(f'User configuration loaded from {CONFIG.BROWSER_USE_CONFIG_FILE}')
except Exception as e:
logger.error(f'Error loading user configuration: {str(e)}', exc_info=True)
print(f'Error loading configuration: {str(e)}')

161
browser_use/config.py Normal file
View File

@@ -0,0 +1,161 @@
"""Lazy-loading configuration system for browser-use environment variables."""
import os
from functools import cache
from pathlib import Path
import psutil
@cache
def is_running_in_docker() -> bool:
"""Detect if we are running in a docker container, for the purpose of optimizing chrome launch flags (dev shm usage, gpu settings, etc.)"""
try:
if Path('/.dockerenv').exists() or 'docker' in Path('/proc/1/cgroup').read_text().lower():
return True
except Exception:
pass
try:
# if init proc (PID 1) looks like uvicorn/python/uv/etc. then we're in Docker
# if init proc (PID 1) looks like bash/systemd/init/etc. then we're probably NOT in Docker
init_cmd = ' '.join(psutil.Process(1).cmdline())
if ('py' in init_cmd) or ('uv' in init_cmd) or ('app' in init_cmd):
return True
except Exception:
pass
try:
# if less than 10 total running procs, then we're almost certainly in a container
if len(psutil.pids()) < 10:
return True
except Exception:
pass
return False
class Config:
"""Lazy-loading configuration class for environment variables (env vars can change at runtime so we need to get them fresh on every access)"""
# Cache for directory creation tracking
_dirs_created = False
@property
def BROWSER_USE_LOGGING_LEVEL(self) -> str:
return os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower()
@property
def ANONYMIZED_TELEMETRY(self) -> bool:
return os.getenv('ANONYMIZED_TELEMETRY', 'true').lower()[:1] in 'ty1'
@property
def BROWSER_USE_CLOUD_SYNC(self) -> bool:
return os.getenv('BROWSER_USE_CLOUD_SYNC', str(self.ANONYMIZED_TELEMETRY)).lower()[:1] in 'ty1'
@property
def BROWSER_USE_CLOUD_API_URL(self) -> str:
url = os.getenv('BROWSER_USE_CLOUD_API_URL', 'https://api.browser-use.com')
assert '://' in url, 'BROWSER_USE_CLOUD_API_URL must be a valid URL'
return url
@property
def BROWSER_USE_CLOUD_UI_URL(self) -> str:
url = os.getenv('BROWSER_USE_CLOUD_UI_URL', '')
# Allow empty string as default, only validate if set
if url and '://' not in url:
raise AssertionError('BROWSER_USE_CLOUD_UI_URL must be a valid URL if set')
return url
# Path configuration
@property
def XDG_CACHE_HOME(self) -> Path:
return Path(os.getenv('XDG_CACHE_HOME', '~/.cache')).expanduser().resolve()
@property
def XDG_CONFIG_HOME(self) -> Path:
return Path(os.getenv('XDG_CONFIG_HOME', '~/.config')).expanduser().resolve()
@property
def BROWSER_USE_CONFIG_DIR(self) -> Path:
path = Path(os.getenv('BROWSER_USE_CONFIG_DIR', str(self.XDG_CONFIG_HOME / 'browseruse'))).expanduser().resolve()
self._ensure_dirs()
return path
@property
def BROWSER_USE_CONFIG_FILE(self) -> Path:
return self.BROWSER_USE_CONFIG_DIR / 'config.json'
@property
def BROWSER_USE_PROFILES_DIR(self) -> Path:
path = self.BROWSER_USE_CONFIG_DIR / 'profiles'
self._ensure_dirs()
return path
@property
def BROWSER_USE_DEFAULT_USER_DATA_DIR(self) -> Path:
return self.BROWSER_USE_PROFILES_DIR / 'default'
def _ensure_dirs(self) -> None:
"""Create directories if they don't exist (only once)"""
if not self._dirs_created:
config_dir = (
Path(os.getenv('BROWSER_USE_CONFIG_DIR', str(self.XDG_CONFIG_HOME / 'browseruse'))).expanduser().resolve()
)
config_dir.mkdir(parents=True, exist_ok=True)
(config_dir / 'profiles').mkdir(parents=True, exist_ok=True)
self._dirs_created = True
# LLM API key configuration
@property
def OPENAI_API_KEY(self) -> str:
return os.getenv('OPENAI_API_KEY', '')
@property
def ANTHROPIC_API_KEY(self) -> str:
return os.getenv('ANTHROPIC_API_KEY', '')
@property
def GOOGLE_API_KEY(self) -> str:
return os.getenv('GOOGLE_API_KEY', '')
@property
def DEEPSEEK_API_KEY(self) -> str:
return os.getenv('DEEPSEEK_API_KEY', '')
@property
def GROK_API_KEY(self) -> str:
return os.getenv('GROK_API_KEY', '')
@property
def NOVITA_API_KEY(self) -> str:
return os.getenv('NOVITA_API_KEY', '')
@property
def AZURE_OPENAI_ENDPOINT(self) -> str:
return os.getenv('AZURE_OPENAI_ENDPOINT', '')
@property
def AZURE_OPENAI_KEY(self) -> str:
return os.getenv('AZURE_OPENAI_KEY', '')
@property
def SKIP_LLM_API_KEY_VERIFICATION(self) -> bool:
return os.getenv('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[:1] in 'ty1'
# Runtime hints
@property
def IN_DOCKER(self) -> bool:
return os.getenv('IN_DOCKER', 'false').lower()[:1] in 'ty1' or is_running_in_docker()
@property
def IS_IN_EVALS(self) -> bool:
return os.getenv('IS_IN_EVALS', 'false').lower()[:1] in 'ty1'
@property
def WIN_FONT_DIR(self) -> str:
return os.getenv('WIN_FONT_DIR', 'C:\\Windows\\Fonts')
# Create a singleton instance
CONFIG = Config()

View File

@@ -203,6 +203,10 @@ class Registry(Generic[Context]):
raise ValueError(f'Action {func.__name__} requires file_system but none provided.')
elif param.name == 'page':
raise ValueError(f'Action {func.__name__} requires page but none provided.')
elif param.name == 'available_file_paths':
raise ValueError(f'Action {func.__name__} requires available_file_paths but none provided.')
elif param.name == 'file_system':
raise ValueError(f'Action {func.__name__} requires file_system but none provided.')
else:
raise ValueError(f"{func.__name__}() missing required special parameter '{param.name}'")
call_args.append(value)
@@ -218,6 +222,10 @@ class Registry(Generic[Context]):
raise ValueError(f'Action {func.__name__} requires file_system but none provided.')
elif param.name == 'page':
raise ValueError(f'Action {func.__name__} requires page but none provided.')
elif param.name == 'available_file_paths':
raise ValueError(f'Action {func.__name__} requires available_file_paths but none provided.')
elif param.name == 'file_system':
raise ValueError(f'Action {func.__name__} requires file_system but none provided.')
else:
raise ValueError(f"{func.__name__}() missing required special parameter '{param.name}'")
else:

View File

@@ -1,5 +1,5 @@
from collections.abc import Callable
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from langchain_core.language_models.chat_models import BaseChatModel
from pydantic import BaseModel, ConfigDict
@@ -9,7 +9,7 @@ from browser_use.browser.types import Page
from browser_use.filesystem.file_system import FileSystem
if TYPE_CHECKING:
from browser_use.agent.service import Context
pass
class RegisteredAction(BaseModel):
@@ -153,7 +153,7 @@ class SpecialActionParameters(BaseModel):
# e.g. can contain anything, external db connections, file handles, queues, runtime config objects, etc.
# that you might want to be able to access quickly from within many of your actions
# browser-use code doesn't use this at all, we just pass it down to your actions for convenience
context: 'Context | None' = None
context: Any | None = None
# browser-use session object, can be used to create new tabs, navigate, access playwright objects, etc.
browser_session: BrowserSession | None = None

View File

@@ -174,9 +174,10 @@ class Controller(Generic[Context]):
# SECURITY FIX: Use browser_session.navigate_to() instead of direct page.goto()
# This ensures URL validation against allowed_domains is performed
await browser_session.navigate_to(params.url)
msg = f'🔗 Navigated to {params.url}'
memory = f'Navigated to {params.url}'
msg = f'🔗 {memory}'
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True)
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=memory)
except Exception as e:
error_msg = str(e)
# Check for network-related errors
@@ -239,7 +240,7 @@ class Controller(Generic[Context]):
initial_pages = len(browser_session.tabs)
# if element has file uploader then dont click
if await browser_session.find_file_upload_element_by_index(params.index) is not None:
if await browser_session.is_file_input_by_index(params.index):
msg = f'Index {params.index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files '
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True, success=False, long_term_memory=msg)
@@ -278,7 +279,7 @@ class Controller(Generic[Context]):
return ActionResult(error=error_msg, success=False)
@self.registry.action(
'Input text into a input interactive element',
'Click and input text into a input interactive element',
param_model=InputTextAction,
)
async def input_text(params: InputTextAction, browser_session: BrowserSession, has_sensitive_data: bool = False):
@@ -287,7 +288,12 @@ class Controller(Generic[Context]):
element_node = await browser_session.get_dom_element_by_index(params.index)
assert element_node is not None, f'Element with index {params.index} does not exist'
await browser_session._input_text_element_node(element_node, params.text)
try:
await browser_session._input_text_element_node(element_node, params.text)
except Exception:
msg = f'Failed to input text into element {params.index}.'
return ActionResult(error=msg)
if not has_sensitive_data:
msg = f'⌨️ Input {params.text} into index {params.index}'
else:
@@ -367,6 +373,7 @@ Only use this for extracting info from a single product/article page, not for en
query: str,
page: Page,
page_extraction_llm: BaseChatModel,
file_system: FileSystem,
):
from functools import partial
@@ -434,13 +441,24 @@ Explain the content of the page and that the requested information is not availa
output = await page_extraction_llm.ainvoke(template.format(query=query, page=content))
output_text = output.content
extracted_content = f'Page Link: {page.url}\nQuery: {query}\nExtracted Content:\n{output_text}'
# if content is small include it to memory
if len(extracted_content) < 1000:
MAX_MEMORY_SIZE = 600
if len(extracted_content) < MAX_MEMORY_SIZE:
memory = extracted_content
include_extracted_content_only_once = False
else:
memory = f'Extracted content from {page.url} for query "{query}"'
# find lines until MAX_MEMORY_SIZE
lines = extracted_content.splitlines()
display = ''
display_lines_count = 0
for line in lines:
if len(display) + len(line) < MAX_MEMORY_SIZE:
display += line + '\n'
display_lines_count += 1
else:
break
save_result = await file_system.save_extracted_content(extracted_content)
memory = f'Extracted content from {page.url}\n<query>{query}\n</query>\n<extracted_content>\n{display}{len(lines) - display_lines_count} more lines...\n</extracted_content>\n<file_system>{save_result}</file_system>'
include_extracted_content_only_once = True
logger.info(f'📄 {memory}')
return ActionResult(
@@ -502,7 +520,7 @@ Explain the content of the page and that the requested information is not availa
dy = dy_result
try:
await browser_session._scroll_container(dy)
await browser_session._scroll_container(cast(int, dy))
except Exception as e:
# Hard fallback: always works on root scroller
await page.evaluate('(y) => window.scrollBy(0, y)', dy)
@@ -530,7 +548,7 @@ Explain the content of the page and that the requested information is not availa
)
if action_result:
return action_result
dy = -(dy_result)
dy = -(dy_result or 0)
try:
await browser_session._scroll_container(dy)
@@ -615,6 +633,50 @@ Explain the content of the page and that the requested information is not availa
logger.error(msg)
return ActionResult(error=msg, include_in_memory=True)
# File System Actions
# Registered agent action: write `content` to `file_name` inside the sandboxed
# agent FileSystem. The action description restricts callers to .md/.txt files;
# enforcement is presumably inside FileSystem.write_file — TODO confirm.
@self.registry.action('Write content to file_name in file system, use only .md or .txt extensions.')
async def write_file(file_name: str, content: str, file_system: FileSystem):
	# FileSystem.write_file returns a human-readable status string.
	result = await file_system.write_file(file_name, content)
	logger.info(f'💾 {result}')
	# Surface the status string both in the LLM context and long-term memory.
	return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result)
# Registered agent action: append `content` to an existing file in the sandboxed
# agent FileSystem. Mirrors write_file but delegates to FileSystem.append_file.
@self.registry.action('Append content to file_name in file system')
async def append_file(file_name: str, content: str, file_system: FileSystem):
	# FileSystem.append_file returns a human-readable status string.
	result = await file_system.append_file(file_name, content)
	logger.info(f'💾 {result}')
	# Surface the status string both in the LLM context and long-term memory.
	return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result)
# Registered agent action: read a file either from the explicitly whitelisted
# external paths (available_file_paths) or from the sandboxed agent FileSystem.
# Returns the full content as extracted_content and a size-capped preview as
# long-term memory.
@self.registry.action('Read file_name from file system')
async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem):
	if available_file_paths and file_name in available_file_paths:
		# External file explicitly allowed for this task: read it from disk without blocking.
		import anyio

		async with await anyio.open_file(file_name, 'r') as f:
			content = await f.read()
		result = f'Read from file {file_name}.\n<content>\n{content}\n</content>'
	else:
		# Fall back to the sandboxed agent file system.
		result = await file_system.read_file(file_name)

	# Keep long-term memory small: include whole result only when short, otherwise
	# truncate on line boundaries to roughly MAX_MEMORY_SIZE characters.
	MAX_MEMORY_SIZE = 1000
	if len(result) > MAX_MEMORY_SIZE:
		lines = result.splitlines()
		display = ''
		display_lines_count = 0
		for line in lines:
			if len(display) + len(line) < MAX_MEMORY_SIZE:
				display += line + '\n'
				display_lines_count += 1
			else:
				break
		# BUGFIX: previously this subtracted len(display) — a *character* count —
		# from the number of lines, yielding a bogus (often negative) "more lines"
		# figure. Count the displayed lines instead, matching the extract action.
		memory = f'{display}{len(lines) - display_lines_count} more lines...'
	else:
		memory = result
	logger.info(f'💾 {memory}')
	return ActionResult(
		extracted_content=result,
		include_in_memory=True,
		long_term_memory=memory,
		include_extracted_content_only_once=True,
	)
@self.registry.action(
description='Get all options from a native dropdown',
)

View File

@@ -4,11 +4,10 @@
focusHighlightIndex: -1,
viewportExpansion: 0,
debugMode: false,
initialIndex: 0,
}
) => {
const { doHighlightElements, focusHighlightIndex, viewportExpansion, debugMode, initialIndex } = args;
let highlightIndex = initialIndex; // Reset highlight index
const { doHighlightElements, focusHighlightIndex, viewportExpansion, debugMode } = args;
let highlightIndex = 0; // Reset highlight index
// Add timing stack to handle recursion
const TIMING_STACK = {
@@ -211,7 +210,7 @@
*/
const DOM_HASH_MAP = {};
const ID = { current: initialIndex };
const ID = { current: 0 };
const HIGHLIGHT_CONTAINER_ID = "playwright-highlight-container";
@@ -837,7 +836,7 @@
}
}
const getEventListenersForNode = window.getEventListenersForNode;
const getEventListenersForNode = element?.ownerDocument?.defaultView?.getEventListenersForNode || window.getEventListenersForNode;
if (typeof getEventListenersForNode === 'function') {
const listeners = getEventListenersForNode(element);
const interactionEvents = ['click', 'mousedown', 'mouseup', 'keydown', 'keyup', 'submit', 'change', 'input', 'focus', 'blur'];
@@ -1130,7 +1129,7 @@
// Check for other common interaction event listeners
try {
const getEventListenersForNode = window.getEventListenersForNode;
const getEventListenersForNode = element?.ownerDocument?.defaultView?.getEventListenersForNode || window.getEventListenersForNode;
if (typeof getEventListenersForNode === 'function') {
const listeners = getEventListenersForNode(element);
const interactionEvents = ['click', 'mousedown', 'mouseup', 'keydown', 'keyup', 'submit', 'change', 'input', 'focus', 'blur'];
@@ -1355,7 +1354,6 @@
if (domElement) nodeData.children.push(domElement);
}
}
nodeData.hasIframeContent = true;
} catch (e) {
console.warn("Unable to access iframe:", e);
}

View File

@@ -6,7 +6,6 @@ from urllib.parse import urlparse
if TYPE_CHECKING:
from browser_use.browser.types import Page
from dataclasses import dataclass
from browser_use.dom.views import (
DOMBaseNode,
@@ -24,42 +23,6 @@ from browser_use.utils import time_execution_async
# height: int
@dataclass
class PageFrameEvaluationResult:
url: str
result: dict
name: str | None = None
id: str | None = None
@property
def known_frame_urls(self) -> list[str]:
return [
v.get('attributes', {}).get('src')
for v in self.map.values()
if v.get('hasIframeContent') and v.get('attributes', {}).get('src')
]
@property
def map(self) -> dict:
return self.result.get('map', {})
@property
def map_size(self) -> int:
return len(self.map)
@property
def perf_metrics(self) -> dict:
return self.result.get('perfMetrics', {})
@property
def short_url(self) -> str:
return self.url[:50] + '...' if len(self.url) > 50 else self.url
@property
def root_id(self) -> str | None:
return self.result.get('rootId')
class DomService:
logger: logging.Logger
@@ -132,160 +95,73 @@ class DomService:
'focusHighlightIndex': focus_element,
'viewportExpansion': viewport_expansion,
'debugMode': debug_mode,
'initialIndex': 0,
}
try:
eval_page: dict = await self.page.evaluate(self.js_code, args)
page_eval_result = PageFrameEvaluationResult(
url=self.page.url,
result=eval_page,
)
except Exception as e:
self.logger.error('Error evaluating JavaScript: %s', e)
raise
frames = [page_eval_result]
total_map_size = page_eval_result.map_size
known_frame_urls = page_eval_result.known_frame_urls
# TODO: only look in iframes from enabled_domains
for iframe in self.page.frames:
if (
iframe.url
and iframe.url != self.page.url
and not iframe.url.startswith('data:')
and iframe.url not in known_frame_urls
):
try:
frame_element = await iframe.frame_element()
except Exception as e:
self.logger.error('Error getting frame element for iframe %s: %s', iframe.url, e)
continue
if not await frame_element.is_visible():
continue
args['initialIndex'] = total_map_size # continue indexing from the last index
try:
name = await frame_element.get_attribute('name')
id = await frame_element.get_attribute('id')
iframe_eval_result = await iframe.evaluate(self.js_code, args)
frame = PageFrameEvaluationResult(
url=iframe.url,
result=iframe_eval_result,
name=name,
id=id,
)
frames.append(frame)
known_frame_urls.append(iframe.url)
known_frame_urls.extend(frame.known_frame_urls)
total_map_size += frame.map_size
except Exception as e:
self.logger.error('Error evaluating JavaScript in iframe %s: %s', iframe.url, e)
continue
# Only log performance metrics in debug mode
if debug_mode and len(frames) > 1:
for index, frame in enumerate(frames):
perf = frame.perf_metrics
if perf:
# Get key metrics for summary
total_nodes = perf.get('nodeMetrics', {}).get('totalNodes', 0)
# processed_nodes = perf.get('nodeMetrics', {}).get('processedNodes', 0)
if debug_mode and 'perfMetrics' in eval_page:
perf = eval_page['perfMetrics']
# Count interactive elements from the DOM map
interactive_count = 0
for node_data in frame.map.values():
if isinstance(node_data, dict) and node_data.get('isInteractive'):
interactive_count += 1
# Get key metrics for summary
total_nodes = perf.get('nodeMetrics', {}).get('totalNodes', 0)
# processed_nodes = perf.get('nodeMetrics', {}).get('processedNodes', 0)
# Create concise summary
self.logger.debug(
f'🔎 Ran buildDOMTree.js interactive element detection on{" iframe" if index > 0 else ""}: %s interactive=%d/%d\n',
frame.short_url,
interactive_count,
total_nodes,
# processed_nodes,
)
# Count interactive elements from the DOM map
interactive_count = 0
if 'map' in eval_page:
for node_data in eval_page['map'].values():
if isinstance(node_data, dict) and node_data.get('isInteractive'):
interactive_count += 1
return await self._construct_dom_tree(frames)
# Create concise summary
url_short = self.page.url[:50] + '...' if len(self.page.url) > 50 else self.page.url
self.logger.debug(
'🔎 Ran buildDOMTree.js interactive element detection on: %s interactive=%d/%d\n',
url_short,
interactive_count,
total_nodes,
# processed_nodes,
)
return await self._construct_dom_tree(eval_page)
@time_execution_async('--construct_dom_tree')
async def _construct_dom_tree(
self,
frames: list[PageFrameEvaluationResult],
eval_page: dict,
) -> tuple[DOMElementNode, SelectorMap]:
# The first page in eval_pages is the main page, and it contains the rootId
js_root_id = frames[0].root_id
if js_root_id is None:
raise ValueError('No rootId found in the evaluated page structure')
js_node_map = eval_page['map']
js_root_id = eval_page['rootId']
selector_map: SelectorMap = {}
node_map: dict[str, DOMBaseNode] = {}
selector_map = {}
node_map = {}
for frame in frames:
js_node_map = frame.map
for id, node_data in js_node_map.items():
node, children_ids = self._parse_node(node_data)
if node is None:
continue
for id, node_data in js_node_map.items():
node, children_ids = self._parse_node(node_data)
if node is None:
continue
node_map[id] = node
node_map[id] = node
if isinstance(node, DOMElementNode) and node.highlight_index is not None:
selector_map[node.highlight_index] = node
if isinstance(node, DOMElementNode) and node.highlight_index is not None:
selector_map[node.highlight_index] = node
# NOTE: We know that we are building the tree bottom up
# and all children are already processed.
if isinstance(node, DOMElementNode):
for child_id in children_ids:
if child_id not in node_map:
continue
child_node = node_map[child_id]
child_node.parent = node
node.children.append(child_node)
# For each child iframe, we need to set the parent of the root element to the iframe element.
for frame in frames[1:]:
content_root_node = node_map.get(frame.root_id)
if content_root_node:
# Find the iframe element in the main page
iframe_element_node = next(
(
node
for node in node_map.values()
if isinstance(node, DOMElementNode)
and node.is_iframe_element(url=frame.url, name=frame.name, id=frame.id)
),
None,
)
if iframe_element_node:
if not iframe_element_node.children:
iframe_element_node.children = [content_root_node]
content_root_node.parent = iframe_element_node
# NOTE: We know that we are building the tree bottom up
# and all children are already processed.
if isinstance(node, DOMElementNode):
for child_id in children_ids:
if child_id not in node_map:
continue
else:
self.logger.warning(
'Iframe element %s already has children, skipping',
frame.short_url,
)
else:
self.logger.warning(
'Could not find iframe element for %s in the main page DOM',
frame.short_url,
)
# If we could not find the iframe element, remove the frame's nodes from the maps.
for id in frame.map.keys():
node = node_map.get(id)
# Remove the node from the selector map if it has a highlight index
if isinstance(node, DOMElementNode) and node.highlight_index is not None and node.highlight_index in selector_map:
del selector_map[node.highlight_index]
child_node = node_map[child_id]
del node_map[id]
child_node.parent = node
node.children.append(child_node)
html_to_dict = node_map[str(js_root_id)]

View File

@@ -223,21 +223,13 @@ class DOMElementNode(DOMBaseNode):
elif isinstance(node, DOMTextNode):
# Add text only if it doesn't have a highlighted parent
if (
node.parent.highlight_index is None and node.parent and node.parent.is_visible and node.parent.is_top_element
node.parent and node.parent.highlight_index is None and node.parent.is_visible and node.parent.is_top_element
): # and node.is_parent_top_element()
formatted_text.append(f'{depth_str}{node.text}')
process_node(self, 0)
return '\n'.join(formatted_text)
def is_iframe_element(self, url: str, name: str | None = None, id: str | None = None) -> bool:
	"""Check whether this node is the <iframe> element matching the given frame url/name/id.

	`name` and `id` are only compared when the caller supplies them.
	"""
	# Must be an iframe whose src matches the frame URL exactly.
	if self.tag_name.lower() != 'iframe' or self.attributes.get('src') != url:
		return False
	if name is not None and self.attributes.get('name') != name:
		return False
	return id is None or self.attributes.get('id') == id
SelectorMap = dict[int, DOMElementNode]

View File

@@ -25,10 +25,17 @@ class FileSystem:
self.todo_file = self.dir / 'todo.md'
self.results_file.touch(exist_ok=True)
self.todo_file.touch(exist_ok=True)
self.extracted_content_count = 0
def get_dir(self) -> Path:
	"""Return the base directory backing this file system."""
	return self.dir
async def save_extracted_content(self, content: str) -> str:
	"""Persist extracted page content into an auto-numbered markdown file.

	Files are named extracted_content_<n>.md; the counter advances only after a
	write attempt, and the status string from write_file is returned unchanged.
	"""
	target_name = f'extracted_content_{self.extracted_content_count}.md'
	write_status = await self.write_file(target_name, content)
	self.extracted_content_count += 1
	return write_status
def _is_valid_filename(self, file_name: str) -> bool:
"""Check if filename matches the required pattern: name.extension"""
pattern = r'^[a-zA-Z0-9_\-]+\.(txt|md)$'
@@ -59,7 +66,7 @@ class FileSystem:
with ThreadPoolExecutor() as executor:
# Run file read in a thread to avoid blocking
content = await asyncio.get_event_loop().run_in_executor(executor, lambda: path.read_text())
return f'Read from file {file_name}:\n{content}'
return f'Read from file {file_name}.\n<content>\n{content}\n</content>'
except Exception:
return f"Error: Could not read file '{file_name}'."
@@ -98,17 +105,88 @@ class FileSystem:
return f"Error: Could not append to file '{file_name}'. {str(e)}"
def describe(self) -> str:
"""List all files with their line counts."""
description = ''
for f in self.dir.iterdir():
if f.is_file():
try:
num_lines = len(f.read_text().splitlines())
description += f'- {f.name}{num_lines} lines\n'
except Exception:
description += f'- {f.name} — [error reading file]\n'
"""List all files with their content information.
return description
Example output:
<file>
results.md - 42 lines
<content>
{preview_start}
... {n_lines} more lines ...
{preview_end}
</content>
</file>
"""
DISPLAY_CHARS = 400 # Total characters to display (split between start and end)
description = ''
for f in self.dir.iterdir():
# Only process files and skip todo.md
if (not f.is_file()) or f.name == 'todo.md':
continue
try:
content = f.read_text()
# Handle empty files
if not content:
description += f'<file>\n{f.name} - [empty file]\n</file>\n\n'
continue
lines = content.splitlines()
line_count = len(lines)
# For small files, display the entire content
whole_file_description = f'<file>\n{f.name} - {line_count} lines\n<content>\n{content}\n</content>\n</file>\n'
if len(content) < int(1.5 * DISPLAY_CHARS):
description += whole_file_description
continue
# For larger files, display start and end previews
half_display_chars = DISPLAY_CHARS // 2
# Get start preview
start_preview = ''
start_line_count = 0
chars_count = 0
for line in lines:
if chars_count + len(line) + 1 > half_display_chars:
break
start_preview += line + '\n'
chars_count += len(line) + 1
start_line_count += 1
# Get end preview
end_preview = ''
end_line_count = 0
chars_count = 0
for line in reversed(lines):
if chars_count + len(line) + 1 > half_display_chars:
break
end_preview = line + '\n' + end_preview
chars_count += len(line) + 1
end_line_count += 1
# Calculate lines in between
middle_line_count = line_count - start_line_count - end_line_count
if middle_line_count <= 0:
# display the entire file
description += whole_file_description
continue
start_preview = start_preview.strip('\n').rstrip()
end_preview = end_preview.strip('\n').rstrip()
# Format output
description += f'<file>\n{f.name} - {line_count} lines\n<content>\n{start_preview}\n'
description += f'... {middle_line_count} more lines ...\n'
description += f'{end_preview}\n'
description += '</content>\n</file>\n'
except Exception:
description += f'<file>\n{f.name} - [error reading file]\n</file>\n\n'
return description.strip('\n')
def get_todo_contents(self) -> str:
	"""Return the current contents of the todo.md file."""
	return self.todo_file.read_text()

View File

@@ -1,11 +1,12 @@
import logging
import os
import sys
from dotenv import load_dotenv
load_dotenv()
from browser_use.config import CONFIG
def addLoggingLevel(levelName, levelNum, methodName=None):
"""
@@ -65,7 +66,7 @@ def setup_logging():
except AttributeError:
pass # Level already exists, which is fine
log_type = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower()
log_type = CONFIG.BROWSER_USE_LOGGING_LEVEL
# Check if handlers are already set up
if logging.getLogger().hasHandlers():

View File

@@ -11,6 +11,8 @@ from datetime import datetime
import httpx
from pydantic import BaseModel
from browser_use.config import CONFIG
# Temporary user ID for pre-auth events (matches cloud backend)
TEMP_USER_ID = '99999999-9999-9999-9999-999999999999'
@@ -25,9 +27,8 @@ class CloudAuthConfig(BaseModel):
@classmethod
def load_from_file(cls) -> 'CloudAuthConfig':
"""Load auth config from local file"""
from browser_use.utils import BROWSER_USE_CONFIG_DIR
config_path = BROWSER_USE_CONFIG_DIR / 'cloud_auth.json'
config_path = CONFIG.BROWSER_USE_CONFIG_DIR / 'cloud_auth.json'
if config_path.exists():
try:
with open(config_path) as f:
@@ -40,11 +41,10 @@ class CloudAuthConfig(BaseModel):
def save_to_file(self) -> None:
"""Save auth config to local file"""
from browser_use.utils import BROWSER_USE_CONFIG_DIR
BROWSER_USE_CONFIG_DIR.mkdir(parents=True, exist_ok=True)
CONFIG.BROWSER_USE_CONFIG_DIR.mkdir(parents=True, exist_ok=True)
config_path = BROWSER_USE_CONFIG_DIR / 'cloud_auth.json'
config_path = CONFIG.BROWSER_USE_CONFIG_DIR / 'cloud_auth.json'
with open(config_path, 'w') as f:
json.dump(self.model_dump(mode='json'), f, indent=2, default=str)
@@ -61,7 +61,7 @@ class DeviceAuthClient:
def __init__(self, base_url: str | None = None, http_client: httpx.AsyncClient | None = None):
# Backend API URL for OAuth requests - can be passed directly or defaults to env var
self.base_url = base_url or os.getenv('BROWSER_USE_CLOUD_URL', 'https://cloud.browser-use.com')
self.base_url = base_url or CONFIG.BROWSER_USE_CLOUD_API_URL
self.client_id = 'library'
self.scope = 'read write'
@@ -124,8 +124,8 @@ class DeviceAuthClient:
async def poll_for_token(
self,
device_code: str,
interval: int = 5,
timeout: int = 1800,
interval: float = 3.0,
timeout: float = 1800.0,
) -> dict | None:
"""
Poll for the access token.
@@ -257,7 +257,7 @@ class DeviceAuthClient:
device_auth = await self.start_device_authorization(agent_session_id)
# Use frontend URL for user-facing links
frontend_url = os.getenv('BROWSER_USE_CLOUD_UI_URL', self.base_url)
frontend_url = CONFIG.BROWSER_USE_CLOUD_UI_URL or self.base_url.replace('//api.', '//cloud.')
# Replace backend URL with frontend URL in verification URIs
verification_uri = device_auth['verification_uri'].replace(self.base_url, frontend_url)
@@ -290,9 +290,13 @@ class DeviceAuthClient:
except Exception as e:
# Log the error details for debugging
if hasattr(e, 'response'):
logger.debug(
f'Failed to get pre-auth token for cloud sync: HTTP {e.response.status_code} - {e.response.text[:200]}'
)
response = getattr(e, 'response')
if hasattr(response, 'status_code') and hasattr(response, 'text'):
logger.debug(
f'Failed to get pre-auth token for cloud sync: HTTP {response.request.url} {response.status_code} - {response.text}'
)
else:
logger.debug(f'Failed to get pre-auth token for cloud sync: {type(e).__name__}: {e}')
else:
logger.debug(f'Failed to get pre-auth token for cloud sync: {type(e).__name__}: {e}')

View File

@@ -5,12 +5,12 @@ Cloud sync service for sending events to the Browser Use cloud.
import asyncio
import json
import logging
import os
import anyio
import httpx
from bubus import BaseEvent
from browser_use.config import CONFIG
from browser_use.sync.auth import TEMP_USER_ID, DeviceAuthClient
logger = logging.getLogger(__name__)
@@ -21,10 +21,10 @@ class CloudSync:
def __init__(self, base_url: str | None = None, enable_auth: bool = True):
# Backend API URL for all API requests - can be passed directly or defaults to env var
self.base_url = base_url or os.getenv('BROWSER_USE_CLOUD_URL', 'https://cloud.browser-use.com')
self.base_url = base_url or CONFIG.BROWSER_USE_CLOUD_API_URL
self.enable_auth = enable_auth
self.auth_client = DeviceAuthClient(base_url=self.base_url) if enable_auth else None
self.pending_events: list[dict] = []
self.pending_events: list[BaseEvent] = []
self.auth_task = None
self.session_id: str | None = None
@@ -33,42 +33,30 @@ class CloudSync:
try:
# Extract session ID from CreateAgentSessionEvent
if event.event_type == 'CreateAgentSession' and hasattr(event, 'id'):
self.session_id = event.id
self.session_id = str(event.id) # type: ignore
# Start authentication flow if enabled and not authenticated
if self.enable_auth and self.auth_client and not self.auth_client.is_authenticated:
# Start auth in background
self.auth_task = asyncio.create_task(self._background_auth(agent_session_id=self.session_id))
# Prepare event data
event_data = self._prepare_event_data(event)
# Send event to cloud
await self._send_event(event_data)
await self._send_event(event)
except Exception as e:
logger.error(f'Failed to handle {event.event_type} event: {type(e).__name__}: {e}', exc_info=True)
def _prepare_event_data(self, event: BaseEvent) -> dict:
"""Prepare event data for cloud API"""
# Get user_id from auth client or use temp ID
user_id = self.auth_client.user_id if self.auth_client else TEMP_USER_ID
# Set user_id directly on event (mutating the event)
# Use setattr to handle cases where user_id might not be a defined field
if hasattr(event, 'user_id') or hasattr(event, '__dict__'):
event.user_id = user_id
else:
logger.debug(f'Could not set user_id on event type {type(event).__name__}')
# Return event directly as dict
return event.model_dump(mode='json')
async def _send_event(self, event_data: dict) -> None:
async def _send_event(self, event: BaseEvent) -> None:
"""Send event to cloud API"""
try:
headers = {}
# override user_id on event with auth client user_id if available
if self.auth_client:
event.user_id = str(self.auth_client.user_id) # type: ignore
else:
event.user_id = TEMP_USER_ID # type: ignore
# Add auth headers if available
if self.auth_client:
headers.update(self.auth_client.get_headers())
@@ -76,29 +64,33 @@ class CloudSync:
# Send event (batch format with direct BaseEvent serialization)
async with httpx.AsyncClient() as client:
response = await client.post(
f'{self.base_url.rstrip("/")}/api/v1/events/',
json={'events': [event_data]},
f'{self.base_url.rstrip("/")}/api/v1/events',
json={'events': [event.model_dump(mode='json')]},
headers=headers,
timeout=10.0,
)
if response.status_code == 401 and self.auth_client and not self.auth_client.is_authenticated:
# Store event for retry after auth
self.pending_events.append(event_data)
self.pending_events.append(event)
elif response.status_code >= 400:
# Log error but don't raise - we want to fail silently
logger.warning(f'Failed to send event to cloud: HTTP {response.status_code} - {response.text[:200]}')
logger.warning(
f'Failed to send event to cloud: POST {response.request.url} {response.status_code} - {response.text}'
)
except httpx.TimeoutException:
logger.warning(f'Event send timed out after 10 seconds - event_type={event_data.get("event_type", "unknown")}')
logger.warning(f'⚠️ Event send timed out after 10 seconds: {event}')
except httpx.ConnectError as e:
logger.warning(f'Failed to connect to cloud service at {self.base_url}: {e}')
logger.warning(f'⚠️ Failed to connect to cloud service at {self.base_url}: {e}')
except httpx.HTTPError as e:
logger.warning(f'HTTP error sending event: {type(e).__name__}: {e}')
logger.warning(f'⚠️ HTTP error sending event {event}: {type(e).__name__}: {e}')
except Exception as e:
logger.warning(f'Unexpected error sending {event_data.get("event_type", "unknown")} event: {type(e).__name__}: {e}')
logger.warning(f'⚠️ Unexpected error sending event {event}: {type(e).__name__}: {e}')
async def _background_auth(self, agent_session_id: str) -> None:
"""Run authentication in background"""
assert self.auth_client, 'enable_auth=True must be set before calling CloudSync_background_auth()'
assert self.session_id, 'session_id must be set before calling CloudSync._background_auth() can fire'
try:
# Run authentication
success = await self.auth_client.authenticate(
@@ -121,15 +113,10 @@ class CloudSync:
if not self.pending_events:
return
# Update user_id in pending events
user_id = self.auth_client.user_id
for event_data in self.pending_events:
event_data['user_id'] = user_id
# Send all pending events
for event_data in self.pending_events:
for event in self.pending_events:
try:
await self._send_event(event_data)
await self._send_event(event)
except Exception as e:
logger.warning(f'Failed to resend pending event: {e}')
@@ -138,11 +125,13 @@ class CloudSync:
async def _update_wal_user_ids(self, session_id: str) -> None:
"""Update user IDs in WAL file after authentication"""
try:
from browser_use.utils import BROWSER_USE_CONFIG_DIR
assert self.auth_client, 'Cloud sync must be authenticated to update WAL user ID'
wal_path = BROWSER_USE_CONFIG_DIR / 'events' / f'{session_id}.jsonl'
wal_path = CONFIG.BROWSER_USE_CONFIG_DIR / 'events' / f'{session_id}.jsonl'
if not await anyio.Path(wal_path).exists():
return
raise FileNotFoundError(
f'CloudSync failed to update saved event user_ids after auth: Agent EventBus WAL file not found: {wal_path}'
)
# Read all events
events = []

View File

@@ -6,11 +6,11 @@ from dotenv import load_dotenv
from posthog import Posthog
from uuid_extensions import uuid7str
from browser_use.telemetry.views import BaseTelemetryEvent
from browser_use.utils import singleton
load_dotenv()
from browser_use.config import CONFIG
from browser_use.telemetry.views import BaseTelemetryEvent
from browser_use.utils import singleton
logger = logging.getLogger(__name__)
@@ -22,8 +22,7 @@ POSTHOG_EVENT_SETTINGS = {
def xdg_cache_home() -> Path:
default = Path.home() / '.cache'
env_var = os.getenv('XDG_CACHE_HOME')
if env_var and (path := Path(env_var)).is_absolute():
if CONFIG.XDG_CACHE_HOME and (path := Path(CONFIG.XDG_CACHE_HOME)).is_absolute():
return path
return default
@@ -44,8 +43,8 @@ class ProductTelemetry:
_curr_user_id = None
def __init__(self) -> None:
telemetry_disabled = os.getenv('ANONYMIZED_TELEMETRY', 'true').lower() == 'false'
self.debug_logging = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() == 'debug'
telemetry_disabled = not CONFIG.ANONYMIZED_TELEMETRY
self.debug_logging = CONFIG.BROWSER_USE_LOGGING_LEVEL == 'debug'
if telemetry_disabled:
self._posthog_client = None

View File

@@ -12,6 +12,11 @@ from sys import stderr
from typing import Any, ParamSpec, TypeVar
from urllib.parse import urlparse
from dotenv import load_dotenv
load_dotenv()
logger = logging.getLogger(__name__)
# Import error types - these may need to be adjusted based on actual import paths
@@ -21,11 +26,10 @@ except ImportError:
OpenAIBadRequestError = None
try:
from groq import BadRequestError as GroqBadRequestError
from groq import BadRequestError as GroqBadRequestError # type: ignore[import-not-found]
except ImportError:
GroqBadRequestError = None
# Browser Use configuration directory
BROWSER_USE_CONFIG_DIR = Path.home() / '.config' / 'browseruse'
# Global flag to prevent duplicate exit messages
_exiting = False
@@ -539,25 +543,25 @@ def handle_llm_error(e: Exception) -> tuple[dict[str, Any], Any | None]:
Handle LLM API errors and extract failed generation data when available.
Args:
e: The exception that occurred during LLM API call
e: The exception that occurred during LLM API call
Returns:
Tuple containing:
- response: Dict with 'raw' and 'parsed' keys
- parsed: Parsed data (None if extraction was needed)
Tuple containing:
- response: Dict with 'raw' and 'parsed' keys
- parsed: Parsed data (None if extraction was needed)
Raises:
LLMException: If the error is not a recognized type with failed generation data
LLMException: If the error is not a recognized type with failed generation data
"""
# Handle OpenAI BadRequestError with failed_generation
if (
OpenAIBadRequestError
and isinstance(e, OpenAIBadRequestError)
and hasattr(e, 'body')
and e.body
and 'failed_generation' in e.body
and e.body # type: ignore[attr-defined]
and 'failed_generation' in e.body # type: ignore[operator]
):
raw = e.body['failed_generation']
raw = e.body['failed_generation'] # type: ignore[index]
response = {'raw': raw, 'parsed': None}
parsed = None
logger.debug(f'Failed to do tool call, trying to parse raw response: {raw}')
@@ -565,14 +569,16 @@ def handle_llm_error(e: Exception) -> tuple[dict[str, Any], Any | None]:
# Handle Groq BadRequestError with failed_generation
if (
GroqBadRequestError
GroqBadRequestError is not None
and isinstance(e, GroqBadRequestError)
and hasattr(e, 'body')
and e.body
and 'error' in e.body
and 'failed_generation' in e.body['error']
and e.body # type: ignore[attr-defined]
and isinstance(e.body, dict) # type: ignore[attr-defined]
and 'error' in e.body # type: ignore[attr-defined]
and isinstance(e.body['error'], dict) # type: ignore[attr-defined,index]
and 'failed_generation' in e.body['error'] # type: ignore[attr-defined,index]
):
raw = e.body['error']['failed_generation'] # type: ignore
raw = e.body['error']['failed_generation'] # type: ignore[attr-defined,index]
response = {'raw': raw, 'parsed': None}
parsed = None
logger.debug(f'Failed to do tool call, trying to parse raw response: {raw}')
@@ -599,7 +605,7 @@ def get_browser_use_version() -> str:
match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', content)
if match:
version = f'{match.group(1)}'
os.environ['LIBRARY_VERSION'] = version
os.environ['LIBRARY_VERSION'] = version # used by bubus event_schema so all Event schemas include versioning
return version
# If pyproject.toml doesn't exist, try getting version from pip

View File

@@ -30,8 +30,8 @@ agent = Agent('fill out the form on this page', browser_session=browser_session)
## `BrowserSession`
- 🎭 `BrowserSession(**params)` is Browser Use's object that tracks a playwright connection to a running browser. It sets up:
- the `playwright` library, `browser` and/or `browser_context`, and `page` objects and tracks which tabs the agent & human are focused on
- `BrowserSession(**params)` is Browser Use's object that tracks a connection to a running browser. It sets up:
- the `playwright`, `browser`, `browser_context`, and `page` objects and tracks which tabs the agent/human are focused on
- methods to interact with the browser window, apply config needed by the Agent, and run the `DOMService` for element detection
- it can take a `browser_profile=BrowserProfile(...)` template containing some config defaults, and `**kwargs` session-specific config overrides
@@ -271,28 +271,38 @@ Glob patterns are supported:
disable_security: bool = False
```
Completely disables all basic browser security features. Allows interacting across cross-site iframe boundaries.
<Warning>
This option is very INSECURE and is only for niche use cases. DO NOT LET YOUR AGENT visit untrusted URLs or give it real cookies when `disable_security=True`.
Visiting a single malicious site in this mode can trivially compromise *all* the cookies in the browser profile in under 1 second.
⚠️ Setting this to `True` is NOT RECOMMENDED.
It completely disables all basic browser security features.
</Warning>
This option is for debugging and interacting across cross-origin iFrames when there are no cookies or sensitive data in use.
It's very INSECURE, under no circumstances should you enable this while using real cookies or sensitive data, visiting a single untrusted URL in this mode can immediately compromise all the profile cookies instantly. Consider a less nuclear option like `bypass_csp=True` instead.
#### `deterministic_rendering`
```python
deterministic_rendering: bool = False
```
Attempts to force more deterministic rendering for consistent screenshots across different host operating systems and hardware.
Disables OS-specific font hints, aliasing, GPU-accelerated rendering, normalizes DPI, and sets a specific JS random seed to try to avoid nondeterministic JS.
<Warning>
This flag is for niche use cases (e.g. screenshot diffing) where pixel-perfect rendering across different server operating systems is more important than stability.
It makes the agent more likely to be blocked as a bot and triggers some glitchy behavior in chrome occasionally, it's not recommended unless you know you need it.
⚠️ Setting this to `True` is NOT RECOMMENDED.
It can be glitchy & slow, and it increases chances of getting blocked by anti-bot systems. It's mostly useful for QA applications.
</Warning>
It's a shortcut for adding these launch args:
- `--deterministic-mode`
- `--js-flags=--random-seed=1157259159`
- `--force-color-profile=srgb`
- `--font-render-hinting=none`
- `--force-device-scale-factor=2`
- `--enable-webgl`
With these options fonts will look slightly worse than on macOS and slightly better than on Windows, but rendering will be more consistent between OSs and runs. The cost is performance and stability. Software rendering is slower, easier to fingerprint as a bot, and sometimes glitchy. You likely *don't need this option* unless you're trying to do screenshot diffing.
#### `highlight_elements`
```python
@@ -388,7 +398,23 @@ No need to set this unless you have multiple profiles set up in a single `user_d
window_position: dict | None = {"width": 0, "height": 0}
```
Window position from top-left.
Window position from top-left corner.
#### `save_recording_path`
```python
save_recording_path: str | None = None
```
Directory path for saving video recordings.
#### `trace_path`
```python
trace_path: str | None = None
```
Directory path for saving Agent trace files. Files are automatically named as `{trace_path}/{context_id}.zip`.
---
@@ -550,7 +576,7 @@ These control how the browser waits for CDP API calls to complete and pages to l
default_timeout: float | None = None
```
Default timeout for Playwright operations in milliseconds.
Default timeout for Playwright operations in milliseconds (e.g. `10000` if you want 10s).
#### `default_navigation_timeout`
@@ -558,7 +584,7 @@ Default timeout for Playwright operations in milliseconds.
default_navigation_timeout: float | None = None
```
Default timeout for page navigation in milliseconds.
Default timeout for page navigation in milliseconds (e.g. `30000` if you want 30s).
### Playwright Viewport Options
@@ -571,7 +597,7 @@ Configure browser window size, viewport, and display properties:
user_agent: str | None = None
```
Specific user agent to use in this context.
Specific user agent to use in this context. See [`playwright.devices`](https://playwright.dev/python/docs/emulation).
#### `is_mobile`
@@ -603,7 +629,7 @@ Geolocation coordinates. Example: `{"latitude": 59.95, "longitude": 30.31667}`
locale: str | None = None
```
Specify user locale, for example en-GB, de-DE, etc. Locale will affect the navigator.language value, Accept-Language request header value as well as number and date formatting rules.
Specify user locale, for example `en-GB`, `de-DE`, etc. Locale will affect the `navigator.language` value, `Accept-Language` request header value as well as number and date formatting rules.
#### `timezone_id`
@@ -611,7 +637,7 @@ Specify user locale, for example en-GB, de-DE, etc. Locale will affect the navig
timezone_id: str | None = None
```
Timezone identifier (e.g., 'America/New_York').
Timezone identifier (e.g. `'America/New_York'` or `'UTC'`).
#### `window_size`
@@ -646,7 +672,7 @@ A viewport is *always* used in headless mode regardless of this setting, and is
device_scale_factor: float | None = None
```
Device scale factor (DPI). Useful for high-resolution screenshots (set it to 2).
Device scale factor (DPI). Useful for high-resolution screenshots (set it to 2 or 3).
#### `screen`
@@ -743,7 +769,11 @@ Whether to ignore HTTPS errors when sending network requests.
bypass_csp: bool = False
```
Toggles bypassing Content-Security-Policy.
<Warning>
Enabling this can increase security risk and makes the bot very easy to fingerprint. (Cloudflare, Datadome, etc. will block you)
</Warning>
Toggles bypassing Content-Security-Policy. Enabling reduces some CSP-related errors that can arise from automation scripts injected into pages with strict policies that forbid inline scripts.
#### `java_script_enabled`
@@ -751,6 +781,10 @@ Toggles bypassing Content-Security-Policy.
java_script_enabled: bool = True
```
<Warning>
Not recommended, untested with Browser Use and likely breaks things.
</Warning>
Whether or not to enable JavaScript in the context.
#### `service_workers`

564
eval/judge_system.py Normal file
View File

@@ -0,0 +1,564 @@
"""
@file purpose: Comprehensive judge system for evaluating browser-use agent runs with detailed structured feedback.
This system provides multi-dimensional evaluation of agent performance including:
- Task analysis and categorization
- Trajectory quality assessment
- Tool usage effectiveness
- Agent reasoning quality
- Browser handling capabilities
- Structured error categorization
- Actionable improvement suggestions
The judge uses vision-language models to analyze agent execution history, screenshots,
and final results to provide detailed structured JSON feedback for developers.
"""
import asyncio
import base64
import io
import json
import logging
from dataclasses import asdict, dataclass
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any
from langchain_core.language_models.chat_models import BaseChatModel
from PIL import Image
logger = logging.getLogger(__name__)
class ErrorCategory(Enum):
    """Failure categories the judge can assign to an agent run.

    Values are the snake_case strings the judge LLM is instructed to emit in
    its JSON response; parse_judge_response maps the raw strings back to
    members via ErrorCategory(value), skipping unknown ones with a warning.
    """

    # Access & Authentication
    BLOCKED_ACCESS = 'blocked_access'
    CAPTCHA_CHALLENGE = 'captcha_challenge'
    LOGIN_REQUIRED = 'login_required'
    RATE_LIMITED = 'rate_limited'
    # Tool & Action Failures
    TOOL_MISUSE = 'tool_misuse'
    INVALID_PARAMETERS = 'invalid_parameters'
    ACTION_SEQUENCE_ERROR = 'action_sequence_error'
    # Agent Behavior Issues
    INFINITE_LOOP = 'infinite_loop'
    STUCK_PATTERN = 'stuck_pattern'
    POOR_PLANNING = 'poor_planning'
    CONTEXT_LOSS = 'context_loss'
    # Browser & Technical
    ELEMENT_NOT_FOUND = 'element_not_found'
    CLICK_FAILURE = 'click_failure'
    LOAD_TIMEOUT = 'load_timeout'
    JAVASCRIPT_ERROR = 'javascript_error'
    # Content & Understanding
    MISUNDERSTOOD_TASK = 'misunderstood_task'
    FORMAT_ERROR = 'format_error'
    CONTENT_PARSING_ERROR = 'content_parsing_error'
    # Enhanced Detection Categories
    NAVIGATION_CONFUSION = 'navigation_confusion'
    FORM_FILLING_ERROR = 'form_filling_error'
    MODAL_HANDLING = 'modal_handling'
    IFRAME_ISSUES = 'iframe_issues'
    BROWSER_CRASHES = 'browser_crashes'
    IMPOSSIBLE_TASK = 'impossible_task'
    MISSING_INFORMATION = 'missing_information'
class TaskCategory(Enum):
    """High-level categories a task can be classified into by the judge.

    Values match the snake_case category names listed in the judge prompt.
    A single task may receive several categories; parse_judge_response maps
    the LLM's raw strings back to members and drops unknown ones.
    """

    EXTRACTION = 'extraction'
    INTERACTION = 'interaction'
    LOGIN = 'login'
    RESEARCH = 'research'
    SHOPPING = 'shopping'
    BOOKING = 'booking'
    COMPARISON = 'comparison'
    QA_TESTING = 'qa_testing'
    FORM_FILLING = 'form_filling'
    NAVIGATION = 'navigation'
    SEARCH = 'search'
    FILTERING = 'filtering'
    CONTENT_CREATION = 'content_creation'
    FILE_OPERATIONS = 'file_operations'
    MULTI_STEP_WORKFLOW = 'multi_step_workflow'
@dataclass
class ScoreBreakdown:
    """Per-dimension quality scores for a single agent run.

    Each field is an integer on a 1-100 scale, as produced by the judge LLM;
    parse_judge_response defaults any missing dimension to 50.
    """

    trajectory_quality: int  # How human-like is the solution path (1-100)
    tool_calling_effectiveness: int  # How well do tools work (1-100)
    agent_reasoning: int  # Quality of agent's decision making (1-100)
    browser_handling: int  # Browser stability and error handling (1-100)
    task_satisfaction: int  # Final user satisfaction (1-100)
@dataclass
class JudgeResult:
    """Full structured evaluation of one agent run, parsed from the judge LLM.

    Instances are serialized with dataclasses.asdict and stored under the
    'comprehensive_judge_evaluation' key of a task's result.json (see
    evaluate_task_with_comprehensive_judge).
    """

    # Basic Information
    task_summary: str  # 1 sentence summary
    task_clarity_score: int  # How clear vs uncertain the task is (1-100)
    task_categories: list[TaskCategory]  # Primary task categories
    # Analysis
    reasoning: str  # What went well/not well analysis
    error_categories: list[ErrorCategory]  # Core error categories identified
    # Scores
    scores: ScoreBreakdown
    final_score: int  # Overall score (1-100)
    passed: bool  # Whether it meets 70% threshold
    # Developer Feedback
    improvement_tips: list[str]  # Concrete improvement suggestions
    critical_issues: list[str]  # Must-fix issues
    # Metadata
    evaluation_timestamp: str  # ISO-8601 timestamp of when the evaluation ran
def encode_image(image_path: str) -> str:
    """Load an image file and return it as a base64-encoded JPEG string.

    Args:
        image_path: Path to the image file on disk (typically a screenshot).

    Returns:
        Base64-encoded JPEG bytes decoded as UTF-8, or '' if the file cannot
        be read or converted (the error is logged rather than raised, so a
        single bad screenshot never aborts an evaluation).
    """
    try:
        with Image.open(image_path) as image:
            # JPEG supports neither alpha nor palette modes; convert ANY
            # non-RGB mode (RGBA, P, LA, ...), not just RGBA, otherwise
            # palette-based PNG screenshots crash the save() call below.
            if image.mode != 'RGB':
                image = image.convert('RGB')
            buffered = io.BytesIO()
            image.save(buffered, format='JPEG')
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
    except Exception as e:
        logger.error(f'Failed to encode image {image_path}: {e}')
        return ''
def truncate_text(text: str, max_length: int) -> str:
    """Cap text at max_length characters, marking any cut with a trailing '...'."""
    if len(text) > max_length:
        return f'{text[: max_length - 3]}...'
    return text
def prepare_agent_steps(complete_history: list[dict]) -> list[str]:
    """Extract and format agent steps, limiting each to 2000 characters."""
    formatted: list[str] = []
    for step_num, step in enumerate(complete_history, start=1):
        parts = [f'Step {step_num}:\n']

        # Summarize the model's output for this step, when present
        model_output = step.get('model_output')
        if model_output and isinstance(model_output, dict):
            if 'action' in model_output:
                parts.append(f'Actions: {json.dumps(model_output["action"], indent=1)}\n')
            if 'current_state' in model_output:
                parts.append(f'State: {model_output["current_state"]}\n')

        # Summarize action results (extracted content and errors)
        for result_num, result in enumerate(step.get('result') or [], start=1):
            if isinstance(result, dict):
                if result.get('extracted_content'):
                    parts.append(f'Result {result_num}: {result["extracted_content"]}\n')
                if result.get('error'):
                    parts.append(f'Error {result_num}: {result["error"]}\n')

        # Record the page URL the step ended on, when known
        url = step.get('state', {}).get('url')
        if url:
            parts.append(f'URL: {url}\n')

        # Cap each step at 2000 characters, replacing the tail with '...'
        text = ''.join(parts)
        formatted.append(text if len(text) <= 2000 else f'{text[:1997]}...')
    return formatted
async def comprehensive_judge(
    task: str,
    complete_history: list[dict],
    final_result: str,
    screenshot_paths: list[str],
    model: BaseChatModel,
    max_images: int = 10,
) -> JudgeResult:
    """
    Comprehensive judge that evaluates browser-use agent runs with detailed structured feedback.

    Args:
        task: The original task description given to the agent.
        complete_history: Per-step history dicts (model output, results, state).
        final_result: The agent's final answer text (may be empty or None).
        screenshot_paths: Paths to step screenshots; only the last
            `max_images` existing files are attached to the prompt.
        model: Vision-capable chat model used as the judge.
        max_images: Maximum number of screenshots to attach.

    Returns:
        A JudgeResult. On any failure (LLM error, unparseable JSON) a
        zero-score fallback result is returned instead of raising.
    """
    # Prepare inputs with length limits (40k chars for task/result, 2k per
    # step) to keep the judge prompt within model context limits
    task_truncated = truncate_text(task, 40000)
    final_result_truncated = truncate_text(final_result or 'No final result', 40000)
    agent_steps = prepare_agent_steps(complete_history)
    # Select last N images — the end of the run is usually most informative
    selected_images = screenshot_paths[-max_images:] if screenshot_paths else []
    # Encode images as base64 JPEG data URLs; missing/unreadable files are skipped
    encoded_images = []
    for img_path in selected_images:
        if Path(img_path).exists():
            encoded_img = encode_image(img_path)
            if encoded_img:
                encoded_images.append(
                    {
                        'type': 'image_url',
                        'image_url': {
                            'url': f'data:image/jpeg;base64,{encoded_img}',
                            'detail': 'high',
                        },
                    }
                )
    # Construct the evaluation prompt
    system_prompt = """You are an expert judge evaluating browser automation agent performance.
Your task is to comprehensively analyze the agent's execution and provide structured feedback.
**EVALUATION CRITERIA:**
1. **Task Analysis**: Understand what the user wanted to accomplish
2. **Trajectory Quality**: How human-like and efficient was the solution path?
3. **Tool Usage**: How effectively were browser automation tools used?
4. **Agent Reasoning**: Quality of decision-making and problem-solving
5. **Browser Handling**: How well were browser issues handled?
6. **Final Outcome**: Did the task satisfy the user's intent?
**ERROR CATEGORIES TO CONSIDER:**
- Access & Authentication: blocked_access, captcha_challenge, login_required, rate_limited
- Tool & Action Failures: tool_misuse, invalid_parameters, action_sequence_error
- Agent Behavior: infinite_loop, stuck_pattern, poor_planning, context_loss
- Browser & Technical: element_not_found, click_failure, load_timeout, javascript_error
- Content & Understanding: misunderstood_task, format_error, content_parsing_error
- Enhanced: navigation_confusion, form_filling_error, modal_handling, iframe_issues, browser_crashes, impossible_task, missing_information
**TASK CATEGORIES TO CONSIDER:**
extraction, interaction, login, research, shopping, booking, comparison, qa_testing, form_filling, navigation, search, filtering, content_creation, file_operations, multi_step_workflow
- You can use multiple categories for the same task.
- You can also add other categories if they fit better.
**TASK CLARITY SCORE:**
- is the task very clear step by step like a recipe (high score) or very vague and uncertain (low score)
**IMPROVEMENT TIPS:**
- Think how to get this task done better. Create actionable tips - but they should be understandable for a developer who does not know the task.
- These tips will be avg across many tasks and then the most common / problemetic will be used to improve the browser-use agent.
- In browser-use we convert websites to text so that the agent can understand it. In there we mark interactive elements with [index] and then the agent can chose to interact with them and we click then the actual css selector. Sometimes this conversion is not perfect.
- After the agent takes an action it gets the new state and its previous thinking, and outputs the next action. Which we then execute again.
- So we can improve the agent system prompt, input context, tool calls to interact with the browser, or our extraction layer to convert the website to text.
- always first mention the error this would fix and then the improvement tip.
**SCORING SCALE:**
- 90-100: Excellent execution, human-like, minimal issues
- 80-89: Good execution with minor issues
- 70-79: Acceptable execution, some problems but functional
- 60-69: Poor execution with significant issues
- 1-59: Failed execution, major problems
**PASS THRESHOLD: 70%**
Respond with EXACTLY this JSON structure (no additional text):
{
  "task_summary": "One sentence summary of what the task was trying to accomplish",
  "task_categories": ["category1", "category2"],
  "task_clarity_score": 85,
  "reasoning": "Detailed analysis of what went well and what didn't, trajectory quality, planning assessment",
  "error_categories": ["error1", "error2"],
  "scores": {
    "trajectory_quality": 75,
    "tool_calling_effectiveness": 80,
    "agent_reasoning": 85,
    "browser_handling": 65,
    "task_satisfaction": 70
  },
  "final_score": 75,
  "critical_issues": [
    "Critical issue that must be fixed 1",
    "Critical issue that must be fixed 2"
  ],
  "improvement_tips": [
    "Specific actionable improvement 1",
    "Specific actionable improvement 2"
  ]
}"""
    user_prompt = f"""**TASK:** {task_truncated}
**AGENT EXECUTION STEPS:**
{chr(10).join(agent_steps)}
**FINAL RESULT:**
{final_result_truncated}
**TOTAL STEPS:** {len(complete_history)}
**SCREENSHOTS PROVIDED:** {len(selected_images)}
Analyze this execution and respond with the exact JSON structure requested."""
    # Build messages: one system message plus a single user message that
    # carries the text prompt followed by the image parts
    content_parts = [{'type': 'text', 'text': user_prompt}]
    content_parts.extend(encoded_images)
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': content_parts},
    ]
    # Get structured response
    try:
        # model.invoke is synchronous; run it off the event loop in a thread
        response = await asyncio.to_thread(model.invoke, messages)
        # Parse the JSON response
        # Handle both string and list content types
        if isinstance(response.content, list):
            response_text = str(response.content[0]) if response.content else ''
        else:
            response_text = str(response.content)
        response_text = response_text.strip()
        # Try to extract JSON if wrapped in markdown fences (```json or ```)
        if '```json' in response_text:
            json_start = response_text.find('```json') + 7
            json_end = response_text.find('```', json_start)
            if json_end != -1:
                response_text = response_text[json_start:json_end].strip()
        elif '```' in response_text:
            json_start = response_text.find('```') + 3
            json_end = response_text.find('```', json_start)
            if json_end != -1:
                response_text = response_text[json_start:json_end].strip()
        # Parse JSON
        try:
            result_dict = json.loads(response_text)
        except json.JSONDecodeError as e:
            logger.error(f'Failed to parse JSON response: {e}')
            logger.error(f'Response text: {response_text}')
            # Create fallback result rather than propagating the parse error
            return create_fallback_result(task, 'Failed to parse judge response')
        # Convert to structured result
        return parse_judge_response(result_dict, task)
    except Exception as e:
        logger.error(f'Judge evaluation failed: {e}')
        return create_fallback_result(task, str(e))
def parse_judge_response(result_dict: dict, task: str) -> JudgeResult:
    """Convert the judge LLM's raw JSON dict into a structured JudgeResult.

    Unknown category strings are logged and skipped; missing scores default
    to 50. Any structural failure falls back to a zero-score result via
    create_fallback_result instead of raising.
    """
    try:
        # Map raw category strings onto enum members, dropping unknowns
        task_categories = []
        for raw in result_dict.get('task_categories', []):
            try:
                task_categories.append(TaskCategory(raw))
            except ValueError:
                logger.warning(f'Unknown task category: {raw}')

        error_categories = []
        for raw in result_dict.get('error_categories', []):
            try:
                error_categories.append(ErrorCategory(raw))
            except ValueError:
                logger.warning(f'Unknown error category: {raw}')

        # Build the per-dimension score breakdown, defaulting to 50
        raw_scores = result_dict.get('scores', {})
        scores = ScoreBreakdown(
            **{
                dimension: raw_scores.get(dimension, 50)
                for dimension in (
                    'trajectory_quality',
                    'tool_calling_effectiveness',
                    'agent_reasoning',
                    'browser_handling',
                    'task_satisfaction',
                )
            }
        )

        final_score = result_dict.get('final_score', 50)
        return JudgeResult(
            task_summary=result_dict.get('task_summary', 'Task analysis unavailable'),
            task_clarity_score=result_dict.get('task_clarity_score', 50),
            task_categories=task_categories,
            reasoning=result_dict.get('reasoning', 'Analysis unavailable'),
            error_categories=error_categories,
            scores=scores,
            final_score=final_score,
            passed=final_score >= 70,  # 70% pass threshold
            improvement_tips=result_dict.get('improvement_tips', []),
            critical_issues=result_dict.get('critical_issues', []),
            evaluation_timestamp=datetime.now().isoformat(),
        )
    except Exception as e:
        logger.error(f'Failed to parse judge response: {e}')
        return create_fallback_result(task, 'Failed to parse structured response')
def create_fallback_result(task: str, error_msg: str) -> JudgeResult:
    """Build the zero-score JudgeResult used whenever the judge itself fails."""
    # All dimensions score 0 when evaluation could not be performed
    zeroed_scores = ScoreBreakdown(
        trajectory_quality=0,
        tool_calling_effectiveness=0,
        agent_reasoning=0,
        browser_handling=0,
        task_satisfaction=0,
    )
    return JudgeResult(
        task_summary=f'Failed to analyze task: {task[:100]}...',
        task_clarity_score=0,
        task_categories=[TaskCategory.QA_TESTING],
        reasoning=f'Evaluation failed: {error_msg}',
        error_categories=[ErrorCategory.IMPOSSIBLE_TASK],
        scores=zeroed_scores,
        final_score=0,
        passed=False,
        improvement_tips=['Fix evaluation system'],
        critical_issues=[f'Evaluation system failure: {error_msg}'],
        evaluation_timestamp=datetime.now().isoformat(),
    )
async def judge_with_retry(
    task: str,
    complete_history: list[dict],
    final_result: str,
    screenshot_paths: list[str],
    model: BaseChatModel,
    max_retries: int = 3,
    max_images: int = 10,
) -> JudgeResult:
    """
    Judge with retry logic for robustness.

    Retries comprehensive_judge with exponential backoff (1s, 2s, 4s, ...);
    after max_retries failures a zero-score fallback result is returned.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            return await comprehensive_judge(
                task,
                complete_history,
                final_result,
                screenshot_paths,
                model,
                max_images,
            )
        except Exception as e:
            attempt += 1
            if attempt >= max_retries:
                logger.error(f'Judge failed after {max_retries} attempts: {e}')
                return create_fallback_result(task, str(e))
            logger.warning(f'Judge attempt {attempt} failed, retrying: {e}')
            # Exponential backoff: 1s after the first failure, then 2s, 4s, ...
            await asyncio.sleep(2 ** (attempt - 1))
    # Fallback return (should never reach here given the logic above, but ensures type safety)
    return create_fallback_result(task, 'Max retries exceeded without proper error handling')
def get_example_json_structure() -> dict:
    """Get an example of the expected JSON response structure for the LLM judge."""
    # Per-dimension example scores (1-100 scale)
    example_scores = {
        'trajectory_quality': 75,
        'tool_calling_effectiveness': 80,
        'agent_reasoning': 85,
        'browser_handling': 65,
        'task_satisfaction': 70,
    }
    example_reasoning = (
        'The agent successfully navigated to the target website and extracted most product information. '
        'However, it had difficulty with dynamic loading elements and missed some prices that loaded '
        'asynchronously. The overall approach was logical but could benefit from better wait strategies.'
    )
    return {
        'task_summary': 'Extract product prices from an e-commerce website',
        'task_clarity_score': 85,
        'task_categories': ['extraction', 'research'],
        'reasoning': example_reasoning,
        'error_categories': ['element_not_found', 'load_timeout'],
        'scores': example_scores,
        'final_score': 75,
        'critical_issues': [
            'Missing wait for dynamic content to load',
            'No fallback strategy when primary selectors fail',
        ],
        'improvement_tips': [
            'Browser not loaded: Implement better wait strategies for dynamic content',
            'Element not found: Add retry logic for element detection',
            'No error message: Improve error handling for the tool click element',
        ],
    }
def _read_result_file(result_file: Path) -> dict[str, Any]:
"""Helper function to read result file synchronously."""
with open(result_file) as f:
return json.load(f)
def _write_result_file(result_file: Path, result_data: dict[str, Any]) -> None:
"""Helper function to write result file synchronously."""
with open(result_file, 'w') as f:
f.write(json.dumps(result_data, indent=2, default=str))
# Integration helper function
async def evaluate_task_with_comprehensive_judge(task_folder: Path, model: BaseChatModel, max_images: int = 10) -> dict[str, Any]:
    """
    Evaluate a task result using the comprehensive judge system.

    Reads {task_folder}/result.json, runs the judge (unless a cached
    'comprehensive_judge_evaluation' is already present), and writes the
    judge output back into the same file.

    Returns a dictionary with both the old format for compatibility
    and the new comprehensive analysis.
    """

    def _outcome(judge_payload: Any, error: str | None) -> dict[str, Any]:
        # Uniform return shape for every exit path
        return {
            'task_id': task_folder.name,
            'comprehensive_judge': judge_payload,
            'error': error,
        }

    result_file = task_folder / 'result.json'
    if not result_file.exists():
        return _outcome(None, 'No result.json found')

    try:
        # Load existing result off the event loop thread
        result_data = await asyncio.to_thread(_read_result_file, result_file)

        # Reuse a cached comprehensive judge result if one already exists
        cached = result_data.get('comprehensive_judge_evaluation')
        if cached:
            return _outcome(cached, None)

        # Run comprehensive evaluation on the data extracted from result.json
        judge_result = await judge_with_retry(
            task=result_data.get('task', 'Unknown task'),
            complete_history=result_data.get('complete_history', []),
            final_result=result_data.get('final_result_response', ''),
            screenshot_paths=result_data.get('screenshot_paths', []),
            model=model,
            max_images=max_images,
        )

        # Persist the evaluation back into result.json (off-thread write)
        judge_dict = asdict(judge_result)
        result_data['comprehensive_judge_evaluation'] = judge_dict
        await asyncio.to_thread(_write_result_file, result_file, result_data)

        return _outcome(judge_dict, None)
    except Exception as e:
        logger.error(f'Comprehensive judge evaluation failed for {task_folder.name}: {e}')
        return _outcome(None, str(e))

File diff suppressed because it is too large Load Diff

View File

@@ -12,14 +12,17 @@ load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
from browser_use.browser.profile import BrowserProfile
from browser_use.browser.session import BrowserSession
async def main():
browser_session = BrowserSession(
keep_alive=True,
user_data_dir=None,
headless=False,
browser_profile=BrowserProfile(
keep_alive=True,
user_data_dir=None,
headless=False,
)
)
await browser_session.start()

View File

@@ -1,3 +1,4 @@
# pyright: reportMissingImports=false
import asyncio
import os
import shutil
@@ -14,6 +15,7 @@ from imgcat import imgcat
from langchain_openai import ChatOpenAI
from browser_use.browser import BrowserSession
from browser_use.browser.profile import BrowserProfile
from browser_use.browser.types import async_patchright
llm = ChatOpenAI(model='gpt-4o')
@@ -28,11 +30,13 @@ async def main():
# Default Playwright Chromium Browser
normal_browser_session = BrowserSession(
# executable_path=<defaults to playwright builtin browser stored in ms-cache directory>,
user_data_dir=None,
headless=False,
stealth=False,
# deterministic_rendering=False,
# disable_security=False,
browser_profile=BrowserProfile(
user_data_dir=None,
headless=False,
stealth=False,
# deterministic_rendering=False,
# disable_security=False,
)
)
await normal_browser_session.start()
await normal_browser_session.create_new_tab('https://abrahamjuliot.github.io/creepjs/')
@@ -45,11 +49,13 @@ async def main():
patchright_browser_session = BrowserSession(
# cdp_url='wss://browser.zenrows.com?apikey=your-api-key-here&proxy_region=na',
# or try anchor browser, browserless, steel.dev, browserbase, oxylabs, brightdata, etc.
user_data_dir='~/.config/browseruse/profiles/stealth',
stealth=True,
headless=False,
disable_security=False,
deterministic_rendering=False,
browser_profile=BrowserProfile(
user_data_dir='~/.config/browseruse/profiles/stealth',
stealth=True,
headless=False,
disable_security=False,
deterministic_rendering=False,
)
)
await patchright_browser_session.start()
await patchright_browser_session.create_new_tab('https://abrahamjuliot.github.io/creepjs/')
@@ -62,11 +68,13 @@ async def main():
if Path('/Applications/Brave Browser.app/Contents/MacOS/Brave Browser').is_file():
print('\n\nBRAVE BROWSER:')
brave_browser_session = BrowserSession(
executable_path='/Applications/Brave Browser.app/Contents/MacOS/Brave Browser',
headless=False,
disable_security=False,
user_data_dir='~/.config/browseruse/profiles/brave',
deterministic_rendering=False,
browser_profile=BrowserProfile(
executable_path='/Applications/Brave Browser.app/Contents/MacOS/Brave Browser',
headless=False,
disable_security=False,
user_data_dir='~/.config/browseruse/profiles/brave',
deterministic_rendering=False,
)
)
await brave_browser_session.start()
await brave_browser_session.create_new_tab('https://abrahamjuliot.github.io/creepjs/')
@@ -78,12 +86,14 @@ async def main():
if Path('/Applications/Brave Browser.app/Contents/MacOS/Brave Browser').is_file():
print('\n\nBRAVE + PATCHRIGHT STEALTH BROWSER:')
brave_patchright_browser_session = BrowserSession(
executable_path='/Applications/Brave Browser.app/Contents/MacOS/Brave Browser',
playwright=patchright,
headless=False,
disable_security=False,
user_data_dir=None,
deterministic_rendering=False,
browser_profile=BrowserProfile(
executable_path='/Applications/Brave Browser.app/Contents/MacOS/Brave Browser',
headless=False,
disable_security=False,
user_data_dir=None,
deterministic_rendering=False,
),
# **patchright.devices['iPhone 13'], # emulate other devices: https://playwright.dev/python/docs/emulation
)
await brave_patchright_browser_session.start()

View File

@@ -25,14 +25,16 @@ from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import SecretStr
from browser_use import Agent, Controller
from browser_use.browser import BrowserSession
from browser_use.browser import BrowserProfile, BrowserSession
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
raise ValueError('GOOGLE_API_KEY is not set')
browser_session = BrowserSession(
headless=False,
browser_profile=BrowserProfile(
headless=False,
),
cdp_url='http://localhost:9222',
)
controller = Controller()
@@ -41,6 +43,8 @@ controller = Controller()
async def main():
task = 'In docs.google.com write my Papa a quick thank you for everything letter \n - Magnus'
task += ' and save the document as pdf'
# Assert api_key is not None to satisfy type checker
assert api_key is not None, 'GOOGLE_API_KEY must be set'
model = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
agent = Agent(
task=task,

View File

@@ -53,12 +53,15 @@ async def example_custom_window_size():
actual_content_size = await page.evaluate("""() => ({width: window.innerWidth, height: window.innerHeight})""")
if profile.viewport:
expected_page_size = profile.viewport
expected_page_size = dict(profile.viewport)
elif profile.window_size:
expected_page_size = {
'width': profile.window_size['width'],
'height': profile.window_size['height'] - 87,
} # 87px is the height of the navbar, title, rim ish
else:
# Default expected size if neither viewport nor window_size is set
expected_page_size = {'width': 800, 'height': 600}
_log_size = lambda size: f'{size["width"]}x{size["height"]}px'
print(f'Expected {_log_size(expected_page_size)} vs actual {_log_size(actual_content_size)}')
@@ -95,7 +98,10 @@ async def example_no_viewport_option():
# Get viewport size (inner dimensions)
viewport = await page.evaluate('() => ({width: window.innerWidth, height: window.innerHeight})')
print(f'Configured size: width={profile.window_size["width"]}, height={profile.window_size["height"]}')
if profile.window_size:
print(f'Configured size: width={profile.window_size["width"]}, height={profile.window_size["height"]}')
else:
print('No window size configured')
print(f'Actual viewport size: {viewport}')
# Get the actual window size (outer dimensions)
@@ -118,7 +124,11 @@ async def example_no_viewport_option():
def validate_window_size(configured: dict[str, Any], actual: dict[str, Any]) -> None:
"""Compare configured window size with actual size and report differences"""
"""Compare configured window size with actual size and report differences.
Raises:
Exception: If the window size difference exceeds tolerance
"""
# Allow for small differences due to browser chrome, scrollbars, etc.
width_diff = abs(configured['width'] - actual['width'])
height_diff = abs(configured['height'] - actual['height'])
@@ -133,6 +143,8 @@ def validate_window_size(configured: dict[str, Any], actual: dict[str, Any]) ->
else:
print('✅ Window size validation passed: actual size matches configured size within tolerance')
return None
async def main():
"""Run all window sizing examples"""

View File

@@ -9,7 +9,7 @@ from dotenv import load_dotenv
load_dotenv()
import pyotp
import pyotp # type: ignore
from langchain_openai import ChatOpenAI
from browser_use import ActionResult, Agent, Controller

View File

@@ -1,5 +1,5 @@
import asyncio
import http
import http.client
import json
import os
import sys

View File

@@ -46,7 +46,7 @@ def b64_to_png(b64_string: str, output_file):
import json
from pathlib import Path
import prettyprinter
import prettyprinter # type: ignore
from fastapi import FastAPI, Request
prettyprinter.install_extras()
@@ -124,7 +124,7 @@ load_dotenv()
import requests
from langchain_openai import ChatOpenAI
from pyobjtojson import obj_to_json
from pyobjtojson import obj_to_json # type: ignore
from browser_use import Agent
@@ -148,14 +148,13 @@ async def record_activity(agent_obj):
extracted_content_json_last_elem = None
print('--- ON_STEP_START HOOK ---')
website_html: str = await agent_obj.browser_context.get_page_html()
website_screenshot: str = await agent_obj.browser_context.take_screenshot()
website_html = await agent_obj.browser_context.get_page_html()
website_screenshot = await agent_obj.browser_context.take_screenshot()
print('--> History:')
if hasattr(agent_obj, 'state'):
history = agent_obj.state.history
else:
history = None
# Assert agent has state to satisfy type checker
assert hasattr(agent_obj, 'state'), 'Agent must have state attribute'
history = agent_obj.state.history
model_thoughts = obj_to_json(obj=history.model_thoughts(), check_circular=False)

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = ["browser-use", "mistralai"]
# ///
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import asyncio
import logging
from langchain_openai import ChatOpenAI
from mistralai import Mistral # type: ignore
from pydantic import BaseModel, Field
from browser_use import Agent, Controller
from browser_use.agent.views import ActionResult
from browser_use.browser.context import BrowserContext
if not os.getenv('OPENAI_API_KEY'):
raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')
if not os.getenv('MISTRAL_API_KEY'):
raise ValueError('MISTRAL_API_KEY is not set. Please add it to your environment variables.')
logger = logging.getLogger(__name__)
controller = Controller()
class PdfExtractParams(BaseModel):
url: str = Field(description='URL to a PDF document')
@controller.registry.action(
'Extract PDF Text',
param_model=PdfExtractParams,
)
def extract_mistral_ocr(params: PdfExtractParams, browser: BrowserContext) -> ActionResult:
"""
Process a PDF URL using Mistral OCR API and return the OCR response.
Args:
url: URL to a PDF document
Returns:
OCR response object from Mistral API
"""
api_key = os.getenv('MISTRAL_API_KEY')
client = Mistral(api_key=api_key)
response = client.ocr.process(
model='mistral-ocr-latest',
document={
'type': 'document_url',
'document_url': params.url,
},
include_image_base64=False,
)
markdown = '\n\n'.join(f'### Page {i + 1}\n{response.pages[i].markdown}' for i in range(len(response.pages)))
return ActionResult(
extracted_content=markdown,
include_in_memory=False, ## PDF content can be very large, so we don't include it in memory
)
async def main():
	"""Drive an agent through the PDF-extraction task and log the final result."""
	pdf_task = """
Objective: Navigate to the following URL, extract its contents using the Extract PDF Text action, and explain its historical significance.
URL: https://docs.house.gov/meetings/GO/GO00/20220929/115171/HHRG-117-GO00-20220929-SD010.pdf
"""
	agent = Agent(
		task=pdf_task,
		llm=ChatOpenAI(model='gpt-4o'),
		controller=controller,
	)
	logger.info(await agent.run())


if __name__ == '__main__':
	asyncio.run(main())

View File

@@ -10,7 +10,13 @@ from dotenv import load_dotenv
load_dotenv()
import anyio
from lmnr import Laminar
try:
Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY'))
except Exception:
pass
from langchain_openai import ChatOpenAI
from browser_use import Agent, Controller
@@ -22,9 +28,7 @@ logger = logging.getLogger(__name__)
controller = Controller()
@controller.action(
'Upload file to interactive element with file path ',
)
@controller.action('Upload file to interactive element with file path')
async def upload_file(index: int, path: str, browser_session: BrowserSession, available_file_paths: list[str]):
if path not in available_file_paths:
return ActionResult(error=f'File path {path} is not available')
@@ -57,18 +61,6 @@ async def upload_file(index: int, path: str, browser_session: BrowserSession, av
return ActionResult(error=msg)
@controller.action('Read the file content of a file given a path')
async def read_file(path: str, available_file_paths: list[str]):
if path not in available_file_paths:
return ActionResult(error=f'File path {path} is not available')
async with await anyio.open_file(path, 'r') as f:
content = await f.read()
msg = f'File content: {content}'
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True)
def create_file(file_type: str = 'txt'):
with open(f'tmp.{file_type}', 'w') as f:
f.write('test')
@@ -79,11 +71,10 @@ def create_file(file_type: str = 'txt'):
async def main():
task = 'Go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and - read the file content and upload them to fields'
task = 'Go to https://www.freepdfconvert.com/, upload the file tmp.pdf into the field choose a file - dont click the fileupload button'
task = 'Go to https://www.freepdfconvert.com/, upload the file tmp.pdf into the field choose a file - dont click the fileupload button'
available_file_paths = [create_file('txt'), create_file('pdf'), create_file('csv')]
model = ChatOpenAI(model='gpt-4o')
model = ChatOpenAI(model='gpt-4.1-mini')
agent = Agent(
task=task,
llm=model,

View File

@@ -17,7 +17,7 @@ controller = Controller()
@controller.registry.action('Done with task ')
async def done(text: str):
import yagmail
import yagmail # type: ignore
# To send emails use
# STEP 1: go to https://support.google.com/accounts/answer/185833

View File

@@ -10,7 +10,7 @@ from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from onepassword.client import Client # pip install onepassword-sdk
from onepassword.client import Client # type: ignore # pip install onepassword-sdk
from browser_use import ActionResult, Agent, Controller

View File

@@ -8,7 +8,7 @@ from dotenv import load_dotenv
load_dotenv()
from amazoncaptcha import AmazonCaptcha
from amazoncaptcha import AmazonCaptcha # type: ignore
from langchain_openai import ChatOpenAI
from browser_use import ActionResult

View File

@@ -59,19 +59,22 @@ async def run_agent_with_memory_config(
# Let's refine how to access summaries. The summary is added as a 'memory' type message.
summaries_created = []
for step_messages in agent.message_manager.state.history.get_messages():
if isinstance(step_messages, list):
for msg in step_messages:
if (
hasattr(msg, 'additional_kwargs')
and msg.additional_kwargs.get('metadata', {}).get('message_type') == 'memory'
):
summaries_created.append(msg.content)
elif (
hasattr(step_messages, 'additional_kwargs')
and step_messages.additional_kwargs.get('metadata', {}).get('message_type') == 'memory'
): # if it's a list of messages
summaries_created.append(step_messages.content)
for item in agent.message_manager.state.history.get_messages():
# get_messages() returns tuples of (step_number, messages)
if isinstance(item, tuple) and len(item) == 2:
step_number, step_messages = item
if isinstance(step_messages, list):
for msg in step_messages:
if (
hasattr(msg, 'additional_kwargs')
and msg.additional_kwargs.get('metadata', {}).get('message_type') == 'memory'
):
summaries_created.append(msg.content)
elif (
hasattr(step_messages, 'additional_kwargs')
and step_messages.additional_kwargs.get('metadata', {}).get('message_type') == 'memory'
):
summaries_created.append(step_messages.content)
if summaries_created:
print('\nProcedural Summaries Created during run:')
@@ -169,5 +172,9 @@ if __name__ == '__main__':
import sys
if sys.platform.startswith('win'):
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
# WindowsProactorEventLoopPolicy is only available on Windows
try:
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) # type: ignore
except AttributeError:
pass # Not on Windows, ignore
asyncio.run(main())

View File

@@ -18,11 +18,16 @@ api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
raise ValueError('GOOGLE_API_KEY is not set')
assert api_key is not None, 'GOOGLE_API_KEY must be set'
llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
from browser_use.browser import BrowserProfile
browser_session = BrowserSession(
downloads_path='~/Downloads',
user_data_dir='~/.config/browseruse/profiles/default',
browser_profile=BrowserProfile(
downloads_path='~/Downloads',
user_data_dir='~/.config/browseruse/profiles/default',
)
)

View File

@@ -11,13 +11,15 @@ load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use.agent.service import Agent
from browser_use.browser import BrowserSession
from browser_use.browser import BrowserProfile, BrowserSession
browser_session = BrowserSession(
keep_alive=True,
headless=False,
save_recording_path='./tmp/recordings',
user_data_dir='~/.config/browseruse/profiles/default',
browser_profile=BrowserProfile(
keep_alive=True,
headless=False,
record_video_dir='./tmp/recordings',
user_data_dir='~/.config/browseruse/profiles/default',
)
)
llm = ChatOpenAI(model='gpt-4o')

View File

@@ -22,7 +22,7 @@ async def main():
async with BrowserSession(
browser_profile=BrowserProfile(
headless=False,
trace_path='./tmp/result_processing',
traces_dir='./tmp/result_processing',
window_size={'width': 1280, 'height': 1000},
user_data_dir='~/.config/browseruse/profiles/default',
)

View File

@@ -19,7 +19,7 @@ llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
async def main():
browser_session = BrowserSession(
browser_profile=BrowserProfile(
trace_path='./tmp/traces/',
traces_dir='./tmp/traces/',
user_data_dir='~/.config/browseruse/profiles/default',
)
)

View File

@@ -11,6 +11,7 @@ load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
from browser_use.browser import BrowserProfile
# Initialize the model
llm = ChatOpenAI(
@@ -25,7 +26,8 @@ llm = ChatOpenAI(
company_credentials = {'company_username': 'user@example.com', 'company_password': 'securePassword123'}
# Map the same credentials to multiple domains for secure access control
sensitive_data = {
# Type annotation to satisfy pyright
sensitive_data: dict[str, str | dict[str, str]] = {
'https://example.com': company_credentials,
'https://admin.example.com': company_credentials,
'https://*.example-staging.com': company_credentials,
@@ -40,8 +42,10 @@ task = 'Go to example.com and login with company_username and company_password'
from browser_use.browser.session import BrowserSession
browser_session = BrowserSession(
allowed_domains=list(sensitive_data.keys())
+ ['https://*.trusted-partner.com'] # Domain patterns from sensitive_data + additional allowed domains
browser_profile=BrowserProfile(
allowed_domains=list(sensitive_data.keys())
+ ['https://*.trusted-partner.com'] # Domain patterns from sensitive_data + additional allowed domains
)
)
agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data, browser_session=browser_session)

View File

@@ -1,3 +1,16 @@
"""
EXPERIMENTAL: Integration example with Stagehand (browserbase)
This example shows how to combine browser-use with Stagehand for advanced browser automation.
Note: This requires the stagehand-py library to be installed separately:
pip install stagehand-py
The exact API may vary depending on the stagehand-py version.
Please refer to the official Stagehand documentation for the latest usage:
https://pypi.org/project/stagehand-py/
https://github.com/browserbase/stagehand-python-examples/
"""
import asyncio
import os
@@ -5,7 +18,7 @@ from dotenv import load_dotenv
load_dotenv()
from stagehand import Stagehand, StagehandConfig
from stagehand import Stagehand, StagehandConfig # type: ignore
from browser_use.agent.service import Agent
@@ -14,18 +27,14 @@ async def main():
# Configure Stagehand
# https://pypi.org/project/stagehand-py/
# https://github.com/browserbase/stagehand-python-examples/blob/main/agent_example.py
config = StagehandConfig(
env='BROWSERBASE',
api_key=os.getenv('BROWSERBASE_API_KEY'),
project_id=os.getenv('BROWSERBASE_PROJECT_ID'),
headless=False,
dom_settle_timeout_ms=3000,
model_name='gpt-4o',
self_heal=True,
wait_for_captcha_solves=True,
system_prompt='You are a browser automation assistant that helps users navigate websites effectively.',
model_client_options={'model_api_key': os.getenv('OPENAI_API_KEY')},
verbose=2,
# Note: This example requires the stagehand-py library to be installed
# pip install stagehand-py
# Create StagehandConfig with correct parameters
# The exact parameters depend on the stagehand-py version
config = StagehandConfig( # type: ignore
apiKey=os.getenv('BROWSERBASE_API_KEY'),
projectId=os.getenv('BROWSERBASE_PROJECT_ID'),
)
# Create a Stagehand client using the configuration object.
@@ -40,18 +49,21 @@ async def main():
print(f'\nCreated new session: {stagehand.session_id}')
print(f'🌐 View your live browser: https://www.browserbase.com/sessions/{stagehand.session_id}')
await stagehand.page.goto('https://google.com/')
await stagehand.page.act('search for openai')
# Check if stagehand has a page attribute
if hasattr(stagehand, 'page') and stagehand.page:
await stagehand.page.goto('https://google.com/')
await stagehand.page.act('search for openai')
else:
print('Warning: Stagehand page not available')
# Combine with Browser Use
agent = Agent(task='click the first result', page=stagehand.page)
agent = Agent(task='click the first result', page=stagehand.page) # type: ignore
await agent.run()
# go back and forth
await stagehand.page.act('open the 3 first links on the page in new tabs')
await stagehand.page.act('open the 3 first links on the page in new tabs') # type: ignore
await Agent(task='click the first result', page=stagehand.page).run()
await Agent(task='click the first result', page=stagehand.page).run() # type: ignore
if __name__ == '__main__':

View File

@@ -7,8 +7,8 @@ from dotenv import load_dotenv
load_dotenv()
import discord
from discord.ext import commands
import discord # type: ignore
from discord.ext import commands # type: ignore
from langchain_core.language_models.chat_models import BaseChatModel
from browser_use.agent.service import Agent
@@ -56,7 +56,7 @@ class DiscordBot(commands.Bot):
self.browser_profile = browser_profile
# Define intents.
intents = discord.Intents.default()
intents = discord.Intents.default() # type: ignore
intents.message_content = True # Enable message content intent
intents.members = True # Enable members intent for user info

View File

@@ -11,9 +11,9 @@ load_dotenv()
from fastapi import Depends, FastAPI, HTTPException, Request
from langchain_core.language_models.chat_models import BaseChatModel
from slack_sdk.errors import SlackApiError
from slack_sdk.signature import SignatureVerifier
from slack_sdk.web.async_client import AsyncWebClient
from slack_sdk.errors import SlackApiError # type: ignore
from slack_sdk.signature import SignatureVerifier # type: ignore
from slack_sdk.web.async_client import AsyncWebClient # type: ignore
from browser_use.agent.service import Agent
from browser_use.browser import BrowserProfile, BrowserSession

View File

@@ -15,6 +15,7 @@ from dotenv import load_dotenv
load_dotenv()
from langchain_openai import AzureChatOpenAI
from pydantic import SecretStr
from browser_use import Agent
@@ -28,7 +29,7 @@ if not azure_openai_api_key or not azure_openai_endpoint:
# Initialize the Azure OpenAI client
llm = AzureChatOpenAI(
model='gpt-4o',
api_key=azure_openai_api_key,
api_key=SecretStr(azure_openai_api_key) if azure_openai_api_key else None,
azure_endpoint=azure_openai_endpoint, # Corrected to use azure_endpoint instead of openai_api_base
api_version='2024-08-01-preview', # Explicitly set the API version here
)

View File

@@ -1,3 +1,4 @@
# pyright: reportMissingImports=false
"""
Automated news analysis and sentiment scoring using Bedrock.
@@ -17,9 +18,9 @@ from dotenv import load_dotenv
load_dotenv()
import boto3
import boto3 # type: ignore
from botocore.config import Config
from langchain_aws import ChatBedrockConverse
from langchain_aws import ChatBedrockConverse # type: ignore
from browser_use import Agent
from browser_use.browser import BrowserProfile, BrowserSession

View File

@@ -9,13 +9,15 @@ from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pydantic import SecretStr
from browser_use import Agent
groq_api_key = os.environ.get('GROQ_API_KEY')
llm = ChatOpenAI(
model='meta-llama/llama-4-maverick-17b-128e-instruct',
base_url='https://api.groq.com/openai/v1',
api_key=os.environ.get('GROQ_API_KEY'),
api_key=SecretStr(groq_api_key) if groq_api_key else None,
temperature=0.0,
)

View File

@@ -1,3 +1,4 @@
# pyright: reportMissingImports=false
import asyncio
import os
import sys
@@ -10,7 +11,7 @@ from dotenv import load_dotenv
load_dotenv()
# Third-party imports
import gradio as gr
import gradio as gr # type: ignore
from langchain_openai import ChatOpenAI
from rich.console import Console
from rich.panel import Panel
@@ -52,6 +53,8 @@ def parse_agent_history(history_str: str) -> None:
console.print(panel)
console.print()
return None
async def run_browser_task(
task: str,
@@ -70,8 +73,8 @@ async def run_browser_task(
llm=ChatOpenAI(model='gpt-4o'),
)
result = await agent.run()
# TODO: The result cloud be parsed better
return result
# TODO: The result could be parsed better
return str(result)
except Exception as e:
return f'Error: {str(e)}'

View File

@@ -15,7 +15,7 @@ from dotenv import load_dotenv
load_dotenv()
import streamlit as st
import streamlit as st # type: ignore
from browser_use import Agent
from browser_use.browser import BrowserSession
@@ -48,6 +48,7 @@ def get_llm(provider: str):
else:
st.error(f'Unsupported provider: {provider}')
st.stop()
return None # Never reached, but helps with type checking
# Function to initialize the agent
@@ -58,7 +59,7 @@ def initialize_agent(query: str, provider: str):
return Agent(
task=query,
llm=llm,
llm=llm, # type: ignore
controller=controller,
browser_session=browser_session,
use_vision=True,

View File

@@ -20,7 +20,7 @@ load_dotenv()
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel, SecretStr
from PyPDF2 import PdfReader
from PyPDF2 import PdfReader # type: ignore
from browser_use import ActionResult, Agent, Controller
from browser_use.browser import BrowserProfile, BrowserSession

View File

@@ -39,8 +39,8 @@ async def main():
browser_profile=BrowserProfile(
executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
user_data_dir='~/.config/browseruse/profiles/default',
keep_alive=True,
),
keep_alive=True,
)
async with browser_session:

View File

@@ -16,7 +16,7 @@ load_dotenv()
import asyncio
import logging
import chess
import chess # type: ignore
from bs4 import BeautifulSoup
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
@@ -64,7 +64,7 @@ def parse_transform(style: str) -> tuple[float, float] | None:
return x_px_str, y_px_str
except Exception as e:
logger.error(f'Error parsing transform style: {e}')
return None, None
return None
def algebraic_to_pixels(square: str, square_size: float) -> tuple[str, str]:
@@ -107,9 +107,12 @@ async def calculate_square_size(page) -> float | None:
raise ValueError('No pieces found.')
x_coords: set[float] = set()
for piece in pieces:
style = piece.get('style')
if hasattr(piece, 'get'):
style = piece.get('style') # type: ignore
else:
continue
if style:
coords = parse_transform(style)
coords = parse_transform(style) # type: ignore
if coords:
x_coords.add(coords[0])
@@ -151,7 +154,7 @@ def create_fen_board(board_state: dict) -> str:
return fen
async def get_current_board_info(page) -> tuple[str | None, float]:
async def get_current_board_info(page) -> tuple[str | None, float | None]:
"""Reads the current board HTML and returns FEN string and square size."""
board_state = {}
board_html = ''
@@ -172,16 +175,18 @@ async def get_current_board_info(page) -> tuple[str | None, float]:
soup = BeautifulSoup(board_html, 'html.parser')
pieces = soup.find_all('piece')
for piece in pieces:
style = piece.get('style')
class_ = piece.get('class')
if not hasattr(piece, 'get'):
continue
style = piece.get('style') # type: ignore
class_ = piece.get('class') # type: ignore
if style and class_:
coords = parse_transform(style)
coords = parse_transform(style) # type: ignore
if coords:
x_px, y_px = coords
try:
square = pixels_to_algebraic(x_px, y_px, square_size)
board_state[square] = get_piece_symbol(class_)
board_state[square] = get_piece_symbol(class_) # type: ignore
except ValueError as ve:
logger.error(f'Error: {ve}')
@@ -257,7 +262,7 @@ async def play_move(params: PlayMoveParams, browser: BrowserContext):
try:
current_fen, square_size = await get_current_board_info(page)
if not current_fen or not square_size:
if not current_fen or square_size is None:
return ActionResult(extracted_content='Failed to get current FEN or square size to play move.')
board = chess.Board(current_fen)

View File

@@ -55,8 +55,11 @@ class TwitterConfig:
# Customize these settings
openai_key = os.getenv('OPENAI_API_KEY')
assert openai_key is not None, 'OPENAI_API_KEY must be set'
config = TwitterConfig(
openai_api_key=os.getenv('OPENAI_API_KEY'),
openai_api_key=openai_key,
chrome_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', # This is for MacOS (Chrome)
target_user='XXXXX',
message='XXXXX',
@@ -66,7 +69,9 @@ config = TwitterConfig(
def create_twitter_agent(config: TwitterConfig) -> Agent:
llm = ChatOpenAI(model=config.model, api_key=config.openai_api_key)
from pydantic import SecretStr
llm = ChatOpenAI(model=config.model, api_key=SecretStr(config.openai_api_key))
browser_profile = BrowserProfile(
headless=config.headless,
@@ -80,7 +85,7 @@ def create_twitter_agent(config: TwitterConfig) -> Agent:
full_message = f'@{config.target_user} {config.message}'
# Create the agent with detailed instructions
return Agent(
agent = Agent(
task=f"""Navigate to Twitter and create a post and reply to a tweet.
Here are the specific steps:
@@ -106,12 +111,12 @@ def create_twitter_agent(config: TwitterConfig) -> Agent:
controller=controller,
browser_session=browser_session,
)
return agent
async def post_tweet(agent: Agent):
try:
await agent.run(max_steps=100)
agent.create_history_gif()
print('Tweet posted successfully!')
except Exception as e:
print(f'Error posting tweet: {str(e)}')

View File

@@ -2,7 +2,7 @@
name = "browser-use"
description = "Make websites accessible for AI agents"
authors = [{ name = "Gregor Zunic" }]
version = "0.3.1"
version = "0.3.2"
readme = "README.md"
requires-python = ">=3.11,<4.0"
classifiers = [
@@ -13,8 +13,7 @@ classifiers = [
dependencies = [
"aiofiles>=24.1.0",
"anyio>=4.9.0",
"bubus>=1.1.0",
"faiss-cpu>=1.11.0",
"bubus>=1.1.2",
"google-api-core>=2.25.0",
"httpx>=0.28.1",
"langchain-anthropic==0.3.15",
@@ -53,6 +52,8 @@ dependencies = [
[project.optional-dependencies]
memory = [
# sentence-transformers: depends on pytorch, which does not support python 3.13 yet
# faiss-cpu: >= 1.11.0 breaks on some macOS hosts, make sure to test before upgrading
"faiss-cpu>=1.10.0",
"sentence-transformers>=4.0.2",
]
cli = [
@@ -63,7 +64,7 @@ cli = [
examples = [
# botocore: only needed for Bedrock Claude boto3 examples/models/bedrock_claude.py
"botocore>=1.37.23",
"langchain-aws>=0.2.24",
# "langchain-aws>=0.2.24", # depends on version of numpy that doesnt have python 3.12 wheels yet, breaks CI
"imgcat>=0.6.0",
"stagehand-py>=0.3.6",
"browserbase>=0.4.0",
@@ -114,7 +115,9 @@ docstring-code-line-length = 140
skip-magic-trailing-comma = false
[tool.pyright]
typeCheckingMode = "off"
typeCheckingMode = "basic"
exclude = ["tests/old/", ".venv/", ".git/", "__pycache__/"]
[tool.hatch.build]
include = [
@@ -123,6 +126,7 @@ include = [
"!browser_use/**/tests.py",
"browser_use/agent/system_prompt.md",
"browser_use/dom/buildDomTree.js",
"!tests/**/*.py",
]
[tool.pytest.ini_options]
@@ -153,6 +157,15 @@ log_level = "DEBUG"
allow-direct-references = true
[tool.uv]
# required-environments = [
# "sys_platform == 'darwin' and platform_machine == 'arm64'",
# "sys_platform == 'darwin' and platform_machine == 'x86_64'",
# "sys_platform == 'linux' and platform_machine == 'x86_64'",
# "sys_platform == 'linux' and platform_machine == 'aarch64'",
# # "sys_platform == 'linux' and platform_machine == 'arm64'", # no pytorch wheels available yet
# "sys_platform == 'win32' and platform_machine == 'x86_64'",
# # "sys_platform == 'win32' and platform_machine == 'arm64'", # no pytorch wheels available yet
# ]
dev-dependencies = [
"ruff>=0.11.2",
"tokencost>=0.1.16",
@@ -169,6 +182,6 @@ dev-dependencies = [
"codespell>=2.4.1",
"pyright>=1.1.399",
"ty>=0.0.1a1",
"pytest-xdist>=3.7.0"
"pytest-xdist>=3.7.0",
# "pytest-playwright-asyncio>=0.7.0", # not actually needed I think
]

226
tests/ci/conftest.py Normal file
View File

@@ -0,0 +1,226 @@
"""
Pytest configuration for browser-use CI tests.
Sets up environment variables to ensure tests never connect to production services.
"""
import os
import tempfile
from unittest.mock import AsyncMock, MagicMock
import pytest
from dotenv import load_dotenv
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import AIMessage
from pytest_httpserver import HTTPServer
# Load environment variables before any imports
load_dotenv()
# Skip LLM API key verification for tests
os.environ['SKIP_LLM_API_KEY_VERIFICATION'] = 'true'
from bubus import BaseEvent
from browser_use import Agent
from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.sync.service import CloudSync
@pytest.fixture(autouse=True)
def setup_test_environment():
	"""
	Automatically set up the test environment for all tests.

	Points BROWSER_USE_CONFIG_DIR at a throwaway directory and forces safe
	environment values so tests never contact production services. On teardown
	the original environment is restored and the temp config dir is removed
	(the original version leaked one mkdtemp directory per test).
	"""
	import shutil

	# Create a temporary directory for test config
	config_dir = tempfile.mkdtemp(prefix='browseruse_tests_')

	original_env = {}
	test_env_vars = {
		'SKIP_LLM_API_KEY_VERIFICATION': 'true',
		'ANONYMIZED_TELEMETRY': 'false',
		'BROWSER_USE_CLOUD_SYNC': 'true',
		'BROWSER_USE_CLOUD_API_URL': 'http://placeholder-will-be-replaced-by-specific-test-fixtures',
		'BROWSER_USE_CLOUD_UI_URL': 'http://placeholder-will-be-replaced-by-specific-test-fixtures',
		'BROWSER_USE_CONFIG_DIR': config_dir,
	}

	# Apply test values, remembering what was set before.
	for key, value in test_env_vars.items():
		original_env[key] = os.environ.get(key)
		os.environ[key] = value

	yield

	# Restore original environment
	for key, value in original_env.items():
		if value is None:
			os.environ.pop(key, None)
		else:
			os.environ[key] = value

	# Remove the temporary config dir so repeated runs don't accumulate leftovers.
	shutil.rmtree(config_dir, ignore_errors=True)
# Plain helper (not a fixture) so callers can pass arguments; the mock_llm
# fixture below wraps it with the no-args default.
def create_mock_llm(actions=None):
	"""Build a mock chat model that replays canned agent actions.

	Args:
		actions: Optional list of JSON strings to return, one per call, in order.
			When exhausted (or when None), every further call yields a 'done' action.

	Returns:
		AsyncMock speced as BaseChatModel with invoke/ainvoke and
		with_structured_output wired to serve the canned responses.
	"""
	mock = AsyncMock(spec=BaseChatModel)
	mock.model_name = 'mock-llm'
	mock._verified_api_keys = True
	mock._verified_tool_calling_method = 'raw'
	# mock._verified_tool_calling_method = 'function_calling'

	# Canonical terminal response used when no scripted action remains.
	default_done_action = """
	{
		"thinking": "null",
		"evaluation_previous_goal": "Successfully completed the task",
		"memory": "Task completed",
		"next_goal": "Task completed",
		"action": [
			{
				"done": {
					"text": "Task completed successfully",
					"success": true
				}
			}
		]
	}
	"""

	if actions is None:
		# Unscripted: always answer with the done action.
		async def _always_done(*args, **kwargs):
			return AIMessage(content=default_done_action)

		mock.invoke.return_value = AIMessage(content=default_done_action)
		mock.ainvoke.side_effect = _always_done
	else:
		# Scripted: serve actions in order, falling back to the done action.
		cursor = 0

		def _next_content():
			nonlocal cursor
			if cursor < len(actions):
				content = actions[cursor]
				cursor += 1
				return content
			return default_done_action

		def _sync_invoke(*args, **kwargs):
			return AIMessage(content=_next_content())

		async def _async_invoke(*args, **kwargs):
			return AIMessage(content=_next_content())

		mock.invoke.side_effect = _sync_invoke
		mock.ainvoke.side_effect = _async_invoke

		# with_structured_output hands back the raw JSON; the agent builds its
		# own AgentOutput/ActionModel classes and parses the content itself.
		structured = MagicMock()

		async def _structured_ainvoke(*args, **kwargs):
			return {
				'raw': AIMessage(content=_next_content()),
				'parsed': None,  # let the agent parse it from the raw JSON
			}

		structured.ainvoke = AsyncMock(side_effect=_structured_ainvoke)
		mock.with_structured_output = lambda *args, **kwargs: structured

	return mock
@pytest.fixture(scope='module')
async def browser_session():
	"""Module-scoped real browser session, started once and shared by tests."""
	profile = BrowserProfile(
		headless=True,
		user_data_dir=None,  # use a temporary directory
		keep_alive=True,
	)
	session = BrowserSession(browser_profile=profile)
	await session.start()
	yield session
	await session.stop()
@pytest.fixture(scope='function')
def cloud_sync(httpserver: HTTPServer):
	"""
	Provide a CloudSync instance wired to the local test HTTP server.

	Rewrites the cloud-related environment variables to point at httpserver so
	no test traffic can ever reach production endpoints.
	"""
	base_url = httpserver.url_for('')
	for var in ('BROWSER_USE_CLOUD_API_URL', 'BROWSER_USE_CLOUD_UI_URL'):
		os.environ[var] = base_url
	os.environ['BROWSER_USE_CLOUD_SYNC'] = 'true'

	# Auth is disabled by default; individual tests can override if needed.
	return CloudSync(
		base_url=base_url,
		enable_auth=False,
	)
@pytest.fixture(scope='function')
def mock_llm():
	"""A mock LLM that immediately replies with the default 'done' action."""
	return create_mock_llm()
@pytest.fixture(scope='function')
def agent_with_cloud(browser_session, mock_llm, cloud_sync):
	"""An Agent wired to the real CloudSync test instance (cloud sync enabled)."""
	return Agent(
		task='Test task',
		llm=mock_llm,
		browser_session=browser_session,
		cloud_sync=cloud_sync,
	)
@pytest.fixture(scope='function')
def event_collector():
	"""Provide a helper object that records every event emitted during a test."""

	class EventCollector:
		def __init__(self):
			# Events in arrival order, plus a parallel list of their type names.
			self.events: list[BaseEvent] = []
			self.event_order: list[str] = []

		async def collect_event(self, event: BaseEvent):
			self.events.append(event)
			self.event_order.append(event.event_type)
			return 'collected'

		def get_events_by_type(self, event_type: str) -> list[BaseEvent]:
			return [e for e in self.events if e.event_type == event_type]

		def clear(self):
			self.events.clear()
			self.event_order.clear()

	return EventCollector()

View File

@@ -141,7 +141,7 @@ Reply in JSON with keys: success (true/false), explanation (string).
If the agent provided no output, explain what might have gone wrong.
"""
structured_llm = judge_llm.with_structured_output(JudgeResponse)
judge_response = await structured_llm.ainvoke(judge_prompt)
judge_response: JudgeResponse = await structured_llm.ainvoke(judge_prompt) # type: ignore[assignment]
result = {
'file': os.path.basename(task_file),

View File

@@ -1,76 +0,0 @@
"""Mock utilities for testing browser-use."""
from unittest.mock import AsyncMock
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import AIMessage
def create_mock_llm(actions=None):
"""Create a mock LLM that returns specified actions or a default done action.
Args:
actions: Optional list of JSON strings representing actions to return in sequence.
If not provided, returns a single done action.
After all actions are exhausted, returns a done action.
Returns:
Mock LLM that will return the actions in order, or just a done action if no actions provided.
"""
mock = AsyncMock(spec=BaseChatModel)
mock._verified_api_keys = True
mock._verified_tool_calling_method = 'raw'
mock.model_name = 'mock-llm'
# Default done action
default_done_action = """
{
"thinking": "null",
"evaluation_previous_goal": "Successfully completed the task",
"memory": "Task completed",
"next_goal": "Task completed",
"action": [
{
"done": {
"text": "Task completed successfully",
"success": true
}
}
]
}
"""
if actions is None:
# No actions provided, just return done action
mock.invoke.return_value = AIMessage(content=default_done_action)
async def async_invoke(*args, **kwargs):
return AIMessage(content=default_done_action)
mock.ainvoke.side_effect = async_invoke
else:
# Actions provided, return them in sequence
action_index = 0
def get_next_action():
nonlocal action_index
if action_index < len(actions):
action = actions[action_index]
action_index += 1
return action
else:
return default_done_action
# Mock the invoke method
def mock_invoke(*args, **kwargs):
return AIMessage(content=get_next_action())
mock.invoke.side_effect = mock_invoke
# Create an async version
async def mock_ainvoke(*args, **kwargs):
return AIMessage(content=get_next_action())
mock.ainvoke.side_effect = mock_ainvoke
return mock

View File

@@ -58,9 +58,10 @@ class TestBrowserContext:
async def browser_session(self):
"""Create and provide a BrowserSession instance with security disabled."""
browser_session = BrowserSession(
# browser_profile=BrowserProfile(...),
headless=True,
user_data_dir=None,
browser_profile=BrowserProfile(
headless=True,
user_data_dir=None,
)
)
await browser_session.start()
yield browser_session
@@ -356,7 +357,7 @@ class TestBrowserContext:
assert 'simple_action' in action_model.model_fields
# Create an instance with the simple_action
action_instance = action_model(simple_action={})
action_instance = action_model(simple_action={}) # type: ignore[call-arg]
# Test that model_dump works correctly
dumped = action_instance.model_dump(exclude_unset=True)

View File

@@ -23,7 +23,7 @@ from langchain_core.messages import AIMessage
from browser_use import Agent, setup_logging
from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.browser.types import async_playwright
from tests.ci.mocks import create_mock_llm
from tests.ci.conftest import create_mock_llm
# Set up test logging
setup_logging()
@@ -147,9 +147,11 @@ class TestParallelism:
# Create a shared browser session
browser_session = BrowserSession(
headless=True,
user_data_dir=None, # Use temp directory
keep_alive=True,
browser_profile=BrowserProfile(
headless=True,
user_data_dir=None, # Use temp directory
keep_alive=True,
)
)
try:
@@ -197,9 +199,11 @@ class TestParallelism:
# Create a shared browser session
browser_session = BrowserSession(
headless=True,
user_data_dir=None, # Use temp directory
keep_alive=True,
browser_profile=BrowserProfile(
headless=True,
user_data_dir=None, # Use temp directory
keep_alive=True,
)
)
try:
@@ -400,9 +404,11 @@ class TestParallelism:
# Create shared browser session
shared_session = BrowserSession(
headless=True,
user_data_dir=None,
keep_alive=True,
browser_profile=BrowserProfile(
headless=True,
user_data_dir=None,
keep_alive=True,
)
)
try:
@@ -459,9 +465,11 @@ class TestParallelism:
# Create a session with keep_alive
session = BrowserSession(
headless=True,
user_data_dir=None,
keep_alive=True,
browser_profile=BrowserProfile(
headless=True,
user_data_dir=None,
keep_alive=True,
)
)
try:
@@ -514,13 +522,15 @@ class TestParallelism:
# Create session with existing playwright objects
browser_session = BrowserSession(
page=page,
browser_profile=BrowserProfile(
headless=True,
user_data_dir=None,
keep_alive=False,
),
agent_current_page=page,
browser_context=context,
browser=browser,
playwright=playwright,
headless=True,
user_data_dir=None,
keep_alive=False,
)
# Create mock LLM

View File

@@ -42,7 +42,12 @@ def test_replace_sensitive_data_with_missing_keys(registry, caplog):
# Set log level to capture warnings
import logging
caplog.set_level(logging.WARNING)
# Temporarily enable propagation for browser_use logger to capture logs
browser_use_logger = logging.getLogger('browser_use')
original_propagate = browser_use_logger.propagate
browser_use_logger.propagate = True
caplog.set_level(logging.WARNING, logger='browser_use.controller.registry.service')
# Create a simple Pydantic model with sensitive data placeholders
params = SensitiveParams(text='Please enter <secret>username</secret> and <secret>password</secret>')
@@ -83,13 +88,21 @@ def test_replace_sensitive_data_with_missing_keys(registry, caplog):
assert 'password' in caplog.text
caplog.clear()
# Restore original propagate setting
browser_use_logger.propagate = original_propagate
def test_simple_domain_specific_sensitive_data(registry, caplog):
"""Test the basic functionality of domain-specific sensitive data replacement"""
# Set log level to capture warnings
import logging
caplog.set_level(logging.WARNING)
# Temporarily enable propagation for browser_use logger to capture logs
browser_use_logger = logging.getLogger('browser_use')
original_propagate = browser_use_logger.propagate
browser_use_logger.propagate = True
caplog.set_level(logging.WARNING, logger='browser_use.controller.registry.service')
# Create a simple Pydantic model with sensitive data placeholders
params = SensitiveParams(text='Please enter <secret>username</secret> and <secret>password</secret>')
@@ -115,6 +128,9 @@ def test_simple_domain_specific_sensitive_data(registry, caplog):
assert 'password' in caplog.text # Only password should be logged as missing
caplog.clear()
# Restore original propagate setting
browser_use_logger.propagate = original_propagate
def test_match_url_with_domain_pattern():
"""Test that the domain pattern matching utility works correctly"""

View File

@@ -62,7 +62,7 @@ class TestBrowserSessionCookies:
@pytest.fixture
async def browser_profile_with_cookies(self, temp_cookies_file):
"""Create a BrowserProfile with cookies_file set."""
profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file=str(temp_cookies_file))
profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file=temp_cookies_file)
yield profile
@pytest.fixture
@@ -158,7 +158,7 @@ class TestBrowserSessionCookies:
async def test_nonexistent_cookies_file(self):
"""Test that browser starts normally when cookies_file doesn't exist."""
# Use a non-existent file path
profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file='/tmp/nonexistent_cookies.json')
profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file=Path('/tmp/nonexistent_cookies.json'))
session = BrowserSession(browser_profile=profile)
# Should start without errors
@@ -176,7 +176,7 @@ class TestBrowserSessionCookies:
invalid_file = tmp_path / 'invalid_cookies.json'
invalid_file.write_text('not valid json')
profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file=str(invalid_file))
profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file=invalid_file)
session = BrowserSession(browser_profile=profile)
# Should start without errors (warning logged)
@@ -194,7 +194,7 @@ class TestBrowserSessionCookies:
profile = BrowserProfile(
headless=True,
user_data_dir=None,
cookies_file='./test_cookies.json', # Relative path
cookies_file=Path('./test_cookies.json'), # Relative path
downloads_path=browser_profile_with_cookies.downloads_path,
)

View File

@@ -6,6 +6,7 @@ import time
import pytest
from browser_use.browser import BrowserSession
from browser_use.browser.profile import BrowserProfile
@pytest.fixture(scope='function')
@@ -37,9 +38,11 @@ async def test_download_detection_timing(test_server, tmp_path):
# Test 1: With downloads_dir set (default behavior)
browser_with_downloads = BrowserSession(
headless=True,
downloads_dir=str(tmp_path / 'downloads'),
user_data_dir=None,
browser_profile=BrowserProfile(
headless=True,
downloads_path=str(tmp_path / 'downloads'),
user_data_dir=None,
)
)
await browser_with_downloads.start()
@@ -72,9 +75,11 @@ async def test_download_detection_timing(test_server, tmp_path):
# Test 2: With downloads_dir set to empty string (disables download detection)
browser_no_downloads = BrowserSession(
headless=True,
downloads_dir=None,
user_data_dir=None,
browser_profile=BrowserProfile(
headless=True,
downloads_path=None,
user_data_dir=None,
)
)
await browser_no_downloads.start()
@@ -124,9 +129,11 @@ async def test_actual_download_detection(test_server, tmp_path):
downloads_path.mkdir()
browser_session = BrowserSession(
headless=True,
downloads_path=str(downloads_path),
user_data_dir=None,
browser_profile=BrowserProfile(
headless=True,
downloads_path=str(downloads_path),
user_data_dir=None,
)
)
await browser_session.start()

View File

@@ -3,11 +3,10 @@ Systematic debugging of the selector map issue.
Test each assumption step by step to isolate the problem.
"""
import os
import pytest
from browser_use.browser import BrowserSession
from browser_use.browser.profile import BrowserProfile
from browser_use.controller.service import Controller
@@ -62,9 +61,10 @@ def httpserver(make_httpserver):
async def browser_session():
"""Create a real browser session for testing."""
session = BrowserSession(
executable_path=os.getenv('BROWSER_PATH'),
user_data_dir=None, # Use temporary profile
headless=True,
browser_profile=BrowserProfile(
user_data_dir=None, # Use temporary profile
headless=True,
)
)
async with session:
yield session
@@ -356,7 +356,9 @@ async def test_assumption_9_pydantic_private_attrs(browser_session, controller,
# Check the browser_session that comes out of the model
extracted_browser_session = special_params.browser_session
print(f'5. Extracted browser_session ID: {id(extracted_browser_session)}')
print(f'6. Extracted browser_session cache: {extracted_browser_session._cached_browser_state_summary is not None}')
print(
f'6. Extracted browser_session cache: {extracted_browser_session._cached_browser_state_summary is not None if extracted_browser_session else False}'
)
# Check if they're the same object
if id(browser_session) == id(extracted_browser_session):
@@ -367,10 +369,10 @@ async def test_assumption_9_pydantic_private_attrs(browser_session, controller,
# Check if private attributes were preserved
print(f'7. Original has _cached_browser_state_summary attr: {hasattr(browser_session, "_cached_browser_state_summary")}')
print(
f'8. Extracted has _cached_browser_state_summary attr: {hasattr(extracted_browser_session, "_cached_browser_state_summary")}'
f'8. Extracted has _cached_browser_state_summary attr: {hasattr(extracted_browser_session, "_cached_browser_state_summary") if extracted_browser_session else False}'
)
if hasattr(extracted_browser_session, '_cached_browser_state_summary'):
if extracted_browser_session and hasattr(extracted_browser_session, '_cached_browser_state_summary'):
print(f'9. Extracted _cached_browser_state_summary value: {extracted_browser_session._cached_browser_state_summary}')
@@ -401,8 +403,8 @@ async def test_assumption_7_cache_gets_cleared(browser_session, controller, http
from browser_use import ActionResult
cache_exists = browser_session._cached_browser_state_summary is not None
if cache_exists:
cache_size = len(browser_session._cached_browser_state_summary.selector_map)
if cache_exists and browser_session._cached_browser_state_summary:
cache_size = len(browser_session._cached_browser_state_summary.selector_map) # type: ignore
else:
cache_size = 0
return ActionResult(
@@ -415,8 +417,8 @@ async def test_assumption_7_cache_gets_cleared(browser_session, controller, http
from browser_use import ActionResult
cache_exists = browser_session._cached_browser_state_summary is not None
if cache_exists:
cache_size = len(browser_session._cached_browser_state_summary.selector_map)
if cache_exists and browser_session._cached_browser_state_summary:
cache_size = len(browser_session._cached_browser_state_summary.selector_map) # type: ignore
else:
cache_size = 0
return ActionResult(

View File

@@ -11,6 +11,7 @@ Tests cover common real-world file upload patterns:
import pytest
from pytest_httpserver import HTTPServer
from browser_use.browser.profile import BrowserProfile
from browser_use.browser.session import BrowserSession
@@ -20,7 +21,7 @@ class TestBrowserSessionFileUploads:
@pytest.fixture
async def browser_session(self):
"""Create a BrowserSession instance for testing."""
session = BrowserSession(headless=True, user_data_dir=None, keep_alive=True)
session = BrowserSession(browser_profile=BrowserProfile(headless=True, user_data_dir=None, keep_alive=True))
yield session
await session.kill()

View File

@@ -10,7 +10,7 @@ import pytest
from browser_use import Agent, AgentHistoryList
from browser_use.browser import BrowserProfile, BrowserSession
from tests.ci.mocks import create_mock_llm
from tests.ci.conftest import create_mock_llm
@pytest.fixture
@@ -194,6 +194,7 @@ class TestAgentRecordings:
for gif in gif_files:
gif.unlink()
else: # custom_path
assert expected_gif_path is not None, 'expected_gif_path should be set for custom_path'
assert expected_gif_path.exists(), f'GIF was not created at {expected_gif_path}'
finally:
await browser_session.stop()
@@ -216,10 +217,10 @@ class TestBrowserProfileRecordings:
video_dir = test_dir / f'videos_{context_type}_{alias}'
user_data_dir = None if context_type == 'incognito' else str(test_dir / 'user_data')
# Create profile with dynamic alias
profile_kwargs = {'headless': True, 'disable_security': True, 'user_data_dir': user_data_dir, alias: str(video_dir)}
browser_session = BrowserSession(
browser_profile=BrowserProfile(
headless=True, disable_security=True, user_data_dir=user_data_dir, **{alias: str(video_dir)}
)
browser_profile=BrowserProfile(**profile_kwargs) # type: ignore
)
await browser_session.start()
try:
@@ -258,7 +259,10 @@ class TestBrowserProfileRecordings:
browser_session = BrowserSession(
browser_profile=BrowserProfile(
headless=True, disable_security=True, user_data_dir=user_data_dir, **{alias: str(har_path)}
headless=True,
disable_security=True,
user_data_dir=user_data_dir,
**{alias: str(har_path)}, # type: ignore
)
)
await browser_session.start()
@@ -307,7 +311,7 @@ class TestBrowserProfileRecordings:
if alias == 'trace_path':
browser_session.browser_profile.traces_dir = str(trace_dir)
else:
setattr(browser_session.browser_profile, alias, str(trace_dir))
setattr(browser_session.browser_profile, alias, str(trace_dir)) # type: ignore
await browser_session.start()
try:

View File

@@ -12,18 +12,19 @@ Tests cover:
import asyncio
import json
import logging
import tempfile
from pathlib import Path
import pytest
from browser_use.browser.profile import (
BROWSERUSE_CHROMIUM_USER_DATA_DIR,
BROWSERUSE_DEFAULT_CHANNEL,
BrowserChannel,
BrowserProfile,
)
from browser_use.browser.session import BrowserSession
from tests.ci.mocks import create_mock_llm
from browser_use.config import CONFIG
from tests.ci.conftest import create_mock_llm
# Set up test logging
logger = logging.getLogger('browser_session_start_tests')
@@ -485,21 +486,30 @@ class TestBrowserSessionStart:
await session.stop()
# Browser should still be connected
assert session.initialized is True
assert session.browser is not None
assert session.browser.is_connected()
assert session.browser_context and session.browser_context.pages[0]
finally:
await session.kill()
async def test_user_data_dir_not_allowed_to_corrupt_default_profile(self, caplog):
"""Test user_data_dir handling for different browser channels and version mismatches."""
import logging
# Temporarily enable propagation for browser_use logger to capture logs
browser_use_logger = logging.getLogger('browser_use')
original_propagate = browser_use_logger.propagate
browser_use_logger.propagate = True
caplog.set_level(logging.WARNING, logger='browser_use.utils')
# Test 1: Chromium with default user_data_dir and default channel should work fine
session = BrowserSession(
headless=True,
user_data_dir=BROWSERUSE_CHROMIUM_USER_DATA_DIR,
channel=BROWSERUSE_DEFAULT_CHANNEL, # chromium
keep_alive=False,
browser_profile=BrowserProfile(
headless=True,
user_data_dir=CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR,
channel=BROWSERUSE_DEFAULT_CHANNEL, # chromium
keep_alive=False,
),
)
try:
@@ -507,21 +517,21 @@ class TestBrowserSessionStart:
assert session.initialized is True
assert session.browser_context is not None
# Verify the user_data_dir wasn't changed
assert session.browser_profile.user_data_dir == BROWSERUSE_CHROMIUM_USER_DATA_DIR
assert session.browser_profile.user_data_dir == CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR
finally:
await session.kill()
# Test 2: Chrome with default user_data_dir should show warning and change dir
profile2 = BrowserProfile(
headless=True,
user_data_dir=BROWSERUSE_CHROMIUM_USER_DATA_DIR,
user_data_dir=CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR,
channel=BrowserChannel.CHROME,
keep_alive=False,
)
# The validator should have changed the user_data_dir
assert profile2.user_data_dir != BROWSERUSE_CHROMIUM_USER_DATA_DIR
assert profile2.user_data_dir == BROWSERUSE_CHROMIUM_USER_DATA_DIR.parent / 'default-chrome'
assert profile2.user_data_dir != CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR
assert profile2.user_data_dir == CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR.parent / 'default-chrome'
# Check warning was logged
warning_found = any(
@@ -529,6 +539,9 @@ class TestBrowserSessionStart:
)
assert warning_found, 'Expected warning about changing user_data_dir was not found'
# Restore original propagate setting
browser_use_logger.propagate = original_propagate
# only run if `/Applications/Brave Browser.app` is installed
@pytest.mark.skipif(
not Path('~/.config/browseruse/profiles/stealth').expanduser().exists(), reason='Brave Browser not installed'
@@ -546,9 +559,11 @@ class TestBrowserSessionStart:
# await brave_session.stop()
chromium_session = BrowserSession(
headless=True,
user_data_dir='~/.config/browseruse/profiles/stealth',
channel=BrowserChannel.CHROMIUM, # should crash when opened with chromium
browser_profile=BrowserProfile(
headless=True,
user_data_dir='~/.config/browseruse/profiles/stealth',
channel=BrowserChannel.CHROMIUM, # should crash when opened with chromium
),
)
# open chrome with corrupted user_data_dir
@@ -559,53 +574,6 @@ class TestBrowserSessionStart:
class TestBrowserSessionReusePatterns:
"""Tests for all browser re-use patterns documented in docs/customize/real-browser.mdx"""
@pytest.fixture(scope='module')
def mock_llm(self):
"""Mock LLM for agent tests"""
from unittest.mock import MagicMock
from langchain_core.language_models.chat_models import BaseChatModel
# Create a MagicMock that supports dictionary-style access
mock = MagicMock(spec=BaseChatModel)
# Skip verification by setting these attributes
mock._verified_api_keys = True
mock._verified_tool_calling_method = 'raw'
mock.model_name = 'mock-llm'
# Mock the invoke method to return a proper response
def mock_invoke(*args, **kwargs):
response = MagicMock()
# Return a valid JSON response that completes the task
response.content = """
{
"thinking": "null",
"evaluation_previous_goal": "Starting the task",
"memory": "Task started",
"next_goal": "Complete the task",
"action": [
{
"done": {
"text": "Task completed successfully",
"success": true
}
}
]
}
"""
return response
mock.invoke = mock_invoke
# Create an async version of the mock_invoke
async def mock_ainvoke(*args, **kwargs):
return mock_invoke(*args, **kwargs)
mock.ainvoke = mock_ainvoke
return mock
async def test_sequential_agents_same_profile_different_browser(self, mock_llm):
"""Test Sequential Agents, Same Profile, Different Browser pattern"""
from browser_use import Agent
@@ -652,9 +620,11 @@ class TestBrowserSessionReusePatterns:
# Create a reusable session with keep_alive
reused_session = BrowserSession(
user_data_dir=None, # Use temp dir for testing
headless=True,
keep_alive=True, # Don't close browser after agent.run()
browser_profile=BrowserProfile(
user_data_dir=None, # Use temp dir for testing
headless=True,
keep_alive=True, # Don't close browser after agent.run()
),
)
try:
@@ -695,7 +665,6 @@ class TestBrowserSessionReusePatterns:
async def test_parallel_agents_same_browser_multiple_tabs(self, httpserver):
"""Test Parallel Agents, Same Browser, Multiple Tabs pattern"""
import tempfile
from browser_use import Agent, BrowserSession
@@ -711,10 +680,12 @@ class TestBrowserSessionReusePatterns:
storage_state_path = Path(storage_state_path)
shared_browser = BrowserSession(
storage_state=storage_state_path,
user_data_dir=None,
keep_alive=True,
headless=True,
browser_profile=BrowserProfile(
storage_state=storage_state_path,
user_data_dir=None,
keep_alive=True,
headless=True,
),
)
try:
@@ -792,7 +763,7 @@ class TestBrowserSessionReusePatterns:
)
# Run all agents in parallel
results = await asyncio.gather(agent1.run(), agent2.run(), agent3.run())
_results = await asyncio.gather(agent1.run(), agent2.run(), agent3.run())
# Verify all agents used the same browser session (using __eq__ to check browser_pid, cdp_url, wss_url)
# Debug: print the browser sessions to see what's different
@@ -826,9 +797,11 @@ class TestBrowserSessionReusePatterns:
# Create a browser session and start it first
shared_browser = BrowserSession(
user_data_dir=None,
headless=True,
keep_alive=True, # Keep the browser alive for reuse
browser_profile=BrowserProfile(
user_data_dir=None,
headless=True,
keep_alive=True, # Keep the browser alive for reuse
),
)
try:
@@ -857,7 +830,7 @@ class TestBrowserSessionReusePatterns:
await page.goto(httpserver.url_for('/'), wait_until='domcontentloaded')
# Run agents in parallel (may interfere with each other)
results = await asyncio.gather(agent1.run(), agent2.run(), return_exceptions=True)
_results = await asyncio.gather(agent1.run(), agent2.run(), return_exceptions=True)
# Verify both agents used the same browser session
assert agent1.browser_session == agent2.browser_session
@@ -869,7 +842,6 @@ class TestBrowserSessionReusePatterns:
async def test_parallel_agents_same_profile_different_browsers(self, mock_llm):
"""Test Parallel Agents, Same Profile, Different Browsers pattern (recommended)"""
import tempfile
from browser_use import Agent
from browser_use.browser import BrowserProfile, BrowserSession
@@ -907,7 +879,7 @@ class TestBrowserSessionReusePatterns:
)
# Run agents in parallel
results = await asyncio.gather(agent1.run(), agent2.run())
_results = await asyncio.gather(agent1.run(), agent2.run())
# Verify different browser sessions were used
assert agent1.browser_session is not agent2.browser_session
@@ -933,3 +905,121 @@ class TestBrowserSessionReusePatterns:
await window1.kill()
await window2.kill()
auth_json_path.unlink(missing_ok=True)
async def test_browser_shutdown_isolated(self):
"""Test that browser shutdown doesnt affect other browser_sessions"""
from browser_use import BrowserSession
browser_session1 = BrowserSession(
browser_profile=BrowserProfile(
user_data_dir=None,
headless=True,
keep_alive=True, # Keep the browser alive for reuse
),
)
browser_session2 = BrowserSession(
browser_profile=BrowserProfile(
user_data_dir=None,
headless=True,
keep_alive=True, # Keep the browser alive for reuse
),
)
await browser_session1.start()
await browser_session2.start()
assert browser_session1.is_connected()
assert browser_session2.is_connected()
assert browser_session1.browser_context != browser_session2.browser_context
await browser_session1.create_new_tab('chrome://version')
await browser_session2.create_new_tab('chrome://settings')
await browser_session2.kill()
# ensure that the browser_session1 is still connected and unaffected by the kill of browser_session2
assert browser_session1.is_connected()
assert browser_session1.browser_context is not None
await browser_session1.create_new_tab('chrome://settings')
await browser_session1.browser_context.pages[0].evaluate('alert(1)')
await browser_session1.kill()
async def test_many_parallel_browser_sessions(self):
"""Test spawning 20 parallel browser_sessions with different settings and ensure they all work"""
from browser_use import BrowserSession
browser_sessions = []
for i in range(5):
browser_sessions.append(
BrowserSession(
browser_profile=BrowserProfile(
user_data_dir=None,
headless=True,
keep_alive=True,
),
)
)
for i in range(5):
browser_sessions.append(
BrowserSession(
browser_profile=BrowserProfile(
user_data_dir=Path(tempfile.mkdtemp(prefix=f'browseruse-tmp-{i}')),
headless=True,
keep_alive=True,
),
)
)
for i in range(5):
browser_sessions.append(
BrowserSession(
browser_profile=BrowserProfile(
user_data_dir=None,
headless=True,
keep_alive=False,
),
)
)
for i in range(5):
browser_sessions.append(
BrowserSession(
browser_profile=BrowserProfile(
user_data_dir=Path(tempfile.mkdtemp(prefix=f'browseruse-tmp-{i}')),
headless=True,
keep_alive=False,
),
)
)
await asyncio.gather(*[browser_session.start() for browser_session in browser_sessions])
# ensure all are connected and usable
new_tab_tasks = []
for browser_session in browser_sessions:
assert await browser_session.is_connected()
assert browser_session.browser_context is not None
new_tab_tasks.append(browser_session.create_new_tab('chrome://version'))
await asyncio.gather(*new_tab_tasks)
# kill every 3rd browser_session
kill_tasks = []
for i in range(0, len(browser_sessions), 3):
kill_tasks.append(browser_sessions[i].kill())
browser_sessions[i] = None
await asyncio.gather(*kill_tasks)
# ensure the remaining browser_sessions are still connected and usable
new_tab_tasks = []
screenshot_tasks = []
for browser_session in filter(bool, browser_sessions):
assert await browser_session.is_connected()
assert browser_session.browser_context is not None
new_tab_tasks.append(browser_session.create_new_tab('chrome://version'))
screenshot_tasks.append(browser_session.take_screenshot())
await asyncio.gather(*new_tab_tasks)
await asyncio.gather(*screenshot_tasks)
kill_tasks = []
for browser_session in filter(bool, browser_sessions):
kill_tasks.append(browser_session.kill())
await asyncio.gather(*kill_tasks)

View File

@@ -8,6 +8,7 @@ from pytest_httpserver import HTTPServer
load_dotenv()
from browser_use.agent.views import ActionModel
from browser_use.browser.profile import BrowserProfile
from browser_use.browser.session import BrowserSession
from browser_use.controller.service import Controller
@@ -50,9 +51,11 @@ def base_url(http_server):
async def browser_session(base_url):
"""Create and provide a BrowserSession instance with a properly initialized tab."""
browser_session = BrowserSession(
user_data_dir=None,
headless=True,
keep_alive=True,
browser_profile=BrowserProfile(
user_data_dir=None,
headless=True,
keep_alive=True,
)
)
await browser_session.start()
@@ -110,8 +113,9 @@ class TestTabManagement:
browser_session.agent_current_page = None
# close all existing tabs
for page in browser_session.browser_context.pages:
await page.close()
if browser_session.browser_context:
for page in browser_session.browser_context.pages: # type: ignore
await page.close()
await asyncio.sleep(0.5)
@@ -327,22 +331,22 @@ class TestTabManagement:
assert browser_session.browser_context is not None
assert browser_session.browser_context != original_context
assert browser_session.initialized is True
assert browser_session.is_connected() is True
assert (await browser_session.is_connected()) is True
async def test_concurrent_context_access_during_closure(self, browser_session):
"""Test concurrent access to browser context during closure"""
# logger.info('Testing concurrent context access during closure')
await browser_session.start()
assert browser_session.is_connected() is True
assert (await browser_session.is_connected()) is True
# Create a barrier to synchronize operations
barrier = asyncio.Barrier(3)
async def close_context():
await barrier.wait()
await browser_session.browser_context.browser.close()
assert browser_session.is_connected() is False
await browser_session.browser_context.close()
assert (await browser_session.is_connected()) is False
return 'closed'
async def access_pages():
@@ -356,14 +360,14 @@ class TestTabManagement:
async def check_connection():
await barrier.wait()
await asyncio.sleep(0.01) # Small delay to let close start
connected = browser_session.is_connected()
connected = await browser_session.is_connected()
return f'connected: {connected}'
# Run all operations concurrently
results = await asyncio.gather(close_context(), access_pages(), check_connection(), return_exceptions=True)
results = list(await asyncio.gather(close_context(), access_pages(), check_connection(), return_exceptions=True))
# All operations should complete without crashes
assert all(not isinstance(r, Exception) for r in results)
assert results and all(not isinstance(r, Exception) for r in results)
assert 'closed' in results
await browser_session.kill()

View File

@@ -1,14 +1,17 @@
import pytest
from browser_use.browser import BrowserSession
from browser_use.browser.profile import BrowserProfile
from browser_use.browser.types import async_playwright
async def test_connection_via_cdp():
browser_session = BrowserSession(
cdp_url='http://localhost:9898',
headless=True,
keep_alive=True,
browser_profile=BrowserProfile(
headless=True,
keep_alive=True,
),
)
with pytest.raises(Exception) as e:
await browser_session.start()

View File

@@ -1,5 +1,5 @@
from browser_use.browser import BrowserSession
from browser_use.browser.profile import ProxySettings
from browser_use.browser.profile import BrowserProfile, ProxySettings
async def test_proxy_settings_pydantic_model():
@@ -33,11 +33,13 @@ async def test_window_size_with_real_browser():
"""
# Create browser profile with headless mode and specific dimensions
browser_session = BrowserSession(
user_data_dir=None,
headless=True, # window size gets converted to viewport size in headless mode
window_size={'width': 999, 'height': 888},
maximum_wait_page_load_time=2.0,
minimum_wait_page_load_time=0.2,
browser_profile=BrowserProfile(
user_data_dir=None,
headless=True, # window size gets converted to viewport size in headless mode
window_size={'width': 999, 'height': 888},
maximum_wait_page_load_time=2.0,
minimum_wait_page_load_time=0.2,
)
)
await browser_session.start()
page = await browser_session.get_current_page()
@@ -117,9 +119,11 @@ async def test_proxy_with_real_browser():
# Create browser session
browser_session = BrowserSession(
headless=True,
proxy=proxy_settings,
user_data_dir=None,
browser_profile=BrowserProfile(
headless=True,
proxy=proxy_settings,
user_data_dir=None,
)
)
await browser_session.start()
# Success - the browser was initialized with our proxy settings

120
tests/ci/test_config.py Normal file
View File

@@ -0,0 +1,120 @@
"""Tests for lazy loading configuration system."""
import os
from browser_use.config import CONFIG
class TestLazyConfig:
"""Test lazy loading of environment variables through CONFIG object."""
def test_config_reads_env_vars_lazily(self):
"""Test that CONFIG reads environment variables each time they're accessed."""
# Set an env var
original_value = os.environ.get('BROWSER_USE_LOGGING_LEVEL', '')
try:
os.environ['BROWSER_USE_LOGGING_LEVEL'] = 'debug'
assert CONFIG.BROWSER_USE_LOGGING_LEVEL == 'debug'
# Change the env var
os.environ['BROWSER_USE_LOGGING_LEVEL'] = 'info'
assert CONFIG.BROWSER_USE_LOGGING_LEVEL == 'info'
# Delete the env var to test default
del os.environ['BROWSER_USE_LOGGING_LEVEL']
assert CONFIG.BROWSER_USE_LOGGING_LEVEL == 'info' # default value
finally:
# Restore original value
if original_value:
os.environ['BROWSER_USE_LOGGING_LEVEL'] = original_value
else:
os.environ.pop('BROWSER_USE_LOGGING_LEVEL', None)
def test_boolean_env_vars(self):
"""Test boolean environment variables are parsed correctly."""
original_value = os.environ.get('ANONYMIZED_TELEMETRY', '')
try:
# Test true values
for true_val in ['true', 'True', 'TRUE', 'yes', 'Yes', '1']:
os.environ['ANONYMIZED_TELEMETRY'] = true_val
assert CONFIG.ANONYMIZED_TELEMETRY is True, f'Failed for value: {true_val}'
# Test false values
for false_val in ['false', 'False', 'FALSE', 'no', 'No', '0']:
os.environ['ANONYMIZED_TELEMETRY'] = false_val
assert CONFIG.ANONYMIZED_TELEMETRY is False, f'Failed for value: {false_val}'
finally:
if original_value:
os.environ['ANONYMIZED_TELEMETRY'] = original_value
else:
os.environ.pop('ANONYMIZED_TELEMETRY', None)
def test_api_keys_lazy_loading(self):
"""Test API keys are loaded lazily."""
original_value = os.environ.get('OPENAI_API_KEY', '')
try:
# Test empty default
os.environ.pop('OPENAI_API_KEY', None)
assert CONFIG.OPENAI_API_KEY == ''
# Set a value
os.environ['OPENAI_API_KEY'] = 'test-key-123'
assert CONFIG.OPENAI_API_KEY == 'test-key-123'
# Change the value
os.environ['OPENAI_API_KEY'] = 'new-key-456'
assert CONFIG.OPENAI_API_KEY == 'new-key-456'
finally:
if original_value:
os.environ['OPENAI_API_KEY'] = original_value
else:
os.environ.pop('OPENAI_API_KEY', None)
def test_path_configuration(self):
"""Test path configuration variables."""
original_value = os.environ.get('XDG_CACHE_HOME', '')
try:
# Test custom path
test_path = '/tmp/test-cache'
os.environ['XDG_CACHE_HOME'] = test_path
# Use Path().resolve() to handle symlinks (e.g., /tmp -> /private/tmp on macOS)
from pathlib import Path
assert CONFIG.XDG_CACHE_HOME == Path(test_path).resolve()
# Test default path expansion
os.environ.pop('XDG_CACHE_HOME', None)
assert '/.cache' in str(CONFIG.XDG_CACHE_HOME)
finally:
if original_value:
os.environ['XDG_CACHE_HOME'] = original_value
else:
os.environ.pop('XDG_CACHE_HOME', None)
def test_cloud_sync_inherits_telemetry(self):
"""Test BROWSER_USE_CLOUD_SYNC inherits from ANONYMIZED_TELEMETRY when not set."""
telemetry_original = os.environ.get('ANONYMIZED_TELEMETRY', '')
sync_original = os.environ.get('BROWSER_USE_CLOUD_SYNC', '')
try:
# When BROWSER_USE_CLOUD_SYNC is not set, it should inherit from ANONYMIZED_TELEMETRY
os.environ['ANONYMIZED_TELEMETRY'] = 'true'
os.environ.pop('BROWSER_USE_CLOUD_SYNC', None)
assert CONFIG.BROWSER_USE_CLOUD_SYNC is True
os.environ['ANONYMIZED_TELEMETRY'] = 'false'
os.environ.pop('BROWSER_USE_CLOUD_SYNC', None)
assert CONFIG.BROWSER_USE_CLOUD_SYNC is False
# When explicitly set, it should use its own value
os.environ['ANONYMIZED_TELEMETRY'] = 'false'
os.environ['BROWSER_USE_CLOUD_SYNC'] = 'true'
assert CONFIG.BROWSER_USE_CLOUD_SYNC is True
finally:
if telemetry_original:
os.environ['ANONYMIZED_TELEMETRY'] = telemetry_original
else:
os.environ.pop('ANONYMIZED_TELEMETRY', None)
if sync_original:
os.environ['BROWSER_USE_CLOUD_SYNC'] = sync_original
else:
os.environ.pop('BROWSER_USE_CLOUD_SYNC', None)

View File

@@ -8,6 +8,7 @@ from pytest_httpserver import HTTPServer
from browser_use.agent.views import ActionModel, ActionResult
from browser_use.browser import BrowserSession
from browser_use.browser.profile import BrowserProfile
from browser_use.controller.service import Controller
from browser_use.controller.views import (
ClickElementAction,
@@ -79,9 +80,10 @@ def base_url(http_server):
async def browser_session():
"""Create and provide a Browser instance with security disabled."""
browser_session = BrowserSession(
# browser_profile=BrowserProfile(),
headless=True,
user_data_dir=None,
browser_profile=BrowserProfile(
headless=True,
user_data_dir=None,
)
)
await browser_session.start()
yield browser_session
@@ -113,6 +115,7 @@ class TestControllerIntegration:
# Verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert f'Navigated to {base_url}/page1' in result.extracted_content
# Verify the current page URL
@@ -140,6 +143,7 @@ class TestControllerIntegration:
# Verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Scrolled down' in result.extracted_content
# Create scroll up action
@@ -153,6 +157,7 @@ class TestControllerIntegration:
# Verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Scrolled up' in result.extracted_content
async def test_registry_actions(self, controller, browser_session):
@@ -208,6 +213,7 @@ class TestControllerIntegration:
# Verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Custom action executed with: test_value on' in result.extracted_content
assert f'{base_url}/page1' in result.extracted_content
@@ -262,6 +268,7 @@ class TestControllerIntegration:
result = await controller.act(InputTextActionModel(**input_action), browser_session)
# If successful, verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Input' in result.extracted_content
except Exception as e:
# If it fails due to DOM issues, that's expected in a test environment
@@ -353,6 +360,7 @@ class TestControllerIntegration:
# Verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Navigated back' in result.extracted_content
# Add another delay to allow the navigation to complete
@@ -475,6 +483,7 @@ class TestControllerIntegration:
# Verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Searched for "Python web automation" in Google' in result.extracted_content
# For our test purposes, we just verify we're on some URL
@@ -508,6 +517,7 @@ class TestControllerIntegration:
# Verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert success_done_message in result.extracted_content
assert result.success is True
assert result.is_done is True
@@ -523,6 +533,7 @@ class TestControllerIntegration:
# Verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert failed_done_message in result.extracted_content
assert result.success is False
assert result.is_done is True
@@ -718,6 +729,10 @@ class TestControllerIntegration:
drag_action = {
'drag_drop': DragDropAction(
# Use the coordinate-based approach
element_source=None,
element_target=None,
element_source_offset=None,
element_target_offset=None,
coord_source_x=element_info['source']['x'],
coord_source_y=element_info['source']['y'],
coord_target_x=element_info['target']['x'],
@@ -736,6 +751,7 @@ class TestControllerIntegration:
# Step 5: Verify the controller action result
assert result.error is None, f'Drag operation failed with error: {result.error}'
assert result.is_done is False
assert result.extracted_content is not None
assert '🖱️ Dragged from' in result.extracted_content
# Step 6: Verify the element was moved by checking its new parent
@@ -827,7 +843,8 @@ class TestControllerIntegration:
# Verify navigation result
assert isinstance(goto_result, ActionResult)
assert f'Navigated to {base_url}/keyboard' in goto_result.extracted_content
assert goto_result.extracted_content is not None
assert goto_result.extracted_content is not None and f'Navigated to {base_url}/keyboard' in goto_result.extracted_content
assert goto_result.error is None
assert goto_result.is_done is False
@@ -853,7 +870,8 @@ class TestControllerIntegration:
# Verify Tab action result
assert isinstance(tab_result, ActionResult)
assert 'Sent keys: Tab' in tab_result.extracted_content
assert tab_result.extracted_content is not None
assert tab_result.extracted_content is not None and 'Sent keys: Tab' in tab_result.extracted_content
assert tab_result.error is None
assert tab_result.is_done is False
@@ -873,7 +891,8 @@ class TestControllerIntegration:
# Verify typing action result
assert isinstance(type_result, ActionResult)
assert f'Sent keys: {test_text}' in type_result.extracted_content
assert type_result.extracted_content is not None
assert type_result.extracted_content is not None and f'Sent keys: {test_text}' in type_result.extracted_content
assert type_result.error is None
assert type_result.is_done is False
@@ -894,7 +913,11 @@ class TestControllerIntegration:
# Verify select all action result
assert isinstance(select_all_result, ActionResult)
assert 'Sent keys: ControlOrMeta+a' in select_all_result.extracted_content
assert select_all_result.extracted_content is not None
assert (
select_all_result.extracted_content is not None
and 'Sent keys: ControlOrMeta+a' in select_all_result.extracted_content
)
assert select_all_result.error is None
# Verify selection length matches the text length
@@ -915,7 +938,8 @@ class TestControllerIntegration:
# Verify second Tab action result
assert isinstance(tab_result2, ActionResult)
assert 'Sent keys: Tab' in tab_result2.extracted_content
assert tab_result2.extracted_content is not None
assert tab_result2.extracted_content is not None and 'Sent keys: Tab' in tab_result2.extracted_content
assert tab_result2.error is None
# Verify we moved to the textarea
@@ -933,7 +957,10 @@ class TestControllerIntegration:
# Verify textarea typing action result
assert isinstance(textarea_result, ActionResult)
assert f'Sent keys: {textarea_text}' in textarea_result.extracted_content
assert textarea_result.extracted_content is not None
assert (
textarea_result.extracted_content is not None and f'Sent keys: {textarea_text}' in textarea_result.extracted_content
)
assert textarea_result.error is None
assert textarea_result.is_done is False
@@ -1038,6 +1065,7 @@ class TestControllerIntegration:
assert isinstance(result, ActionResult)
# Core logic validation: Verify all options are returned
assert result.extracted_content is not None
for option in expected_options[1:]: # Skip the placeholder option
assert option['text'] in result.extracted_content, f"Option '{option['text']}' not found in result content"
@@ -1135,6 +1163,7 @@ class TestControllerIntegration:
assert isinstance(result, ActionResult)
# Core logic validation: Verify selection was successful
assert result.extracted_content is not None
assert 'selected option' in result.extracted_content.lower()
assert 'Second Option' in result.extracted_content
@@ -1223,26 +1252,32 @@ class TestControllerIntegration:
expected_result_text = 'Button 1 clicked'
# Verify the button text matches what we expect
assert expected_button_text in button_text, f"Expected button text '{expected_button_text}' not found in '{button_text}'"
assert button_text is not None and expected_button_text in button_text, (
f"Expected button text '{expected_button_text}' not found in '{button_text}'"
)
# Create a model for the click_element_by_index action
class ClickElementActionModel(ActionModel):
click_element_by_index: ClickElementAction | None = None
# Execute the action with the button index
result = await controller.act(ClickElementActionModel(click_element_by_index={'index': button_index}), browser_session)
result = await controller.act(
ClickElementActionModel(click_element_by_index=ClickElementAction(index=button_index)), browser_session
)
# Verify the result structure
assert isinstance(result, ActionResult), 'Result should be an ActionResult instance'
assert result.error is None, f'Expected no error but got: {result.error}'
# Core logic validation: Verify click was successful
assert result.extracted_content is not None
assert f'Clicked button with index {button_index}' in result.extracted_content, (
f'Expected click confirmation in result content, got: {result.extracted_content}'
)
assert button_text in result.extracted_content, (
f"Button text '{button_text}' not found in result content: {result.extracted_content}"
)
if button_text:
assert result.extracted_content is not None and button_text in result.extracted_content, (
f"Button text '{button_text}' not found in result content: {result.extracted_content}"
)
# Verify the click actually had an effect on the page
result_text = await page.evaluate("document.getElementById('result').textContent")

View File

@@ -20,6 +20,7 @@ from pytest_httpserver.httpserver import HandlerType
from browser_use.agent.views import ActionResult
from browser_use.browser import BrowserSession
from browser_use.browser.profile import BrowserProfile
from browser_use.browser.types import Page
from browser_use.controller.registry.service import Registry
from browser_use.controller.registry.views import ActionModel as BaseActionModel
@@ -29,7 +30,7 @@ from browser_use.controller.views import (
NoParamsAction,
SearchGoogleAction,
)
from tests.ci.mocks import create_mock_llm
from tests.ci.conftest import create_mock_llm
# Configure logging
logging.basicConfig(level=logging.DEBUG)
@@ -96,8 +97,10 @@ def registry():
async def browser_session(base_url):
"""Create a real BrowserSession for testing"""
browser_session = BrowserSession(
headless=True,
user_data_dir=None,
browser_profile=BrowserProfile(
headless=True,
user_data_dir=None,
)
)
await browser_session.start()
await browser_session.create_new_tab(f'{base_url}/test')
@@ -119,6 +122,7 @@ class TestActionRegistryParameterPatterns:
result = await registry.execute_action('simple_action', {'text': 'hello', 'number': 42})
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Text: hello, Number: 42' in result.extracted_content
async def test_individual_parameters_with_browser(self, registry, browser_session, base_url):
@@ -136,6 +140,7 @@ class TestActionRegistryParameterPatterns:
result = await registry.execute_action('action_with_browser', {'text': 'hello'}, browser_session=browser_session)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Text: hello, URL:' in result.extracted_content
assert base_url in result.extracted_content
@@ -154,6 +159,7 @@ class TestActionRegistryParameterPatterns:
result = await registry.execute_action('action_with_page', {'text': 'hello'}, browser_session=browser_session)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Text: hello, Page Title: Test Page' in result.extracted_content
async def test_pydantic_model_with_page_parameter(self, registry, browser_session, base_url):
@@ -173,6 +179,7 @@ class TestActionRegistryParameterPatterns:
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Text: test, Number: 100, Page Title: Test Page' in result.extracted_content
async def test_pydantic_model_parameters(self, registry, browser_session, base_url):
@@ -194,6 +201,7 @@ class TestActionRegistryParameterPatterns:
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Text: test, Number: 100, Flag: True' in result.extracted_content
assert base_url in result.extracted_content
@@ -229,6 +237,7 @@ class TestActionRegistryParameterPatterns:
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Text: hello' in result.extracted_content
assert base_url in result.extracted_content
# The mock LLM returns a JSON response
@@ -248,6 +257,7 @@ class TestActionRegistryParameterPatterns:
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'No params action executed on' in result.extracted_content
assert '/test' in result.extracted_content
@@ -266,11 +276,13 @@ class TestActionRegistryParameterPatterns:
# Test legacy browser parameter
result1 = await registry.execute_action('legacy_browser_action', {'text': 'test1'}, browser_session=browser_session)
assert result1.extracted_content is not None
assert 'Legacy browser: test1, URL:' in result1.extracted_content
assert '/test' in result1.extracted_content
# Test legacy browser_context parameter
result2 = await registry.execute_action('legacy_context_action', {'text': 'test2'}, browser_session=browser_session)
assert result2.extracted_content is not None
assert 'Legacy context: test2, URL:' in result2.extracted_content
assert '/test' in result2.extracted_content
@@ -296,11 +308,13 @@ class TestActionRegistryParameterPatterns:
# Test direct page parameter
result1 = await registry.execute_action('direct_page_action', {'text': 'optimized'}, browser_session=browser_session)
assert result1.extracted_content is not None
assert 'Direct page: optimized, URL:' in result1.extracted_content
assert '/test' in result1.extracted_content
# Test browser_session parameter (should still work)
result2 = await registry.execute_action('browser_session_action', {'text': 'legacy'}, browser_session=browser_session)
assert result2.extracted_content is not None
assert 'Browser session: legacy, URL:' in result2.extracted_content
assert '/test' in result2.extracted_content
@@ -313,6 +327,7 @@ class TestActionRegistryParameterPatterns:
return ActionResult(extracted_content=f'Pydantic page: {params.message}, URL: {page.url}')
result3 = await registry.execute_action('pydantic_page_action', {'message': 'pydantic'}, browser_session=browser_session)
assert result3.extracted_content is not None
assert 'Pydantic page: pydantic, URL:' in result3.extracted_content
assert '/test' in result3.extracted_content
@@ -346,6 +361,7 @@ class TestActionToActionCalling:
result = await registry.execute_action('calling_action', {'message': 'test'}, browser_session=browser_session)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Called result: First: Helper processed: test on' in result.extracted_content
assert '/test' in result.extracted_content
@@ -373,13 +389,14 @@ class TestActionToActionCalling:
# Get the action's param model to call it properly
action = registry.registry.actions['select_cell_or_range_fixed']
params = action.param_model(cell_or_range=range_name)
await select_cell_or_range_fixed(params=params, browser_session=browser_session)
await select_cell_or_range_fixed(cell_or_range=range_name, browser_session=browser_session)
return ActionResult(extracted_content=f'Updated range {range_name} with {new_contents}')
# Test the fixed version (should work)
result_fixed = await registry.execute_action(
'select_cell_or_range_fixed', {'cell_or_range': 'A1:F100'}, browser_session=browser_session
)
assert result_fixed.extracted_content is not None
assert 'Selected cell A1:F100 on' in result_fixed.extracted_content
assert '/test' in result_fixed.extracted_content
@@ -387,6 +404,7 @@ class TestActionToActionCalling:
result_chain = await registry.execute_action(
'update_range_contents', {'range_name': 'B2:D4', 'new_contents': 'test data'}, browser_session=browser_session
)
assert result_chain.extracted_content is not None
assert 'Updated range B2:D4 with test data' in result_chain.extracted_content
# Test the problematic version (should work with enhanced registry)
@@ -394,6 +412,7 @@ class TestActionToActionCalling:
'select_cell_or_range', {'cell_or_range': 'A1:F100'}, browser_session=browser_session
)
# With the enhanced registry, this should succeed
assert result_problematic.extracted_content is not None
assert 'Selected cell A1:F100 on' in result_problematic.extracted_content
assert '/test' in result_problematic.extracted_content
@@ -425,6 +444,7 @@ class TestActionToActionCalling:
result = await registry.execute_action('top_action', {'original': 'test'}, browser_session=browser_session)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Top: Middle: Base: processed-enhanced-test on' in result.extracted_content
assert '/test' in result.extracted_content
@@ -444,11 +464,12 @@ class TestRegistryEdgeCases:
with pytest.raises(
TypeError, match='test_action\\(\\) does not accept positional arguments, only keyword arguments are allowed'
):
await test_action(browser_session, 'A1:B2')
await test_action('A1:B2', browser_session)
# Test that calling with keyword arguments works
result = await test_action(browser_session=browser_session, cell_or_range='A1:B2')
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Selected cell A1:B2 on' in result.extracted_content
async def test_missing_required_browser_session(self, registry):
@@ -520,6 +541,7 @@ class TestRegistryEdgeCases:
result = await registry.execute_action('sync_action', {'text': 'test'}, browser_session=browser_session)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Sync: test' in result.extracted_content
async def test_excluded_actions(self, browser_session):
@@ -545,6 +567,7 @@ class TestRegistryEdgeCases:
# Included action should work
result = await registry_with_exclusions.execute_action('included_action', {'text': 'test'})
assert result.extracted_content is not None
assert 'Should execute: test' in result.extracted_content
@@ -568,14 +591,17 @@ class TestExistingControllerActions:
# Test SearchGoogleAction
result1 = await registry.execute_action('test_search', {'query': 'python testing'}, browser_session=browser_session)
assert result1.extracted_content is not None
assert 'Searched for: python testing' in result1.extracted_content
# Test ClickElementAction
result2 = await registry.execute_action('test_click', {'index': 42}, browser_session=browser_session)
assert result2.extracted_content is not None
assert 'Clicked element: 42' in result2.extracted_content
# Test InputTextAction
result3 = await registry.execute_action('test_input', {'index': 5, 'text': 'test input'}, browser_session=browser_session)
assert result3.extracted_content is not None
assert 'Input text: test input at index: 5' in result3.extracted_content
async def test_pydantic_vs_individual_params_consistency(self, registry, browser_session):
@@ -603,7 +629,9 @@ class TestExistingControllerActions:
result2 = await registry.execute_action('pydantic_params_action', test_data, browser_session=browser_session)
# Both should extract the same content (just different prefixes)
assert result1.extracted_content is not None
assert 'hello-42' in result1.extracted_content
assert result2.extracted_content is not None
assert 'hello-42' in result2.extracted_content
assert 'Individual:' in result1.extracted_content
assert 'Pydantic:' in result2.extracted_content
@@ -683,7 +711,7 @@ class TestType2Pattern:
registry = Registry()
@registry.action('Scroll page')
async def scroll_page(direction: str = 'down', amount: int = 100, browser_session: BrowserSession = None):
async def scroll_page(direction: str = 'down', amount: int = 100, browser_session: BrowserSession = None): # type: ignore
return ActionResult(extracted_content=f'Scrolled {direction} by {amount}')
action = registry.registry.actions['scroll_page']
@@ -847,7 +875,11 @@ class TestParamsModelGeneration:
@registry.action('Complex action')
async def complex_action(
query: str, max_results: int, include_images: bool = True, page: Page = None, browser_session: BrowserSession = None
query: str,
max_results: int,
include_images: bool = True,
page: Page = None, # type: ignore
browser_session: BrowserSession = None, # type: ignore
):
return ActionResult()
@@ -869,7 +901,11 @@ class TestParamsModelGeneration:
@registry.action('Typed action')
async def typed_action(
count: int, rate: float, enabled: bool, name: str | None = None, browser_session: BrowserSession = None
count: int,
rate: float,
enabled: bool,
name: str | None = None,
browser_session: BrowserSession = None, # type: ignore
):
return ActionResult()
@@ -938,7 +974,7 @@ class TestParameterOrdering:
second: int,
page: Page,
third: bool = True,
page_extraction_llm: BaseChatModel = None,
page_extraction_llm: BaseChatModel = None, # type: ignore
):
return ActionResult()
@@ -1006,6 +1042,7 @@ class TestParameterOrdering:
# Should retry once and succeed
result = await registry.execute_action('flaky_action', {'value': 'test'}, browser_session=browser_session)
assert result.extracted_content is not None
assert 'Success on attempt 2' in result.extracted_content
assert call_count == 2
@@ -1070,7 +1107,7 @@ class TestParamsModelArgsAndKwargs:
# Model that includes browser_session
class ModelWithBrowser(ActionModel):
value: str = Field(description='Test value')
browser_session: BrowserSession = None
browser_session: BrowserSession = None # type: ignore
# Create a custom param model for select_cell_or_range
class CellRangeParams(ActionModel):
@@ -1136,7 +1173,9 @@ class TestParamsModelArgsAndKwargs:
# logger.info('\n--- Testing original problematic version ---')
try:
result1 = await registry.execute_action(
'select_cell_or_range', {'cell_or_range': 'A1:F100'}, browser_session=browser_session
'select_cell_or_range',
{'cell_or_range': 'A1:F100'},
browser_session=browser_session, # type: ignore
)
# logger.info(f'Success! Result: {result1}')
except Exception as e:
@@ -1146,7 +1185,9 @@ class TestParamsModelArgsAndKwargs:
# logger.info('\n--- Testing fixed version (positional args) ---')
try:
result2 = await registry.execute_action(
'select_cell_or_range_fixed', {'cell_or_range': 'A1:F100'}, browser_session=browser_session
'select_cell_or_range_fixed',
{'cell_or_range': 'A1:F100'},
browser_session=browser_session, # type: ignore
)
# logger.info(f'Success! Result: {result2}')
except Exception as e:
@@ -1156,7 +1197,9 @@ class TestParamsModelArgsAndKwargs:
# logger.info('\n--- Testing kwargs simulation version ---')
try:
result3 = await registry.execute_action(
'select_with_kwargs', {'cell_or_range': 'A1:F100'}, browser_session=browser_session
'select_with_kwargs',
{'cell_or_range': 'A1:F100'},
browser_session=browser_session, # type: ignore
)
# logger.info(f'Success! Result: {result3}')
except Exception as e:

View File

@@ -5,20 +5,18 @@ Tests the most critical event flows without excessive duplication.
"""
import base64
import json
import os
from unittest.mock import AsyncMock, MagicMock, Mock, patch
from unittest.mock import patch
from uuid import UUID
import pytest
from dotenv import load_dotenv
# Load environment variables before any imports
load_dotenv()
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import AIMessage
from pytest_httpserver import HTTPServer
from bubus import BaseEvent
from browser_use import Agent
from browser_use.agent.cloud_events import (
MAX_TASK_LENGTH,
CreateAgentOutputFileEvent,
@@ -27,126 +25,14 @@ from browser_use.agent.cloud_events import (
CreateAgentTaskEvent,
UpdateAgentTaskEvent,
)
# Skip LLM API key verification for tests
os.environ['SKIP_LLM_API_KEY_VERIFICATION'] = 'true'
from bubus import BaseEvent
from browser_use import Agent
from browser_use.browser import BrowserSession
from browser_use.sync.service import CloudSync
from tests.ci.mocks import create_mock_llm
@pytest.fixture
async def browser_session():
"""Create a real browser session for testing"""
session = BrowserSession(
headless=True,
user_data_dir=None, # Use temporary directory
)
yield session
await session.stop()
@pytest.fixture
def mock_llm():
"""Create a mock LLM that immediately returns done action"""
llm = MagicMock(spec=BaseChatModel)
# Create the JSON response that the agent would parse
json_response = {
'thinking': 'null',
'evaluation_previous_goal': 'Starting task',
'memory': 'New task to complete',
'next_goal': 'Complete the test task',
'action': [{'done': {'success': True, 'text': 'Test completed successfully'}}],
}
# Create a mock response with the JSON
mock_response = AIMessage(content=json.dumps(json_response))
# Make the LLM return our mock response
llm.invoke = lambda *args, **kwargs: mock_response
llm.ainvoke = AsyncMock(return_value=mock_response)
# Mock the with_structured_output method to return parsed objects
structured_llm = MagicMock()
async def mock_structured_ainvoke(*args, **kwargs):
# The agent will create its own AgentOutput and ActionModel classes
# We return the raw response and let the agent parse it
return {
'raw': mock_response,
'parsed': None, # Let the agent parse it from the raw JSON
}
structured_llm.ainvoke = AsyncMock(side_effect=mock_structured_ainvoke)
llm.with_structured_output = lambda *args, **kwargs: structured_llm
# Set attributes that agent checks
llm.model_name = 'gpt-4o'
llm._verified_api_keys = True
llm._verified_tool_calling_method = 'function_calling'
return llm
@pytest.fixture
def event_collector():
"""Collect all events emitted during tests"""
events = []
event_order = []
class EventCollector:
def __init__(self):
self.events = events
self.event_order = event_order
async def collect_event(self, event: BaseEvent):
self.events.append(event)
self.event_order.append(event.event_type)
return 'collected'
def get_events_by_type(self, event_type: str) -> list[BaseEvent]:
return [e for e in self.events if e.event_type == event_type]
def clear(self):
self.events.clear()
self.event_order.clear()
return EventCollector()
@pytest.fixture
def mock_cloud_sync():
"""Create mocked cloud sync service."""
sync = Mock(spec=CloudSync)
sync.send_event = AsyncMock()
sync.authenticate = AsyncMock(return_value=True)
sync._authenticated = True
sync.handle_event = AsyncMock()
return sync
@pytest.fixture
def agent_with_cloud(browser_session, mock_cloud_sync):
"""Create agent with cloud sync enabled."""
with patch('browser_use.sync.CloudSync', return_value=mock_cloud_sync):
with patch.dict(os.environ, {'BROWSERUSE_CLOUD_SYNC': 'true'}):
agent = Agent(
task='Test task',
llm=create_mock_llm(),
browser_session=browser_session,
)
return agent
from tests.ci.conftest import create_mock_llm
class TestAgentEventLifecycle:
"""Test critical agent event flows with minimal duplication"""
async def test_agent_lifecycle_events(self, mock_llm, browser_session, event_collector, httpserver: HTTPServer):
@pytest.mark.usefixtures('mock_llm', 'browser_session', 'event_collector', 'httpserver')
async def test_agent_lifecycle_events(self, mock_llm, browser_session, event_collector, httpserver):
"""Test that all events are emitted in the correct order during agent lifecycle"""
# Setup a test page
@@ -155,23 +41,19 @@ class TestAgentEventLifecycle:
# Navigate to test page
await browser_session.navigate(httpserver.url_for('/'))
# Patch environment variables to use localhost for CloudSync
with patch.dict(
os.environ, {'BROWSER_USE_CLOUD_URL': 'http://localhost:8000', 'BROWSER_USE_CLOUD_UI_URL': 'http://localhost:3000'}
):
# Create agent
agent = Agent(
task='Test task',
llm=mock_llm,
browser_session=browser_session,
generate_gif=False, # Don't generate GIF for faster test
)
# Create agent (environment already set up by conftest.py)
agent = Agent(
task='Test task',
llm=mock_llm,
browser_session=browser_session,
generate_gif=False, # Don't generate GIF for faster test
)
# Subscribe to all events
agent.eventbus.on('*', event_collector.collect_event)
# Subscribe to all events
agent.eventbus.on('*', event_collector.collect_event)
# Run the agent
history = await agent.run(max_steps=5)
# Run the agent
history = await agent.run(max_steps=5)
# Verify we got a successful completion
assert history.is_done()
@@ -213,30 +95,28 @@ class TestAgentEventLifecycle:
assert update_event.id == task_event.id
assert update_event.done_output is not None
async def test_agent_with_gif_generation(self, mock_llm, browser_session, event_collector, httpserver: HTTPServer):
@pytest.mark.usefixtures('mock_llm', 'browser_session', 'event_collector', 'httpserver')
async def test_agent_with_gif_generation(self, mock_llm, browser_session, cloud_sync, event_collector, httpserver):
"""Test that GIF generation triggers CreateAgentOutputFileEvent"""
# Setup a test page
httpserver.expect_request('/').respond_with_data('<html><body><h1>GIF Test</h1></body></html>', content_type='text/html')
await browser_session.navigate(httpserver.url_for('/'))
# Patch environment variables to use localhost for CloudSync
with patch.dict(
os.environ, {'BROWSER_USE_CLOUD_URL': 'http://localhost:8000', 'BROWSER_USE_CLOUD_UI_URL': 'http://localhost:3000'}
):
# Create agent with GIF generation
agent = Agent(
task='Test task with GIF',
llm=mock_llm,
browser_session=browser_session,
generate_gif=True, # Enable GIF generation
)
# Create agent with GIF generation
agent = Agent(
task='Test task with GIF',
llm=mock_llm,
browser_session=browser_session,
generate_gif=True, # Enable GIF generation
cloud_sync=cloud_sync,
)
# Subscribe to all events
agent.eventbus.on('*', event_collector.collect_event)
# Subscribe to all events
agent.eventbus.on('*', event_collector.collect_event)
# Run the agent
history = await agent.run(max_steps=5)
# Run the agent
_history = await agent.run(max_steps=5)
# Verify CreateAgentOutputFileEvent was emitted
output_file_events = event_collector.get_events_by_type('CreateAgentOutputFileEvent')
@@ -255,7 +135,8 @@ class TestAgentEventLifecycle:
assert gif_bytes.startswith(b'GIF87a') or gif_bytes.startswith(b'GIF89a')
assert len(gif_bytes) > 100 # Should be a real GIF file
async def test_step_screenshot_capture(self, mock_llm, browser_session, event_collector, httpserver: HTTPServer):
@pytest.mark.usefixtures('mock_llm', 'browser_session', 'event_collector', 'httpserver')
async def test_step_screenshot_capture(self, mock_llm, browser_session, cloud_sync, event_collector, httpserver):
"""Test that screenshots are captured for each step"""
# Setup test page
@@ -264,23 +145,20 @@ class TestAgentEventLifecycle:
)
await browser_session.navigate(httpserver.url_for('/'))
# Patch environment variables to use localhost for CloudSync
with patch.dict(
os.environ, {'BROWSER_USE_CLOUD_URL': 'http://localhost:8000', 'BROWSER_USE_CLOUD_UI_URL': 'http://localhost:3000'}
):
# Create agent
agent = Agent(
task='Test screenshot capture',
llm=mock_llm,
browser_session=browser_session,
generate_gif=False,
)
# Create agent
agent = Agent(
task='Test screenshot capture',
llm=mock_llm,
browser_session=browser_session,
generate_gif=False,
cloud_sync=cloud_sync,
)
# Subscribe to all events
agent.eventbus.on('*', event_collector.collect_event)
# Subscribe to all events
agent.eventbus.on('*', event_collector.collect_event)
# Run the agent
await agent.run(max_steps=3)
# Run the agent
await agent.run(max_steps=3)
# Get all step events
step_events = event_collector.get_events_by_type('CreateAgentStepEvent')
@@ -304,75 +182,107 @@ class TestAgentEventLifecycle:
class TestAgentCloudIntegration:
"""Test that agent properly integrates with cloud sync service"""
async def test_agent_emits_events_to_cloud(self, agent_with_cloud, mock_cloud_sync):
@pytest.mark.usefixtures('agent_with_cloud', 'event_collector', 'httpserver')
async def test_agent_emits_events_to_cloud(self, agent_with_cloud, event_collector, httpserver):
"""Test that agent emits all required events to cloud sync."""
# Set up httpserver to capture events
captured_events = []
def capture_events(request):
data = request.get_json()
captured_events.extend(data.get('events', []))
from werkzeug.wrappers import Response
return Response(
'{"processed": 1, "failed": 0, "results": [{"success": true}]}', status=200, mimetype='application/json'
)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_events)
# Subscribe to eventbus to verify events
agent_with_cloud.eventbus.on('*', event_collector.collect_event)
# Run agent
await agent_with_cloud.run()
# Check that events were sent to cloud sync
calls = mock_cloud_sync.handle_event.call_args_list
assert len(calls) >= 4 # At minimum: session, task, step, update
# Verify we have the core event types in eventbus
assert len(event_collector.event_order) >= 4 # At minimum: session, task, step, update
assert 'CreateAgentSessionEvent' in event_collector.event_order
assert 'CreateAgentTaskEvent' in event_collector.event_order
assert 'CreateAgentStepEvent' in event_collector.event_order
assert 'UpdateAgentTaskEvent' in event_collector.event_order
# Verify we have the core event types
event_types = [call.args[0].event_type for call in calls]
assert 'CreateAgentSessionEvent' in event_types
assert 'CreateAgentTaskEvent' in event_types
assert 'CreateAgentStepEvent' in event_types
assert 'UpdateAgentTaskEvent' in event_types
# Verify events were sent to cloud
assert len(captured_events) >= 4
# Verify event content
session_events = [call for call in calls if call.args[0].event_type == 'CreateAgentSessionEvent']
task_events = [call for call in calls if call.args[0].event_type == 'CreateAgentTaskEvent']
step_events = [call for call in calls if call.args[0].event_type == 'CreateAgentStepEvent']
# Verify event relationships using event_collector
session_events = event_collector.get_events_by_type('CreateAgentSessionEvent')
task_events = event_collector.get_events_by_type('CreateAgentTaskEvent')
step_events = event_collector.get_events_by_type('CreateAgentStepEvent')
assert len(session_events) == 1
assert len(task_events) == 1
assert len(step_events) >= 1
# Verify event relationships
session_event = session_events[0].args[0]
task_event = task_events[0].args[0]
step_event = step_events[0].args[0]
session_event = session_events[0]
task_event = task_events[0]
step_event = step_events[0]
assert task_event.agent_session_id == session_event.id
assert step_event.agent_task_id == task_event.id
async def test_agent_emits_session_start_event(self, agent_with_cloud, mock_cloud_sync):
@pytest.mark.usefixtures('agent_with_cloud', 'event_collector', 'httpserver')
async def test_agent_emits_session_start_event(self, agent_with_cloud, event_collector, httpserver):
"""Test that agent emits session start event."""
# Set up httpserver endpoint
httpserver.expect_request('/api/v1/events', method='POST').respond_with_json(
{'processed': 1, 'failed': 0, 'results': [{'success': True}]}
)
# Subscribe to events
agent_with_cloud.eventbus.on('*', event_collector.collect_event)
# Run agent
await agent_with_cloud.run()
# Check that session start event was sent
calls = mock_cloud_sync.handle_event.call_args_list
session_events = [call for call in calls if call.args[0].event_type == 'CreateAgentSessionEvent']
session_events = event_collector.get_events_by_type('CreateAgentSessionEvent')
assert len(session_events) == 1
event = session_events[0].args[0]
event = session_events[0]
assert hasattr(event, 'id')
assert hasattr(event, 'browser_session_id')
async def test_agent_emits_task_events(self, agent_with_cloud, mock_cloud_sync):
@pytest.mark.usefixtures('agent_with_cloud', 'event_collector', 'httpserver')
async def test_agent_emits_task_events(self, agent_with_cloud, event_collector, httpserver):
"""Test that agent emits task events."""
# Set up httpserver endpoint
httpserver.expect_request('/api/v1/events', method='POST').respond_with_json(
{'processed': 1, 'failed': 0, 'results': [{'success': True}]}
)
# Subscribe to events
agent_with_cloud.eventbus.on('*', event_collector.collect_event)
# Run agent
await agent_with_cloud.run()
# Check task events
calls = mock_cloud_sync.handle_event.call_args_list
# Should have CreateAgentTaskEvent
create_task_events = [call for call in calls if call.args[0].event_type == 'CreateAgentTaskEvent']
create_task_events = event_collector.get_events_by_type('CreateAgentTaskEvent')
assert len(create_task_events) == 1
create_event = create_task_events[0].args[0]
create_event = create_task_events[0]
assert create_event.task == 'Test task'
assert hasattr(create_event, 'agent_session_id')
# Should have UpdateAgentTaskEvent when done
update_task_events = [call for call in calls if call.args[0].event_type == 'UpdateAgentTaskEvent']
update_task_events = event_collector.get_events_by_type('UpdateAgentTaskEvent')
assert len(update_task_events) >= 1
@pytest.mark.usefixtures('browser_session')
async def test_cloud_sync_disabled(self, browser_session):
"""Test that cloud sync can be disabled."""
with patch.dict(os.environ, {'BROWSERUSE_CLOUD_SYNC': 'false'}):
with patch.dict(os.environ, {'BROWSER_USE_CLOUD_SYNC': 'false'}):
agent = Agent(
task='Test task',
llm=create_mock_llm(),
@@ -384,75 +294,75 @@ class TestAgentCloudIntegration:
# Run agent - should work without cloud sync
await agent.run()
async def test_agent_error_resilience(self, agent_with_cloud, mock_cloud_sync):
@pytest.mark.usefixtures('agent_with_cloud', 'httpserver')
async def test_agent_error_resilience(self, agent_with_cloud, httpserver):
"""Test that agent continues working even if cloud sync fails."""
# Make cloud sync fail
mock_cloud_sync.handle_event.side_effect = Exception('Cloud sync error')
# Run agent - should not raise exception
# Make cloud endpoint fail
def fail_handler(request):
from werkzeug.wrappers import Response
return Response('Server error', status=500, mimetype='text/plain')
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(fail_handler)
# Run agent - should not raise exception despite cloud sync failures
result = await agent_with_cloud.run()
# Agent should complete successfully despite sync failures
assert result is not None
assert result.is_done()
# Verify cloud sync was attempted
assert mock_cloud_sync.handle_event.call_count > 0
async def test_session_id_persistence(self, browser_session):
@pytest.mark.usefixtures('browser_session', 'cloud_sync', 'event_collector', 'httpserver')
async def test_session_id_persistence(self, browser_session, cloud_sync, event_collector, httpserver):
"""Test that agent session ID persists across runs."""
mock_sync = Mock(spec=CloudSync)
mock_sync.send_event = AsyncMock()
mock_sync.handle_event = AsyncMock()
mock_sync._authenticated = True
# Set up httpserver endpoint
httpserver.expect_request('/api/v1/events', method='POST').respond_with_json(
{'processed': 1, 'failed': 0, 'results': [{'success': True}]}
)
with patch('browser_use.sync.CloudSync', return_value=mock_sync):
with patch.dict(os.environ, {'BROWSERUSE_CLOUD_SYNC': 'true'}):
# Create first agent
agent1 = Agent(
task='First task',
llm=create_mock_llm(),
browser_session=browser_session,
)
agent1.cloud_sync = mock_sync
# Create first agent
agent1 = Agent(
task='First task',
llm=create_mock_llm(),
browser_session=browser_session,
cloud_sync=cloud_sync,
)
agent1.eventbus.on('*', event_collector.collect_event)
# Run first agent
await agent1.run()
# Run first agent
await agent1.run()
# Get session ID from first run
session_calls = [
call for call in mock_sync.handle_event.call_args_list if call.args[0].event_type == 'CreateAgentSessionEvent'
]
session_id_1 = session_calls[0].args[0].id
# Get session ID from first run
session_events = event_collector.get_events_by_type('CreateAgentSessionEvent')
assert len(session_events) == 1
session_id_1 = session_events[0].id
# Create second agent (will have different session ID)
agent2 = Agent(
task='Second task',
llm=create_mock_llm(),
browser_session=browser_session,
)
agent2.cloud_sync = mock_sync
# Clear event collector
event_collector.clear()
# Clear previous calls
mock_sync.handle_event.reset_mock()
# Create second agent (will have different session ID)
agent2 = Agent(
task='Second task',
llm=create_mock_llm(),
browser_session=browser_session,
cloud_sync=cloud_sync,
)
agent2.eventbus.on('*', event_collector.collect_event)
# Run second agent
await agent2.run()
# Run second agent
await agent2.run()
# Should create new session for new agent
session_calls_2 = [
call for call in mock_sync.handle_event.call_args_list if call.args[0].event_type == 'CreateAgentSessionEvent'
]
assert len(session_calls_2) == 1 # New session created
# Should create new session for new agent
session_events_2 = event_collector.get_events_by_type('CreateAgentSessionEvent')
assert len(session_events_2) == 1 # New session created
session_id_2 = session_events_2[0].id
# Should create new task with new session ID
task_calls = [
call for call in mock_sync.handle_event.call_args_list if call.args[0].event_type == 'CreateAgentTaskEvent'
]
assert len(task_calls) == 1
session_id_2 = session_calls_2[0].args[0].id
assert task_calls[0].args[0].agent_session_id == session_id_2
assert session_id_2 != session_id_1 # Different session IDs
# Should create new task with new session ID
task_events = event_collector.get_events_by_type('CreateAgentTaskEvent')
assert len(task_events) == 1
assert task_events[0].agent_session_id == session_id_2
assert session_id_2 != session_id_1 # Different session IDs
class TestEventValidation:
@@ -475,6 +385,10 @@ class TestEventValidation:
agent_session_id='0683fb03-c5da-79c9-8000-d3a39c47c651',
task='test',
llm_model='gpt-4o',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
),
CreateAgentStepEvent(
user_id='0683fb03-c5da-79c9-8000-d3a39c47c650',
@@ -484,6 +398,7 @@ class TestEventValidation:
memory='mem',
next_goal='next',
actions=[],
screenshot_url='data:image/png;...',
),
]
@@ -512,12 +427,23 @@ class TestEventValidation:
agent_session_id='0683fb03-c5da-79c9-8000-d3a39c47c659',
llm_model='test-model',
task=long_task,
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
def test_event_type_assignment(self):
"""Test that event_type is properly set and validated"""
event = CreateAgentTaskEvent(
user_id='test', agent_session_id='0683fb03-c5da-79c9-8000-d3a39c47c659', llm_model='test-model', task='test'
user_id='test',
agent_session_id='0683fb03-c5da-79c9-8000-d3a39c47c659',
llm_model='test-model',
task='test',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
# Event type should be automatically set

View File

@@ -3,13 +3,13 @@
import os
import tempfile
from pathlib import Path
from unittest.mock import patch
import httpx
import pytest
from bubus import BaseEvent
from pytest_httpserver import HTTPServer
from browser_use.agent.cloud_events import CreateAgentTaskEvent
from browser_use.sync.auth import TEMP_USER_ID, DeviceAuthClient
from browser_use.sync.service import CloudSync
@@ -21,24 +21,10 @@ def temp_config_dir():
temp_dir = Path(tmpdir) / '.config' / 'browseruse'
temp_dir.mkdir(parents=True, exist_ok=True)
# Temporarily replace the config dir
import browser_use.sync.auth
import browser_use.utils
original_auth = getattr(browser_use.sync.auth, 'BROWSER_USE_CONFIG_DIR', None)
original_utils = getattr(browser_use.utils, 'BROWSER_USE_CONFIG_DIR', None)
browser_use.sync.auth.BROWSER_USE_CONFIG_DIR = temp_dir
browser_use.utils.BROWSER_USE_CONFIG_DIR = temp_dir
os.environ['BROWSER_USE_CONFIG_DIR'] = str(temp_dir)
yield temp_dir
# Restore original
if original_auth:
browser_use.sync.auth.BROWSER_USE_CONFIG_DIR = original_auth
if original_utils:
browser_use.utils.BROWSER_USE_CONFIG_DIR = original_utils
@pytest.fixture
async def http_client(httpserver: HTTPServer):
@@ -52,26 +38,23 @@ class TestCloudSyncInit:
async def test_init_with_auth_enabled(self, temp_config_dir):
"""Test CloudSync initialization with auth enabled."""
# Set test environment variable
with patch.dict(os.environ, {'BROWSER_USE_CLOUD_URL': 'http://localhost:8000'}):
service = CloudSync(enable_auth=True)
service = CloudSync(enable_auth=True, base_url='http://localhost:8000')
assert service.base_url == 'http://localhost:8000'
assert service.enable_auth is True
assert service.auth_client is not None
assert isinstance(service.auth_client, DeviceAuthClient)
assert service.pending_events == []
assert service.session_id is None
assert service.base_url == 'http://localhost:8000'
assert service.enable_auth is True
assert service.auth_client is not None
assert isinstance(service.auth_client, DeviceAuthClient)
assert service.pending_events == []
assert service.session_id is None
async def test_init_with_auth_disabled(self, temp_config_dir):
"""Test CloudSync initialization with auth disabled."""
with patch.dict(os.environ, {'BROWSER_USE_CLOUD_URL': 'http://localhost:8000'}):
service = CloudSync(enable_auth=False)
service = CloudSync(enable_auth=False, base_url='http://localhost:8000')
assert service.base_url == 'http://localhost:8000'
assert service.enable_auth is False
assert service.auth_client is None
assert service.pending_events == []
assert service.base_url == 'http://localhost:8000'
assert service.enable_auth is False
assert service.auth_client is None
assert service.pending_events == []
class TestCloudSyncEventHandling:
@@ -107,10 +90,21 @@ class TestCloudSyncEventHandling:
return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json')
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request)
# Send event
await authenticated_sync.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task', priority='high'))
await authenticated_sync.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Test task',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Verify forwarding
assert len(requests) == 1
@@ -122,22 +116,32 @@ class TestCloudSyncEventHandling:
assert event['user_id'] == 'test-user-123'
# BaseEvent creates event_type attribute, plus our custom data as attributes
assert event['task'] == 'Test task'
assert event['priority'] == 'high'
async def test_event_queueing_unauthenticated(self, httpserver: HTTPServer, unauthenticated_sync):
"""Test event queueing when unauthenticated."""
# Server returns 401
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_json({'error': 'unauthorized'}, status=401)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_json({'error': 'unauthorized'}, status=401)
# Send event
await unauthenticated_sync.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Queued task'))
await unauthenticated_sync.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Queued task',
user_id=TEMP_USER_ID,
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Event should be queued
assert len(unauthenticated_sync.pending_events) == 1
queued_event = unauthenticated_sync.pending_events[0]
assert queued_event['event_type'] == 'CreateAgentTaskEvent'
assert queued_event['user_id'] == TEMP_USER_ID
assert queued_event['task'] == 'Queued task'
assert queued_event.event_type == 'CreateAgentTaskEvent'
assert queued_event.user_id == TEMP_USER_ID
assert queued_event.task == 'Queued task'
async def test_event_user_id_injection_pre_auth(self, httpserver: HTTPServer, unauthenticated_sync):
"""Test that temp user ID is injected for pre-auth events."""
@@ -149,10 +153,21 @@ class TestCloudSyncEventHandling:
return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json')
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request)
# Send event without user_id
await unauthenticated_sync.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Pre-auth task'))
await unauthenticated_sync.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Pre-auth task',
user_id=TEMP_USER_ID,
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Verify temp user ID was injected
assert len(requests) == 1
@@ -185,21 +200,31 @@ class TestCloudSyncRetryLogic:
return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json')
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request)
# Manually add pending events (simulating 401 scenario)
sync_with_auth.pending_events.extend(
[
{
'event_type': 'CreateAgentTaskEvent',
'task': 'Pending task 1',
'user_id': TEMP_USER_ID,
},
{
'event_type': 'CreateAgentTaskEvent',
'task': 'Pending task 2',
'user_id': TEMP_USER_ID,
},
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Pending task 1',
user_id=TEMP_USER_ID,
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
),
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Pending task 2',
user_id=TEMP_USER_ID,
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
),
]
)
@@ -219,10 +244,21 @@ class TestCloudSyncRetryLogic:
async def test_backend_error_resilience(self, httpserver: HTTPServer, sync_with_auth):
"""Test resilience to backend errors."""
# Server returns 500 error
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_data('Internal Server Error', status=500)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_data('Internal Server Error', status=500)
# Should not raise exception
await sync_with_auth.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Task during outage'))
await sync_with_auth.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Task during outage',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Events should not be queued for 500 errors (only 401)
assert len(sync_with_auth.pending_events) == 0
@@ -233,7 +269,18 @@ class TestCloudSyncRetryLogic:
sync_with_auth.base_url = 'http://localhost:99999' # Invalid port
# Should not raise exception
await sync_with_auth.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Task during network error'))
await sync_with_auth.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Task during network error',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Should handle gracefully without crashing
@@ -249,12 +296,23 @@ class TestCloudSyncRetryLogic:
return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json')
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request)
# Send multiple events concurrently
tasks = []
for i in range(5):
task = sync_with_auth.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task=f'Concurrent task {i}'))
task = sync_with_auth.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task=f'Concurrent task {i}',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
tasks.append(task)
await asyncio.gather(*tasks)
@@ -286,7 +344,7 @@ class TestCloudSyncBackendCommunication:
assert len(data['events']) == 1
event = data['events'][0]
required_fields = ['event_type', 'event_id', 'event_at', 'event_schema', 'data']
required_fields = ['event_type', 'event_id', 'event_created_at', 'event_schema', 'user_id']
for field in required_fields:
assert field in event, f'Missing required field: {field}'
@@ -294,7 +352,7 @@ class TestCloudSyncBackendCommunication:
return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json')
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request)
# Create authenticated service
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
@@ -305,7 +363,18 @@ class TestCloudSyncBackendCommunication:
service.auth_client = auth
service.session_id = 'test-session-id'
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Format validation test'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Format validation test',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
assert len(requests) == 1
@@ -324,7 +393,7 @@ class TestCloudSyncBackendCommunication:
return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json')
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request)
# Test authenticated request
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
@@ -334,7 +403,18 @@ class TestCloudSyncBackendCommunication:
service = CloudSync(base_url=httpserver.url_for(''), enable_auth=True)
service.auth_client = auth
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Auth header test'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Auth header test',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Check auth header was included
assert len(requests) == 1
@@ -346,7 +426,18 @@ class TestCloudSyncBackendCommunication:
requests.clear()
service.auth_client = DeviceAuthClient(base_url=httpserver.url_for('')) # No credentials
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='No auth test'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='No auth test',
user_id='',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Check no auth header
assert len(requests) == 1
@@ -368,7 +459,18 @@ class TestCloudSyncErrorHandling:
sync_service.base_url = 'http://10.255.255.1' # Non-routable IP for timeout
# Should not raise exception
await sync_service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Timeout test'))
await sync_service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Timeout test',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
async def test_malformed_event_handling(self, httpserver: HTTPServer, sync_service):
"""Test handling of events that can't be serialized."""
@@ -389,20 +491,42 @@ class TestCloudSyncErrorHandling:
error_codes = [400, 403, 404, 429, 500, 502, 503]
for status_code in error_codes:
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_json(
httpserver.expect_request('/api/v1/events', method='POST').respond_with_json(
{'error': f'Test error {status_code}'}, status=status_code
)
# Should not raise exception
await sync_service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task=f'Error {status_code} test'))
await sync_service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task=f'Error {status_code} test',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
async def test_invalid_response_handling(self, httpserver: HTTPServer, sync_service):
"""Test handling of invalid server responses."""
# Return invalid JSON
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_data('Not JSON', status=200)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_data('Not JSON', status=200)
# Should not raise exception
await sync_service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Invalid response test'))
await sync_service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Invalid response test',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
async def test_event_with_restricted_attributes(self, httpserver: HTTPServer, sync_service):
"""Test handling events that don't allow user_id attribute."""
@@ -415,7 +539,7 @@ class TestCloudSyncErrorHandling:
event_type: str = 'RestrictedEvent'
data: str = 'test'
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_json({'processed': 1}, status=200)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_json({'processed': 1}, status=200)
# Should not raise exception - will log debug message about not being able to set user_id
await sync_service.handle_event(RestrictedEvent())
@@ -441,12 +565,23 @@ class TestCloudSyncErrorHandling:
return Response('{"processed": 1}', status=200, mimetype='application/json')
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(handler)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(handler)
# Send 10 events concurrently
tasks = []
for i in range(10):
task = sync_service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task=f'Concurrent error test {i}'))
task = sync_service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task=f'Concurrent error test {i}',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
tasks.append(task)
# All should complete without raising

View File

@@ -6,7 +6,6 @@ import json
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import patch
import anyio
import httpx
@@ -17,37 +16,26 @@ from pytest_httpserver import HTTPServer
# Load environment variables before any imports
load_dotenv()
from bubus import BaseEvent
from browser_use.agent.cloud_events import CreateAgentSessionEvent, CreateAgentTaskEvent
from browser_use.sync.auth import TEMP_USER_ID, DeviceAuthClient
from browser_use.sync.service import CloudSync
# Define config dir for tests
# BROWSER_USE_CONFIG_DIR = Path.home() / ".config" / "browseruse"
BROWSER_USE_CONFIG_DIR = Path(tempfile.mkdtemp()) / '.config' / 'browseruse'
# Define config dir for tests - not needed anymore since we'll use env vars
@pytest.fixture
def temp_config_dir():
def temp_config_dir(monkeypatch):
"""Create temporary config directory."""
with tempfile.TemporaryDirectory() as tmpdir:
temp_dir = Path(tmpdir) / '.config' / 'browseruse'
temp_dir.mkdir(parents=True, exist_ok=True)
# Temporarily replace the config dir
original = BROWSER_USE_CONFIG_DIR
import browser_use.sync.auth
import browser_use.utils
browser_use.sync.auth.BROWSER_USE_CONFIG_DIR = temp_dir
browser_use.utils.BROWSER_USE_CONFIG_DIR = temp_dir
# Use monkeypatch to set the environment variable
monkeypatch.setenv('BROWSER_USE_CONFIG_DIR', str(temp_dir))
yield temp_dir
# Restore original
browser_use.sync.auth.BROWSER_USE_CONFIG_DIR = original
browser_use.utils.BROWSER_USE_CONFIG_DIR = original
@pytest.fixture
async def http_client(httpserver: HTTPServer):
@@ -59,23 +47,23 @@ async def http_client(httpserver: HTTPServer):
class TestDeviceAuthClient:
"""Test DeviceAuthClient class."""
async def test_init_creates_config_dir(self, temp_config_dir):
async def test_init_creates_config_dir(self, temp_config_dir, httpserver):
"""Test that initialization creates config directory."""
auth = DeviceAuthClient()
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
assert temp_config_dir.exists()
assert (temp_config_dir / 'cloud_auth.json').exists() is False
async def test_load_credentials_no_file(self, temp_config_dir):
async def test_load_credentials_no_file(self, temp_config_dir, httpserver):
"""Test loading credentials when file doesn't exist."""
auth = DeviceAuthClient()
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
# When no file exists, auth_config should have no token/user_id
assert auth.auth_config.api_token is None
assert auth.auth_config.user_id is None
assert not auth.is_authenticated
async def test_save_and_load_credentials(self, temp_config_dir):
async def test_save_and_load_credentials(self, temp_config_dir, httpserver):
"""Test saving and loading credentials."""
auth = DeviceAuthClient()
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
# Update auth config and save
auth.auth_config.api_token = 'test-key-123'
@@ -84,7 +72,7 @@ class TestDeviceAuthClient:
auth.auth_config.save_to_file()
# Load in a new instance
auth2 = DeviceAuthClient()
auth2 = DeviceAuthClient(base_url=httpserver.url_for(''))
assert auth2.auth_config.api_token == 'test-key-123'
assert auth2.auth_config.user_id == 'test-user-123'
assert auth2.is_authenticated
@@ -94,9 +82,9 @@ class TestDeviceAuthClient:
stat = (temp_config_dir / 'cloud_auth.json').stat()
assert oct(stat.st_mode)[-3:] == '600'
async def test_is_authenticated(self, temp_config_dir):
async def test_is_authenticated(self, temp_config_dir, httpserver):
"""Test authentication status check."""
auth = DeviceAuthClient()
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
# Not authenticated initially
assert auth.is_authenticated is False
@@ -107,12 +95,12 @@ class TestDeviceAuthClient:
auth.auth_config.save_to_file()
# Reload to verify persistence
auth2 = DeviceAuthClient()
auth2 = DeviceAuthClient(base_url=httpserver.url_for(''))
assert auth2.is_authenticated is True
async def test_get_credentials(self, temp_config_dir):
async def test_get_credentials(self, temp_config_dir, httpserver):
"""Test getting credentials."""
auth = DeviceAuthClient()
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
# No credentials initially
assert auth.api_token is None
@@ -299,9 +287,9 @@ class TestDeviceAuthClient:
assert result is None # Should timeout and return None
assert not auth.is_authenticated
async def test_logout(self, temp_config_dir):
async def test_logout(self, temp_config_dir, httpserver):
"""Test logout functionality."""
auth = DeviceAuthClient()
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
# Save credentials directly using auth_config
auth.auth_config.api_token = 'test-key'
@@ -319,7 +307,7 @@ class TestDeviceAuthClient:
assert (temp_config_dir / 'cloud_auth.json').exists()
# Verify the file contains empty credentials
auth2 = DeviceAuthClient()
auth2 = DeviceAuthClient(base_url=httpserver.url_for(''))
assert auth2.auth_config.api_token is None
assert auth2.auth_config.user_id is None
@@ -327,14 +315,14 @@ class TestDeviceAuthClient:
class TestCloudSync:
"""Test CloudSync class."""
async def test_init(self, temp_config_dir):
async def test_init(self, temp_config_dir, httpserver):
"""Test CloudSync initialization."""
service = CloudSync(
base_url='https://cloud.browser-use.com',
base_url=httpserver.url_for(''),
enable_auth=True,
)
assert service.base_url == 'https://cloud.browser-use.com'
assert service.base_url == httpserver.url_for('')
assert service.enable_auth is True
assert service.auth_client is not None
assert isinstance(service.auth_client, DeviceAuthClient)
@@ -355,7 +343,7 @@ class TestCloudSync:
return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json')
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request)
# Create authenticated service
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
@@ -367,12 +355,18 @@ class TestCloudSync:
service.session_id = 'test-session-id'
# Send event
event_data = {
'task': 'Test task',
'status': 'running',
}
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', **event_data))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Test task',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Check request was made
assert len(requests) == 1
@@ -388,7 +382,6 @@ class TestCloudSync:
assert event['event_type'] == 'CreateAgentTaskEvent'
assert event['user_id'] == 'test-user-123'
assert event['task'] == 'Test task'
assert event['status'] == 'running'
async def test_send_event_pre_auth(self, httpserver: HTTPServer, temp_config_dir):
"""Test sending event before authentication."""
@@ -405,7 +398,7 @@ class TestCloudSync:
return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json')
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request)
# Create unauthenticated service
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
@@ -416,12 +409,18 @@ class TestCloudSync:
service.session_id = 'test-session-id'
# Send event
event_data = {
'task': 'Test task',
'status': 'running',
}
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', **event_data))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Test task',
user_id=TEMP_USER_ID,
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Check request was made without auth header
assert len(requests) == 1
@@ -435,7 +434,6 @@ class TestCloudSync:
assert event['event_type'] == 'CreateAgentTaskEvent'
assert event['user_id'] == TEMP_USER_ID
assert event['task'] == 'Test task'
assert event['status'] == 'running'
async def test_authenticate_and_resend(self, httpserver: HTTPServer, temp_config_dir):
"""Test authentication flow with pre-auth event resending."""
@@ -461,7 +459,7 @@ class TestCloudSync:
# Subsequent requests: success
return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json')
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(handle_events_request)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(handle_events_request)
# Create service with unauthenticated auth client
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
@@ -472,12 +470,23 @@ class TestCloudSync:
service.session_id = 'test-session-id'
# Send pre-auth event (should get 401 and be queued)
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Pre-auth task'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Pre-auth task',
user_id=TEMP_USER_ID,
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Event should be in pending_events since we got 401
assert len(service.pending_events) == 1
assert service.pending_events[0]['task'] == 'Pre-auth task'
assert service.pending_events[0]['user_id'] == TEMP_USER_ID
assert hasattr(service.pending_events[0], 'task') and service.pending_events[0].task == 'Pre-auth task' # type: ignore
assert hasattr(service.pending_events[0], 'user_id') and service.pending_events[0].user_id == TEMP_USER_ID # type: ignore
# Now authenticate the auth client
auth.auth_config.api_token = 'test-api-key'
@@ -503,7 +512,7 @@ class TestCloudSync:
async def test_error_handling(self, httpserver: HTTPServer, temp_config_dir):
"""Test error handling during event sending."""
# Set up server to return 500 error
httpserver.expect_request('/api/v1/events/', method='POST').respond_with_data('Internal Server Error', status=500)
httpserver.expect_request('/api/v1/events', method='POST').respond_with_data('Internal Server Error', status=500)
# Create service with real auth
auth = DeviceAuthClient(base_url=httpserver.url_for(''))
@@ -515,7 +524,18 @@ class TestCloudSync:
service.session_id = 'test-session-id'
# Send event - should not raise exception but handle gracefully
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Test task',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Should handle error gracefully without crashing
@@ -561,10 +581,8 @@ class TestCloudSync:
content = '\n'.join(json.dumps(event) for event in events) + '\n'
await anyio.Path(wal_path).write_text(content)
# Patch BROWSER_USE_CONFIG_DIR to point to our temp directory
with patch('browser_use.utils.BROWSER_USE_CONFIG_DIR', temp_config_dir):
# Call the method under test
await service._update_wal_user_ids(service.session_id)
# Call the method under test (temp_config_dir fixture already sets the env var)
await service._update_wal_user_ids(service.session_id)
# Read back the updated file and verify changes
content = await anyio.Path(wal_path).read_text()
@@ -645,7 +663,7 @@ class TestIntegration:
# Set up events endpoint
httpserver.expect_request(
'/api/v1/events/',
'/api/v1/events',
method='POST',
).respond_with_json({'processed': 1, 'failed': 0})
@@ -654,17 +672,36 @@ class TestIntegration:
service.session_id = 'test-session-id'
# Send pre-auth event
await service.handle_event(BaseEvent(event_type='CreateAgentSessionEvent', started_at=datetime.utcnow().isoformat()))
await service.handle_event(
CreateAgentSessionEvent(
user_id=TEMP_USER_ID,
browser_session_id='test-browser-session',
browser_session_live_url='http://example.com/live',
browser_session_cdp_url='ws://example.com/cdp',
)
)
# Authenticate
authenticated = await service.authenticate(show_instructions=False)
assert authenticated is True
assert service.auth_client is not None
assert service.auth_client.is_authenticated
assert service.auth_client.api_token == 'test-api-key'
assert service.auth_client.user_id == 'test-user-123'
# Send authenticated event
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Authenticated task'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Authenticated task',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Verify auth was saved
auth_file = temp_config_dir / 'cloud_auth.json'
@@ -715,7 +752,7 @@ class TestAuthResilience:
# Now simulate token expiry by returning 401 errors
httpserver.expect_request(
'/api/v1/events/',
'/api/v1/events',
method='POST',
).respond_with_json({'error': 'unauthorized', 'detail': 'Token expired'}, status=401)
@@ -726,7 +763,18 @@ class TestAuthResilience:
service.auth_client = auth
# Send event - should not raise exception even though token is expired
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task after token expiry'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Test task after token expiry',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Agent should continue functioning despite sync failure
assert True # No exception raised
@@ -753,12 +801,23 @@ class TestAuthResilience:
# Set up events endpoint to handle unauthenticated requests
httpserver.expect_request(
'/api/v1/events/',
'/api/v1/events',
method='POST',
).respond_with_json({'processed': 1, 'failed': 0})
# Should be able to send events without auth (pre-auth mode)
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task without auth'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Test task without auth',
user_id='',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
async def test_server_downtime_resilience(self, httpserver: HTTPServer, http_client, temp_config_dir):
"""Test that server downtime doesn't break the agent."""
@@ -777,7 +836,18 @@ class TestAuthResilience:
# Should be able to send events even when server is down
# They will be queued locally
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task during server downtime'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Test task during server downtime',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
async def test_excessive_event_queue_handling(self, httpserver: HTTPServer, http_client, temp_config_dir):
"""Test that excessive event queuing doesn't break the agent."""
@@ -790,7 +860,18 @@ class TestAuthResilience:
# Send many events while server is down (no responses configured)
for i in range(100):
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task=f'Test task {i}'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task=f'Test task {i}',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)
# Agent should still be functioning
assert True # No memory issues or crashes
@@ -813,7 +894,7 @@ class TestAuthResilience:
# Set up another malformed response for events
httpserver.expect_request(
'/api/v1/events/',
'/api/v1/events',
method='POST',
).respond_with_data('malformed response', status=500)
@@ -823,4 +904,15 @@ class TestAuthResilience:
service.auth_client = auth
# Should handle malformed event response gracefully
await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task with malformed response'))
await service.handle_event(
CreateAgentTaskEvent(
agent_session_id='test-session',
llm_model='test-model',
task='Test task with malformed response',
user_id='test-user-123',
done_output=None,
user_feedback_type=None,
user_comment=None,
gif_url=None,
)
)

View File

@@ -1,124 +0,0 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
async def analyze_page_structure(url: str):
    """Analyze and print the structure of a webpage with enhanced debugging.

    Opens a visible (non-headless) browser, navigates to ``url``, waits for the
    network to go idle, then prints: viewport dimensions and scroll offsets,
    cookie/consent-related elements, all fixed/sticky positioned elements, and
    the page structure from ``ctx.get_page_structure()``. Blocks on ``input()``
    so the browser window can be inspected manually before it is closed.

    Args:
        url: The page to navigate to and analyze.
    """
    # NOTE(review): user_data_dir is passed to Browser itself, not BrowserConfig —
    # confirm this matches the Browser constructor signature.
    browser = Browser(
        config=BrowserConfig(
            headless=False,  # Set to True if you don't need to see the browser
        ),
        user_data_dir=None,
    )
    context = BrowserContext(browser=browser)
    try:
        async with context as ctx:
            # Navigate to the URL
            page = await ctx.get_current_page()
            await page.goto(url)
            # 'networkidle' lets late-injected overlays (e.g. cookie banners)
            # appear before we inspect the DOM
            await page.wait_for_load_state('networkidle')

            # Get viewport dimensions and current scroll position
            viewport_info = await page.evaluate("""() => {
                return {
                    viewport: {
                        width: window.innerWidth,
                        height: window.innerHeight,
                        scrollX: window.scrollX,
                        scrollY: window.scrollY
                    }
                }
            }""")

            print('\nViewport Information:')
            print(f'Width: {viewport_info["viewport"]["width"]}')
            print(f'Height: {viewport_info["viewport"]["height"]}')
            print(f'ScrollX: {viewport_info["viewport"]["scrollX"]}')
            print(f'ScrollY: {viewport_info["viewport"]["scrollY"]}')

            # Enhanced debug information for cookie consent and fixed position elements.
            # The in-page JS collects computed-style snapshots for (a) any element whose
            # id/class mentions "cookie"/"consent" and (b) every fixed/sticky element.
            debug_info = await page.evaluate("""() => {
                function getElementInfo(element) {
                    const rect = element.getBoundingClientRect();
                    const style = window.getComputedStyle(element);
                    return {
                        tag: element.tagName.toLowerCase(),
                        id: element.id,
                        className: element.className,
                        position: style.position,
                        rect: {
                            top: rect.top,
                            right: rect.right,
                            bottom: rect.bottom,
                            left: rect.left,
                            width: rect.width,
                            height: rect.height
                        },
                        isFixed: style.position === 'fixed',
                        isSticky: style.position === 'sticky',
                        zIndex: style.zIndex,
                        visibility: style.visibility,
                        display: style.display,
                        opacity: style.opacity
                    };
                }

                // Find cookie-related elements
                const cookieElements = Array.from(document.querySelectorAll('[id*="cookie"], [id*="consent"], [class*="cookie"], [class*="consent"]'));
                const fixedElements = Array.from(document.querySelectorAll('*')).filter(el => {
                    const style = window.getComputedStyle(el);
                    return style.position === 'fixed' || style.position === 'sticky';
                });

                return {
                    cookieElements: cookieElements.map(getElementInfo),
                    fixedElements: fixedElements.map(getElementInfo)
                };
            }""")

            print('\nCookie-related Elements:')
            for elem in debug_info['cookieElements']:
                print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
                print(f'Position: {elem["position"]}')
                print(f'Rect: {elem["rect"]}')
                print(f'Z-Index: {elem["zIndex"]}')
                print(f'Visibility: {elem["visibility"]}')
                print(f'Display: {elem["display"]}')
                print(f'Opacity: {elem["opacity"]}')

            print('\nFixed/Sticky Position Elements:')
            for elem in debug_info['fixedElements']:
                print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
                print(f'Position: {elem["position"]}')
                print(f'Rect: {elem["rect"]}')
                print(f'Z-Index: {elem["zIndex"]}')

            print(f'\nPage Structure for {url}:\n')
            structure = await ctx.get_page_structure()
            print(structure)

            # Keep the headed browser open until the operator is done inspecting
            input('Press Enter to close the browser...')
    finally:
        # Always release the browser process, even if analysis raised
        await browser.close()
if __name__ == '__main__':
    # You can modify this list to analyze different pages.
    # Fix: the zeiss.com URL was listed twice, causing the same page to be
    # analyzed (and a browser launched) twice — deduplicated.
    urls = [
        'https://www.mlb.com/yankees/stats/',
        'https://immobilienscout24.de',
        'https://www.zeiss.com/career/en/job-search.html?page=1',
        'https://reddit.com',
    ]
    # One event loop (and one browser lifecycle) per URL, run sequentially
    for url in urls:
        asyncio.run(analyze_page_structure(url))

View File

@@ -7,6 +7,7 @@ from langchain_openai import ChatOpenAI
from browser_use.agent.prompts import AgentMessagePrompt
from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.dom.service import DomService
from browser_use.filesystem.file_system import FileSystem
def count_string_tokens(string: str, model: str) -> tuple[int, float]:
@@ -101,7 +102,7 @@ async def test_focus_vs_all_elements():
# print(all_elements_state.element_tree.clickable_elements_to_string())
prompt = AgentMessagePrompt(
browser_state_summary=all_elements_state,
result=None,
file_system=FileSystem(working_dir='./tmp'),
include_attributes=DEFAULT_INCLUDE_ATTRIBUTES,
step_info=None,
)
@@ -110,9 +111,15 @@ async def test_focus_vs_all_elements():
user_message = prompt.get_user_message(use_vision=False).content
os.makedirs('./tmp', exist_ok=True)
async with await anyio.open_file('./tmp/user_message.txt', 'w', encoding='utf-8') as f:
await f.write(user_message)
if isinstance(user_message, str):
await f.write(user_message)
else:
await f.write(str(user_message))
token_count, price = count_string_tokens(user_message, model='gpt-4o')
if isinstance(user_message, str):
token_count, price = count_string_tokens(user_message, model='gpt-4o')
else:
token_count, price = count_string_tokens(str(user_message), model='gpt-4o')
print(f'Prompt token count: {token_count}, price: {round(price, 4)} USD')
print('User message written to ./tmp/user_message.txt')

View File

@@ -17,7 +17,7 @@
# from browser_use import Agent
# from browser_use.browser.browser import BrowserSession
# from browser_use.sync.service import CloudSync
# from tests.ci.mocks import create_mock_llm
# from tests.ci.conftest import create_mock_llm
# logger = logging.getLogger(__name__)
@@ -77,11 +77,11 @@
# Environment variables required:
# - RUN_LIVE_TESTS=1 (to enable the test)
# - BROWSER_USE_CLOUD_URL (optional, defaults to https://cloud.browser-use.com)
# - BROWSER_USE_CLOUD_API_URL (optional, defaults to https://cloud.browser-use.com)
# """
# # Configuration
# backend_url = os.getenv('BROWSER_USE_CLOUD_URL', 'http://localhost:8000')
# backend_url = os.getenv('BROWSER_USE_CLOUD_API_URL', 'http://localhost:8000')
# logger.info(f'Running live integration test against: {backend_url}')
# # Create mock LLM
@@ -89,7 +89,7 @@
# # Set environment variables for cloud sync
# os.environ['BROWSERUSE_CLOUD_SYNC'] = 'true'
# os.environ['BROWSER_USE_CLOUD_URL'] = backend_url
# os.environ['BROWSER_USE_CLOUD_API_URL'] = backend_url
# # Create browser session with real profile
# browser_session = BrowserSession(
@@ -147,7 +147,7 @@
# This is a simpler test that just verifies event sending works.
# """
# backend_url = os.getenv('BROWSER_USE_CLOUD_URL', 'http://localhost:8000')
# backend_url = os.getenv('BROWSER_USE_CLOUD_API_URL', 'http://localhost:8000')
# logger.info(f'Testing cloud sync against: {backend_url}')
# # Create cloud sync service

View File

@@ -25,10 +25,10 @@ def llm():
@pytest.fixture
async def browser_session():
browser_session = BrowserSession(
headless=True,
user_data_dir=None,
)
from browser_use.browser.profile import BrowserProfile
profile = BrowserProfile(headless=True, user_data_dir=None)
browser_session = BrowserSession(browser_profile=profile)
await browser_session.start()
yield browser_session
await browser_session.stop()

View File

@@ -68,7 +68,10 @@ class TestCoreFunctionality:
@pytest.fixture(scope='module')
async def browser_session(self):
"""Create and provide a BrowserSession instance with security disabled."""
browser_session = BrowserSession(headless=True, user_data_dir=None)
from browser_use.browser.profile import BrowserProfile
profile = BrowserProfile(headless=True, user_data_dir=None)
browser_session = BrowserSession(browser_profile=profile)
yield browser_session
await browser_session.kill()

View File

@@ -21,10 +21,10 @@ class MockLLM:
@pytest.fixture(scope='module')
async def browser_session():
browser_session = BrowserSession(
headless=True,
user_data_dir=None,
)
from browser_use.browser.profile import BrowserProfile
profile = BrowserProfile(headless=True, user_data_dir=None)
browser_session = BrowserSession(browser_profile=profile)
await browser_session.start()
yield browser_session
await browser_session.stop()

Some files were not shown because too many files have changed in this diff Show More