Merge branch 'main' into feat/evals-anchor-support

This commit is contained in:
Aitor
2025-07-02 14:23:45 +02:00
committed by GitHub
19 changed files with 307 additions and 163 deletions

View File

@@ -7,7 +7,9 @@ on:
jobs:
run_evaluation:
runs-on: ubuntu-latest
runs-on:
group: eval
labels: eval-2-core-500
timeout-minutes: 360
env:
IN_DOCKER: 'true'
@@ -104,6 +106,13 @@ jobs:
ps aux | wc -l
echo "================================="
- name: Construct GitHub Workflow URL
id: github_url
run: |
GITHUB_WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
echo "GITHUB_WORKFLOW_URL=$GITHUB_WORKFLOW_URL" >> $GITHUB_OUTPUT
echo "::notice title=Workflow URL::Workflow URL: $GITHUB_WORKFLOW_URL"
- name: Construct eval command
id: eval_command
run: |
@@ -216,6 +225,9 @@ jobs:
[[ -n "$TASK_TEXT" ]] && CMD_ARGS+=("--task-text" "$TASK_TEXT")
[[ -n "$TASK_WEBSITE" ]] && CMD_ARGS+=("--task-website" "$TASK_WEBSITE")
# Add GitHub workflow URL
[[ -n "${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}" ]] && CMD_ARGS+=("--github-workflow-url" "${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}")
# Convert array to command string with proper escaping
printf -v CMD_STRING '%q ' "${CMD_ARGS[@]}"

View File

@@ -4,8 +4,7 @@ import json
import logging
from browser_use.agent.message_manager.views import (
MessageMetadata,
SupportedMessageTypes,
HistoryItem,
)
from browser_use.agent.prompts import AgentMessagePrompt
from browser_use.agent.views import (
@@ -106,6 +105,7 @@ class MessageManager:
include_attributes: list[str] | None = None,
message_context: str | None = None,
sensitive_data: dict[str, str | dict[str, str]] | None = None,
max_history_items: int | None = None,
):
self.task = task
self.state = state
@@ -114,6 +114,9 @@ class MessageManager:
self.sensitive_data_description = ''
self.available_file_paths = available_file_paths
self.use_thinking = use_thinking
self.max_history_items = max_history_items
assert max_history_items is None or max_history_items > 5, 'max_history_items must be None or greater than 5'
# Store settings as direct attributes instead of in a settings object
self.include_attributes = include_attributes or []
@@ -124,16 +127,45 @@ class MessageManager:
if len(self.state.history.messages) == 0:
self._init_messages()
@property
def agent_history_description(self) -> str:
"""Build agent history description from list of items, respecting max_history_items limit"""
if self.max_history_items is None:
# Include all items
return '\n'.join(item.to_string() for item in self.state.agent_history_items)
total_items = len(self.state.agent_history_items)
# If we have fewer items than the limit, just return all items
if total_items <= self.max_history_items:
return '\n'.join(item.to_string() for item in self.state.agent_history_items)
# We have more items than the limit, so we need to omit some
omitted_count = total_items - self.max_history_items
# Show first item + omitted message + most recent (max_history_items - 1) items
# The omitted message doesn't count against the limit, only real history items do
recent_items_count = self.max_history_items - 1 # -1 for first item
items_to_include = [
self.state.agent_history_items[0].to_string(), # Keep first item (initialization)
f'<sys>[... {omitted_count} previous steps omitted...]</sys>',
]
# Add most recent items
items_to_include.extend([item.to_string() for item in self.state.agent_history_items[-recent_items_count:]])
return '\n'.join(items_to_include)
def _init_messages(self) -> None:
"""Initialize the message history with system message, context, task, and other initial messages"""
self._add_message_with_type(self.system_prompt, message_type='init')
self._add_message_with_type(self.system_prompt)
placeholder_message = UserMessage(
content='<example_1>\nHere is an example output of thinking and tool call. You can use it as a reference but do not copy it exactly.',
cache=True,
)
# placeholder_message = HumanMessage(content='Example output:')
self._add_message_with_type(placeholder_message, message_type='init')
self._add_message_with_type(placeholder_message)
# Create base example content
example_content = {
@@ -173,18 +205,18 @@ After writing todo.md, I can also initialize a github.md file to accumulate the
The file system actions do not change the browser state, so I can also click on the bytedance/UI-TARS-desktop (index [4]) to start collecting information."""
example_tool_call_1 = AssistantMessage(content=json.dumps(example_content), cache=True)
self._add_message_with_type(example_tool_call_1, message_type='init')
self._add_message_with_type(example_tool_call_1)
self._add_message_with_type(
UserMessage(
content='Data written to todo.md.\nData written to github.md.\nClicked element with index 4.\n</example_1>',
cache=True,
),
message_type='init',
)
def add_new_task(self, new_task: str) -> None:
self.task = new_task
self.state.agent_history_description += f'\n<s>User updated <user_request> to: {new_task}</s>\n'
task_update_item = HistoryItem(system_message=f'User updated <user_request> to: {new_task}')
self.state.agent_history_items.append(task_update_item)
def _update_agent_history_description(
self,
@@ -196,7 +228,7 @@ The file system actions do not change the browser state, so I can also click on
if result is None:
result = []
step_number = step_info.step_number if step_info else 'unknown'
step_number = step_info.step_number if step_info else None
self.state.read_state_description = ''
@@ -220,23 +252,23 @@ The file system actions do not change the browser state, so I can also click on
if action_results:
action_results = f'Action Results:\n{action_results}'
action_results = action_results.strip('\n')
action_results = action_results.strip('\n') if action_results else None
# Handle case where model_output is None (e.g., parsing failed)
# Build the history item
if model_output is None:
if isinstance(step_number, int) and step_number > 0:
self.state.agent_history_description += f"""<step_{step_number}>
Agent failed to output in the right format.
</step_{step_number}>
"""
# Only add error history item if we have a valid step number
if step_number is not None and step_number > 0:
history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.')
self.state.agent_history_items.append(history_item)
else:
self.state.agent_history_description += f"""<step_{step_number}>
Evaluation of Previous Step: {model_output.current_state.evaluation_previous_goal}
Memory: {model_output.current_state.memory}
Next Goal: {model_output.current_state.next_goal}
{action_results}
</step_{step_number}>
"""
history_item = HistoryItem(
step_number=step_number,
evaluation_previous_goal=model_output.current_state.evaluation_previous_goal,
memory=model_output.current_state.memory,
next_goal=model_output.current_state.next_goal,
action_results=action_results,
)
self.state.agent_history_items.append(history_item)
def _get_sensitive_data_description(self, current_page_url) -> str:
sensitive_data = self.sensitive_data
@@ -284,7 +316,7 @@ Next Goal: {model_output.current_state.next_goal}
state_message = AgentMessagePrompt(
browser_state_summary=browser_state_summary,
file_system=self.file_system,
agent_history_description=self.state.agent_history_description,
agent_history_description=self.agent_history_description,
read_state_description=self.state.read_state_description,
task=self.task,
include_attributes=self.include_attributes,
@@ -346,16 +378,15 @@ Next Goal: {model_output.current_state.next_goal}
# Log message history for debugging
logger.debug(self._log_history_lines())
self.last_input_messages = [m.message for m in self.state.history.messages]
self.last_input_messages = list(self.state.history.messages)
return self.last_input_messages
def _add_message_with_type(
self,
message: BaseMessage,
position: int | None = None,
message_type: SupportedMessageTypes | None = None,
) -> None:
"""Add message with token count metadata
"""Add message to history
position: None for last, -1 for second last, etc.
"""
@@ -363,8 +394,7 @@ Next Goal: {model_output.current_state.next_goal}
if self.sensitive_data:
message = self._filter_sensitive_data(message)
metadata = MessageMetadata(message_type=message_type)
self.state.history.add_message(message, metadata, position)
self.state.history.add_message(message, position)
@time_execution_sync('--filter_sensitive_data')
def _filter_sensitive_data(self, message: BaseMessage) -> BaseMessage:

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING
from pydantic import BaseModel, ConfigDict, Field
@@ -13,43 +13,74 @@ if TYPE_CHECKING:
pass
SupportedMessageTypes = Literal['init', 'memory']
class HistoryItem(BaseModel):
"""Represents a single agent history item with its data and string representation"""
class MessageMetadata(BaseModel):
"""Metadata for a message"""
message_type: SupportedMessageTypes | None = None
class ManagedMessage(BaseModel):
"""A message with its metadata"""
message: BaseMessage
metadata: MessageMetadata = Field(default_factory=MessageMetadata)
class MessageHistory(BaseModel):
"""History of messages with metadata"""
messages: list[ManagedMessage] = Field(default_factory=list)
step_number: int | None = None
evaluation_previous_goal: str | None = None
memory: str | None = None
next_goal: str | None = None
action_results: str | None = None
error: str | None = None
system_message: str | None = None
model_config = ConfigDict(arbitrary_types_allowed=True)
def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
"""Add message with metadata to history"""
if position is None:
self.messages.append(ManagedMessage(message=message, metadata=metadata))
def model_post_init(self, __context) -> None:
"""Validate that error and system_message are not both provided"""
if self.error is not None and self.system_message is not None:
raise ValueError('Cannot have both error and system_message at the same time')
def to_string(self) -> str:
"""Get string representation of the history item"""
step_str = f'step_{self.step_number}' if self.step_number is not None else 'step_unknown'
if self.error:
return f"""<{step_str}>
{self.error}
</{step_str}>"""
elif self.system_message:
return f"""<sys>
{self.system_message}
</sys>"""
else:
self.messages.insert(position, ManagedMessage(message=message, metadata=metadata))
content_parts = [
f'Evaluation of Previous Step: {self.evaluation_previous_goal}',
f'Memory: {self.memory}',
f'Next Goal: {self.next_goal}',
]
if self.action_results:
content_parts.append(self.action_results)
content = '\n'.join(content_parts)
return f"""<{step_str}>
{content}
</{step_str}>"""
class MessageHistory(BaseModel):
"""History of messages"""
messages: list[BaseMessage] = Field(default_factory=list)
model_config = ConfigDict(arbitrary_types_allowed=True)
def add_message(self, message: BaseMessage, position: int | None = None) -> None:
"""Add message to history"""
if position is None:
self.messages.append(message)
else:
self.messages.insert(position, message)
def get_messages(self) -> list[BaseMessage]:
"""Get all messages"""
return [m.message for m in self.messages]
return self.messages
def remove_last_state_message(self) -> None:
"""Remove last state message from history"""
if len(self.messages) > 2 and isinstance(self.messages[-1].message, UserMessage):
if len(self.messages) > 2 and isinstance(self.messages[-1], UserMessage):
self.messages.pop()
@@ -58,7 +89,9 @@ class MessageManagerState(BaseModel):
history: MessageHistory = Field(default_factory=MessageHistory)
tool_id: int = 1
agent_history_description: str = '<s>Agent initialized</s>\n'
agent_history_items: list[HistoryItem] = Field(
default_factory=lambda: [HistoryItem(step_number=0, system_message='Agent initialized')]
)
read_state_description: str = ''
model_config = ConfigDict(arbitrary_types_allowed=True)

View File

@@ -169,6 +169,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
],
max_actions_per_step: int = 10,
use_thinking: bool = True,
max_history_items: int = 40,
page_extraction_llm: BaseChatModel | None = None,
planner_llm: BaseChatModel | None = None,
planner_interval: int = 1, # Run planner every N steps
@@ -235,12 +236,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
available_file_paths=available_file_paths,
include_attributes=include_attributes,
max_actions_per_step=max_actions_per_step,
use_thinking=use_thinking,
max_history_items=max_history_items,
page_extraction_llm=page_extraction_llm,
planner_llm=planner_llm,
planner_interval=planner_interval,
is_planner_reasoning=is_planner_reasoning,
extend_planner_system_message=extend_planner_system_message,
use_thinking=use_thinking,
calculate_cost=calculate_cost,
)
@@ -318,6 +320,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
include_attributes=self.settings.include_attributes,
message_context=self.settings.message_context,
sensitive_data=sensitive_data,
max_history_items=self.settings.max_history_items,
)
if isinstance(browser, BrowserSession):

View File

@@ -34,7 +34,7 @@ Next Goal: Your goal for this step
Action Results: Your actions and their results
</step_{{step_number}}>
and system messages wrapped in <s> tag.
and system messages wrapped in <sys> tag.
</agent_history>
<user_request>
@@ -56,12 +56,12 @@ Interactive Elements: All interactive elements will be provided in format as [in
Examples:
[33]<div>User form</div>
\t*[35]*<button aria-label='Submit form'>Submit</button>
\t<new>[35]</new><button aria-label='Submit form'>Submit</button>
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements with \* are new elements that were added after the previous step (if url has not changed)
- Elements tagged with <new> are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Pure text elements without [] are not interactive.
</browser_state>
@@ -90,15 +90,12 @@ Strictly follow these rules while using the browser and navigating the web:
<file_system>
- You have access to a persistent file system which you can use to track progress, store results, and manage long tasks.
- Your file system is initialized with two files:
1. `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). The contents of this file will be also visible in your state. ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
2. `results.md`: Use this to accumulate extracted or generated results for the user. Append each new finding clearly and avoid duplication. This file serves as your output log.
- You can read, write, and append to files.
- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
- Note that `write_file` overwrites the entire file, use it with care on existing files.
- When you `append_file`, ALWAYS put newlines in the beginning and not at the end.
- If the file is too large, you are only given a preview of your file. Use read_file to see the full content if necessary.
- Always use the file system as the source of truth. Do not rely on memory alone for tracking task state.
- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You DON'T HAVE write access to these files. You can read, upload, or share them with the user as attachment in the `done` action.
- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary.
- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
- If the task is really long, initialize a `results.md` file to accumulate your results.
- DO NOT use the file system if the task is less than 5 steps!
</file_system>

View File

@@ -34,7 +34,7 @@ Next Goal: Your goal for this step
Action Results: Your actions and their results
</step_{{step_number}}>
and system messages wrapped in <s> tag.
and system messages wrapped in <sys> tag.
</agent_history>
<user_request>
@@ -56,12 +56,12 @@ Interactive Elements: All interactive elements will be provided in format as [in
Examples:
[33]<div>User form</div>
\t*[35]*<button aria-label='Submit form'>Submit</button>
\t<new>[35]</new><button aria-label='Submit form'>Submit</button>
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements with \* are new elements that were added after the previous step (if url has not changed)
- Elements tagged with <new> are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Pure text elements without [] are not interactive.
</browser_state>
@@ -90,15 +90,12 @@ Strictly follow these rules while using the browser and navigating the web:
<file_system>
- You have access to a persistent file system which you can use to track progress, store results, and manage long tasks.
- Your file system is initialized with two files:
1. `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). The contents of this file will be also visible in your state. ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
2. `results.md`: Use this to accumulate extracted or generated results for the user. Append each new finding clearly and avoid duplication. This file serves as your output log.
- You can read, write, and append to files.
- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
- Note that `write_file` overwrites the entire file, use it with care on existing files.
- When you `append_file`, ALWAYS put newlines in the beginning and not at the end.
- If the file is too large, you are only given a preview of your file. Use read_file to see the full content if necessary.
- Always use the file system as the source of truth. Do not rely on memory alone for tracking task state.
- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary.
- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
- If the task is really long, initialize a `results.md` file to accumulate your results.
- DO NOT use the file system if the task is less than 5 steps!
</file_system>

View File

@@ -54,6 +54,7 @@ class AgentSettings(BaseModel):
]
max_actions_per_step: int = 10
use_thinking: bool = True
max_history_items: int = 40
page_extraction_llm: BaseChatModel | None = None
planner_llm: BaseChatModel | None = None

View File

@@ -25,7 +25,7 @@ os.environ['PW_TEST_SCREENSHOT_NO_FONTS_READY'] = '1' # https://github.com/micr
import anyio
import psutil
from playwright._impl._api_structures import FloatRect, ViewportSize
from playwright._impl._api_structures import ViewportSize
from pydantic import AliasChoices, BaseModel, ConfigDict, Field, InstanceOf, PrivateAttr, model_validator
from uuid_extensions import uuid7str
@@ -248,12 +248,12 @@ class BrowserSession(BaseModel):
return self._logger
def __repr__(self) -> str:
is_copy = '©' if self._original_browser_session else '1'
return f'BrowserSession🆂 {self.id[-4:]}{is_copy}{str(id(self))[-2:]} ({self._connection_str}, profile={self.browser_profile})'
is_copy = '©' if self._original_browser_session else '#'
return f'BrowserSession🆂 {self.id[-4:]} {is_copy}{str(id(self))[-2:]} ({self._connection_str}, profile={self.browser_profile})'
def __str__(self) -> str:
is_copy = '©' if self._original_browser_session else '1'
return f'BrowserSession🆂 {self.id[-4:]}{is_copy}{str(id(self))[-2:]} 🅟 {str(id(self.agent_current_page))[-2:]}'
is_copy = '©' if self._original_browser_session else '#'
return f'BrowserSession🆂 {self.id[-4:]} {is_copy}{str(id(self))[-2:]} 🅟 {str(id(self.agent_current_page))[-2:]}'
# better to force people to get it from the right object, "only one way to do it" is better python
# def __getattr__(self, key: str) -> Any:
@@ -725,7 +725,7 @@ class BrowserSession(BaseModel):
full_page=False,
# scale='css',
timeout=self.browser_profile.default_timeout or 30000,
clip=FloatRect(**clip) if clip else None,
# clip=FloatRect(**clip) if clip else None,
animations='allow',
caret='initial',
)
@@ -2714,27 +2714,28 @@ class BrowserSession(BaseModel):
# This prevents timeouts on very long pages
# 1. Get current viewport and page dimensions including scroll position
dimensions = await page.evaluate("""() => {
return {
width: window.innerWidth,
height: window.innerHeight,
pageHeight: document.documentElement.scrollHeight,
devicePixelRatio: window.devicePixelRatio || 1,
scrollX: window.pageXOffset || document.documentElement.scrollLeft || 0,
scrollY: window.pageYOffset || document.documentElement.scrollTop || 0
};
}""")
# dimensions = await page.evaluate("""() => {
# return {
# width: window.innerWidth,
# height: window.innerHeight,
# pageWidth: document.documentElement.scrollWidth,
# pageHeight: document.documentElement.scrollHeight,
# devicePixelRatio: window.devicePixelRatio || 1,
# scrollX: window.pageXOffset || document.documentElement.scrollLeft || 0,
# scrollY: window.pageYOffset || document.documentElement.scrollTop || 0
# };
# }""")
# When full_page=False, screenshot captures the current viewport
# The clip parameter uses viewport coordinates (0,0 is top-left of viewport)
# We just need to ensure the clip dimensions don't exceed our maximums
# clip_width = min(dimensions['width'], MAX_SCREENSHOT_WIDTH)
# clip_height = min(dimensions['height'], MAX_SCREENSHOT_HEIGHT)
# Take screenshot using our retry-decorated method
return await self._take_screenshot_hybrid(
page,
clip={
'x': dimensions['scrollX'],
'y': dimensions['scrollY'],
'width': min(dimensions['width'], MAX_SCREENSHOT_WIDTH),
'height': min(dimensions['height'], MAX_SCREENSHOT_HEIGHT),
},
)
# Don't pass clip parameter - let Playwright capture the full viewport
# It will automatically handle cases where viewport extends beyond page content
return await self._take_screenshot_hybrid(page)
except Exception as e:
self.logger.error(f'❌ Failed to take screenshot after retries: {type(e).__name__}: {e}')
raise

View File

@@ -855,8 +855,8 @@ class BrowserUseApp(App):
# Extract original task(s)
original_tasks = []
for msg in message_history:
if hasattr(msg, 'message') and hasattr(msg.message, 'content'):
content = msg.message.content
if hasattr(msg, 'content'):
content = msg.content
if isinstance(content, str) and 'Your ultimate task is:' in content:
task_text = content.split('"""')[1].strip()
original_tasks.append(task_text)

View File

@@ -331,10 +331,12 @@ class Controller(Generic[Context]):
@self.registry.action(
"""Extract structured, semantic data (e.g. product description, price, all information about XYZ) from the current webpage based on a textual query.
Only use this for extracting info from a single product/article page, not for entire listings or search results pages.
Set extract_links=True ONLY if your query requires extracting links/URLs from the page.
""",
)
async def extract_structured_data(
query: str,
extract_links: bool,
page: Page,
page_extraction_llm: BaseChatModel,
file_system: FileSystem,
@@ -344,13 +346,8 @@ Only use this for extracting info from a single product/article page, not for en
import markdownify
strip = []
include_links = False
lower_query = query.lower()
url_keywords = ['url', 'links']
if any(keyword in lower_query for keyword in url_keywords):
include_links = True
if not include_links:
if not extract_links:
strip = ['a', 'img']
# Run markdownify in a thread pool to avoid blocking the event loop

View File

@@ -15,13 +15,11 @@ class GoToUrlAction(BaseModel):
class ClickElementAction(BaseModel):
index: int
xpath: str | None = None
class InputTextAction(BaseModel):
index: int
text: str
xpath: str | None = None
class DoneAction(BaseModel):

View File

@@ -195,7 +195,7 @@ class DOMElementNode(DOMBaseNode):
# Build the line
if node.is_new:
highlight_indicator = f'*[{node.highlight_index}]*'
highlight_indicator = f'<new>[{node.highlight_index}]</new>'
else:
highlight_indicator = f'[{node.highlight_index}]'

View File

@@ -124,7 +124,7 @@ class FileSystem:
self.files = {}
if create_default_files:
self.default_files = ['results.md', 'todo.md']
self.default_files = ['todo.md']
self._create_default_files()
self.extracted_content_count = 0

View File

@@ -6,6 +6,7 @@ import httpx
from openai import APIConnectionError, APIStatusError, AsyncOpenAI, RateLimitError
from openai.types.chat.chat_completion import ChatCompletion
from openai.types.shared.chat_model import ChatModel
from openai.types.shared_params.reasoning_effort import ReasoningEffort
from openai.types.shared_params.response_format_json_schema import JSONSchema, ResponseFormatJSONSchema
from pydantic import BaseModel
@@ -18,6 +19,8 @@ from browser_use.llm.views import ChatInvokeCompletion, ChatInvokeUsage
T = TypeVar('T', bound=BaseModel)
ReasoningModels: list[ChatModel | str] = ['o4-mini', 'o3', 'o3-mini', 'o1', 'o1-pro', 'o3-pro']
@dataclass
class ChatOpenAI(BaseChatModel):
@@ -33,6 +36,7 @@ class ChatOpenAI(BaseChatModel):
# Model params
temperature: float | None = None
reasoning_effort: ReasoningEffort = 'low'
# Client initialization parameters
api_key: str | None = None
@@ -132,10 +136,19 @@ class ChatOpenAI(BaseChatModel):
openai_messages = OpenAIMessageSerializer.serialize_messages(messages)
try:
reasoning_effort_dict: dict = {}
if self.model in ReasoningModels:
reasoning_effort_dict = {
'reasoning_effort': self.reasoning_effort,
}
if output_format is None:
# Return string response
response = await self.get_client().chat.completions.create(
model=self.model, messages=openai_messages, temperature=self.temperature
model=self.model,
messages=openai_messages,
temperature=self.temperature,
**reasoning_effort_dict,
)
usage = self._get_usage(response)
@@ -157,6 +170,7 @@ class ChatOpenAI(BaseChatModel):
messages=openai_messages,
temperature=self.temperature,
response_format=ResponseFormatJSONSchema(json_schema=response_format, type='json_schema'),
**reasoning_effort_dict,
)
if response.choices[0].message.content is None:

View File

@@ -578,6 +578,7 @@ class TaskResult:
task: Any
max_steps: int
laminar_link: str | None = None
github_workflow_url: str | None = None
completed_stages: set[Stage] = field(default_factory=set)
stage_data: dict[Stage, Any] = field(default_factory=dict)
errors: list = field(default_factory=list)
@@ -619,6 +620,7 @@ class TaskResult:
'critical_error': self.critical_error,
'server_save_failed': self.server_save_failed,
'laminarTaskLink': self.laminar_link,
'githubWorkflowUrl': self.github_workflow_url,
}
# Add task execution data if available
@@ -759,8 +761,8 @@ SUPPORTED_MODELS = {
'gemini-1.5-flash': {'provider': 'google', 'model_name': 'gemini-1.5-flash-latest', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.0-flash-lite': {'provider': 'google', 'model_name': 'gemini-2.0-flash-lite', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.0-flash': {'provider': 'google', 'model_name': 'gemini-2.0-flash', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.5-pro': {'provider': 'google', 'model_name': 'gemini-2.5-pro-preview-03-25', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.5-flash': {'provider': 'google', 'model_name': 'gemini-2.5-flash-latest', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.5-pro': {'provider': 'google', 'model_name': 'gemini-2.5-pro', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.5-flash': {'provider': 'google', 'model_name': 'gemini-2.5-flash', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.5-pro-preview-05-06': {
'provider': 'google',
'model_name': 'gemini-2.5-pro-preview-05-06',
@@ -774,6 +776,7 @@ SUPPORTED_MODELS = {
# OpenAI
'gpt-4.1': {'provider': 'openai', 'model_name': 'gpt-4.1-2025-04-14', 'api_key_env': 'OPENAI_API_KEY'},
'gpt-4.1-mini': {'provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'api_key_env': 'OPENAI_API_KEY'},
'gpt-o3': {'provider': 'openai', 'model_name': 'o3-2025-04-16', 'api_key_env': 'OPENAI_API_KEY'},
'gpt-4.1-nano': {'provider': 'openai', 'model_name': 'gpt-4.1-nano-2025-04-14', 'api_key_env': 'OPENAI_API_KEY'},
'gpt-4o': {'provider': 'openai', 'model_name': 'gpt-4o', 'api_key_env': 'OPENAI_API_KEY'},
'gpt-4o-mini': {'provider': 'openai', 'model_name': 'gpt-4o-mini', 'api_key_env': 'OPENAI_API_KEY'},
@@ -924,7 +927,7 @@ def get_llm(model_name: str):
case 'openai':
kwargs = {'model': config['model_name'], 'temperature': 0.0}
# Must set temperatue=1 if model is gpt-o4-mini
if model_name == 'gpt-o4-mini':
if model_name in ['gpt-o4-mini', 'gpt-o3']:
kwargs['temperature'] = 1
if api_key:
kwargs['api_key'] = api_key
@@ -1289,8 +1292,9 @@ async def judge_task_result(model, task_folder: Path, score_threshold: float = 3
try:
# Run comprehensive judge evaluation
comprehensive_result = await evaluate_task_with_comprehensive_judge(
task_folder=task_folder, model=model, max_images=10
comprehensive_result = await asyncio.wait_for(
evaluate_task_with_comprehensive_judge(task_folder=task_folder, model=model, max_images=10),
timeout=180, # 3 minutes max for evaluation
)
if comprehensive_result.get('error'):
@@ -1649,6 +1653,7 @@ async def run_task_with_semaphore(
headless: bool,
use_vision: bool,
semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument
github_workflow_url: str | None = None,
use_serp: bool = False,
use_anchor: bool = False,
enable_memory: bool = False,
@@ -1721,7 +1726,9 @@ async def run_task_with_semaphore(
logger.debug(f'Task {task.task_id}: No Laminar run ID available, skipping datapoint creation')
# Initialize task result and basic setup
task_result = TaskResult(task.task_id, run_id, task.confirmed_task, task, max_steps_per_task, laminar_task_link)
task_result = TaskResult(
task.task_id, run_id, task.confirmed_task, task, max_steps_per_task, laminar_task_link, github_workflow_url
)
task_folder = Path(f'saved_trajectories/{task.task_id}')
@@ -1917,7 +1924,13 @@ async def run_task_with_semaphore(
# Create minimal task result for server reporting
try:
task_result = TaskResult(
task.task_id, run_id, task.confirmed_task, task, max_steps_per_task, laminar_task_link
task.task_id,
run_id,
task.confirmed_task,
task,
max_steps_per_task,
laminar_task_link,
github_workflow_url,
)
task_result.mark_critical_error(f'Initialization failed: {str(init_error)}')
except Exception as result_error:
@@ -1978,6 +1991,7 @@ async def run_multiple_tasks(
convex_url: str,
secret_key: str,
eval_model: BaseChatModel,
github_workflow_url: str | None = None,
max_parallel_runs: int = 3,
max_steps_per_task: int = 25,
start_index: int = 0,
@@ -2063,6 +2077,7 @@ async def run_multiple_tasks(
headless=headless,
use_vision=use_vision,
semaphore_runs=semaphore_runs, # Pass the semaphore
github_workflow_url=github_workflow_url,
use_serp=use_serp,
use_anchor=use_anchor,
enable_memory=enable_memory,
@@ -2326,6 +2341,7 @@ async def run_evaluation_pipeline(
convex_url: str,
secret_key: str,
eval_model: BaseChatModel,
github_workflow_url: str | None = None,
max_parallel_runs: int = 3,
max_steps_per_task: int = 25,
start_index: int = 0,
@@ -2379,6 +2395,7 @@ async def run_evaluation_pipeline(
convex_url=convex_url,
secret_key=secret_key,
eval_model=eval_model,
github_workflow_url=github_workflow_url,
max_parallel_runs=max_parallel_runs,
max_steps_per_task=max_steps_per_task,
start_index=start_index,
@@ -2463,6 +2480,7 @@ if __name__ == '__main__':
parser.add_argument('--use-mind2web-judge', action='store_true', help='Use original judge')
parser.add_argument('--no-thinking', action='store_true', help='Disable thinking in agent system prompt')
parser.add_argument('--use-anchor', action='store_true', help='Use anchor to navigate to the page')
parser.add_argument('--github-workflow-url', type=str, default=None, help='GitHub workflow URL for tracking')
# Single task mode arguments
parser.add_argument('--task-text', type=str, default=None, help='Task description for single task mode')
@@ -2705,6 +2723,7 @@ if __name__ == '__main__':
convex_url=convex_url,
secret_key=secret_key,
eval_model=eval_model,
github_workflow_url=args.github_workflow_url,
max_parallel_runs=parallel_runs,
max_steps_per_task=args.max_steps,
start_index=start_index,

View File

@@ -12,9 +12,16 @@ from browser_use import Agent
from browser_use.browser import BrowserProfile
from browser_use.llm import ChatOpenAI
try:
from lmnr import Laminar
Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY'))
except Exception as e:
print(f'Error initializing Laminar: {e}')
# Initialize the model
llm = ChatOpenAI(
model='gpt-4o',
model='gpt-4.1',
temperature=0.0,
)
# Simple case: the model will see x_name and x_password, but never the actual values.
@@ -35,7 +42,7 @@ sensitive_data: dict[str, str | dict[str, str]] = {
'https://*.google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'},
}
# Update task to use one of the credentials above
task = 'Go to example.com and login with company_username and company_password'
task = 'Go to google.com and put the login information in the search bar.'
# Always set allowed_domains when using sensitive_data for security
from browser_use.browser.session import BrowserSession

View File

@@ -17,7 +17,6 @@ dependencies = [
"google-api-core>=2.25.0",
"httpx>=0.28.1",
"markdownify==1.1.0",
"mem0ai>=0.1.106",
"patchright>=1.52.5",
"playwright>=1.52.0",
"portalocker>=2.7.0,<3.0.0",

View File

@@ -2,6 +2,7 @@
Test that screenshots work correctly in headless browser mode.
"""
import asyncio
import base64
from browser_use.browser import BrowserProfile, BrowserSession
@@ -193,7 +194,7 @@ class TestHeadlessScreenshots:
# Take screenshots from all sessions at the same time
print('Taking screenshots from all 10 sessions simultaneously...')
screenshot_tasks = [session.take_screenshot(full_page=True) for session in browser_sessions]
screenshot_tasks = [session.take_screenshot() for session in browser_sessions]
screenshots = await asyncio.gather(*screenshot_tasks)
# Verify all screenshots are valid
@@ -221,9 +222,7 @@ class TestHeadlessScreenshots:
# Also test taking regular (viewport) screenshots in parallel
print('Taking viewport screenshots from all sessions simultaneously...')
viewport_screenshots = await asyncio.gather(
*[session.take_screenshot(full_page=False) for session in browser_sessions]
)
viewport_screenshots = await asyncio.gather(*[session.take_screenshot() for session in browser_sessions])
# Verify viewport screenshots
for i, screenshot in enumerate(viewport_screenshots):
@@ -244,3 +243,69 @@ class TestHeadlessScreenshots:
for i, result in enumerate(results):
if isinstance(result, Exception):
print(f'Warning: Session {i} kill raised exception: {type(result).__name__}: {result}')
async def test_screenshot_at_bottom_of_page(self, httpserver):
    """Test screenshot capture when scrolled to bottom of page (regression test for clipping issue).

    Previously, taking a viewport screenshot while the page was scrolled to (or
    past) the bottom of a tall document could fail with "Clipped area is either
    empty or outside the resulting image". This test captures screenshots at the
    top, middle, bottom, and beyond-bottom scroll positions and asserts each one
    decodes to a non-trivial image.
    """
    # Fresh, ephemeral headless session: no user_data_dir so nothing persists,
    # keep_alive=False so the browser is torn down with the session.
    browser_session = BrowserSession(
        browser_profile=BrowserProfile(
            headless=True,
            user_data_dir=None,
            keep_alive=False,
        )
    )
    try:
        await browser_session.start()

        # Create a page with scrollable content: 3000px tall gradient with
        # text markers at the top, middle, and bottom so positions are distinguishable.
        httpserver.expect_request('/scrollable').respond_with_data(
            """<html>
<head><title>Scrollable Page Test</title></head>
<body style="margin: 0; padding: 0;">
<div style="height: 3000px; background: linear-gradient(to bottom, red, yellow, green, blue);">
<div style="position: absolute; top: 0; left: 10px; font-size: 24px;">Top of page</div>
<div style="position: absolute; top: 50%; left: 10px; font-size: 24px;">Middle of page</div>
<div style="position: absolute; bottom: 10px; left: 10px; font-size: 24px;">Bottom of page</div>
</div>
</body>
</html>""",
            content_type='text/html',
        )

        # Navigate to test page
        await browser_session.navigate(httpserver.url_for('/scrollable'))
        page = browser_session.agent_current_page
        assert page is not None

        # Test 1: Screenshot at top of page (should work)
        # The >5000-byte check on the decoded PNG guards against blank/degenerate captures.
        screenshot_top = await browser_session.take_screenshot()
        assert screenshot_top is not None
        assert len(base64.b64decode(screenshot_top)) > 5000

        # Test 2: Screenshot at middle of page
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight / 2)')
        await asyncio.sleep(0.1)  # Wait for scroll
        screenshot_middle = await browser_session.take_screenshot()
        assert screenshot_middle is not None
        assert len(base64.b64decode(screenshot_middle)) > 5000

        # Test 3: Screenshot at bottom of page (this was failing with clipping error)
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
        await asyncio.sleep(0.1)  # Wait for scroll

        # This should not raise "Clipped area is either empty or outside the resulting image" error
        screenshot_bottom = await browser_session.take_screenshot()
        assert screenshot_bottom is not None
        assert len(base64.b64decode(screenshot_bottom)) > 5000

        # Test 4: Screenshot when scrolled beyond page bottom (edge case —
        # browsers clamp the scroll, but the capture path must still cope)
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight + 1000)')
        await asyncio.sleep(0.1)
        screenshot_beyond = await browser_session.take_screenshot()
        assert screenshot_beyond is not None
        assert len(base64.b64decode(screenshot_beyond)) > 5000

        print('✅ All screenshot positions tested successfully!')
    finally:
        # Always release the browser, even when an assertion above fails.
        await browser_session.stop()

View File

@@ -138,14 +138,11 @@ class TestFileSystem:
assert fs.data_dir.name == DEFAULT_FILE_SYSTEM_PATH
# Check default files are created
assert 'results.md' in fs.files
assert 'todo.md' in fs.files
assert len(fs.files) == 2
assert len(fs.files) == 1
# Check files exist on disk
results_path = fs.data_dir / 'results.md'
todo_path = fs.data_dir / 'todo.md'
assert results_path.exists()
assert todo_path.exists()
def test_filesystem_without_default_files(self, empty_filesystem):
@@ -199,12 +196,6 @@ class TestFileSystem:
"""Test getting files from the filesystem."""
fs = temp_filesystem
# Get existing file
results_file = fs.get_file('results.md')
assert results_file is not None
assert isinstance(results_file, MarkdownFile)
assert results_file.name == 'results'
# Get non-existent file
non_existent = fs.get_file('nonexistent.md')
assert non_existent is None
@@ -218,16 +209,15 @@ class TestFileSystem:
fs = temp_filesystem
files = fs.list_files()
assert 'results.md' in files
assert 'todo.md' in files
assert len(files) == 2
assert len(files) == 1
def test_display_file(self, temp_filesystem):
"""Test displaying file content."""
fs = temp_filesystem
# Display existing file
content = fs.display_file('results.md')
content = fs.display_file('todo.md')
assert content == '' # Default files are empty
# Display non-existent file
@@ -243,8 +233,8 @@ class TestFileSystem:
fs = temp_filesystem
# Read existing empty file
result = fs.read_file('results.md')
expected = 'Read from file results.md.\n<content>\n\n</content>'
result = fs.read_file('todo.md')
expected = 'Read from file todo.md.\n<content>\n\n</content>'
assert result == expected
# Read non-existent file
@@ -326,17 +316,6 @@ class TestFileSystem:
assert content1 == 'First extracted content'
assert content2 == 'Second extracted content'
async def test_describe_empty_files(self, temp_filesystem):
"""Test describing filesystem with empty files."""
fs = temp_filesystem
description = fs.describe()
# Should contain results.md but not todo.md (excluded from description)
assert 'results.md' in description
assert 'todo.md' not in description
assert '[empty file]' in description
async def test_describe_with_content(self, temp_filesystem):
"""Test describing filesystem with files containing content."""
fs = temp_filesystem
@@ -392,15 +371,8 @@ class TestFileSystem:
assert isinstance(state, FileSystemState)
assert state.base_dir == str(fs.base_dir)
assert state.extracted_content_count == 0
assert 'results.md' in state.files
assert 'todo.md' in state.files
# Check file data structure
results_data = state.files['results.md']
assert results_data['type'] == 'MarkdownFile'
assert 'data' in results_data
assert results_data['data']['name'] == 'results'
async def test_from_state(self, temp_filesystem):
"""Test restoring filesystem from state."""
fs = temp_filesystem
@@ -503,7 +475,6 @@ class TestFileSystemEdgeCases:
# Custom file should be gone, default files should exist
assert not custom_file.exists()
assert (fs2.data_dir / 'results.md').exists()
assert (fs2.data_dir / 'todo.md').exists()
fs2.nuke()