diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml
index f799e6b54..0ce5a61e2 100644
--- a/.github/workflows/eval.yaml
+++ b/.github/workflows/eval.yaml
@@ -7,7 +7,9 @@ on:
jobs:
run_evaluation:
- runs-on: ubuntu-latest
+ runs-on:
+ group: eval
+ labels: eval-2-core-500
timeout-minutes: 360
env:
IN_DOCKER: 'true'
@@ -104,6 +106,13 @@ jobs:
ps aux | wc -l
echo "================================="
+ - name: Construct GitHub Workflow URL
+   id: github_url
+   run: |
+     # Build a link back to this workflow run; it is exposed as a step output
+     # (consumed later as --github-workflow-url) and surfaced as a notice annotation.
+     GITHUB_WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+     echo "GITHUB_WORKFLOW_URL=$GITHUB_WORKFLOW_URL" >> "$GITHUB_OUTPUT"
+     echo "::notice title=Workflow URL::Workflow URL: $GITHUB_WORKFLOW_URL"
+
- name: Construct eval command
id: eval_command
run: |
@@ -216,6 +225,9 @@ jobs:
[[ -n "$TASK_TEXT" ]] && CMD_ARGS+=("--task-text" "$TASK_TEXT")
[[ -n "$TASK_WEBSITE" ]] && CMD_ARGS+=("--task-website" "$TASK_WEBSITE")
+ # Add GitHub workflow URL
+ [[ -n "${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}" ]] && CMD_ARGS+=("--github-workflow-url" "${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}")
+
# Convert array to command string with proper escaping
printf -v CMD_STRING '%q ' "${CMD_ARGS[@]}"
diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py
index 32c541d97..b48d1aff1 100644
--- a/browser_use/agent/message_manager/service.py
+++ b/browser_use/agent/message_manager/service.py
@@ -4,8 +4,7 @@ import json
import logging
from browser_use.agent.message_manager.views import (
- MessageMetadata,
- SupportedMessageTypes,
+ HistoryItem,
)
from browser_use.agent.prompts import AgentMessagePrompt
from browser_use.agent.views import (
@@ -106,6 +105,7 @@ class MessageManager:
include_attributes: list[str] | None = None,
message_context: str | None = None,
sensitive_data: dict[str, str | dict[str, str]] | None = None,
+ max_history_items: int | None = None,
):
self.task = task
self.state = state
@@ -114,6 +114,9 @@ class MessageManager:
self.sensitive_data_description = ''
self.available_file_paths = available_file_paths
self.use_thinking = use_thinking
+ self.max_history_items = max_history_items
+
+ assert max_history_items is None or max_history_items > 5, 'max_history_items must be None or greater than 5'
# Store settings as direct attributes instead of in a settings object
self.include_attributes = include_attributes or []
@@ -124,16 +127,45 @@ class MessageManager:
if len(self.state.history.messages) == 0:
self._init_messages()
+ @property
+ def agent_history_description(self) -> str:
+     """Render the agent history as one newline-joined string, honouring `max_history_items`.
+
+     With no limit, or when the history fits within the limit, every item is rendered.
+     Otherwise the first item is kept, followed by an omission marker and the most
+     recent `max_history_items - 1` items. The marker does not count against the limit.
+     """
+     items = self.state.agent_history_items
+     limit = self.max_history_items
+
+     if limit is None or len(items) <= limit:
+         return '\n'.join(item.to_string() for item in items)
+
+     # One slot is reserved for the first item. Guard the slice explicitly:
+     # a count of 0 would make `items[-0:]` return the whole list, not nothing.
+     recent_items_count = limit - 1
+     recent_items = items[-recent_items_count:] if recent_items_count > 0 else []
+
+     # Everything that is neither the first item nor part of the recent tail is omitted.
+     omitted_count = len(items) - 1 - len(recent_items)
+
+     return '\n'.join(
+         [
+             items[0].to_string(),  # Keep first item (initialization)
+             f'[... {omitted_count} previous steps omitted...]',
+             *(item.to_string() for item in recent_items),
+         ]
+     )
+
+
def _init_messages(self) -> None:
"""Initialize the message history with system message, context, task, and other initial messages"""
- self._add_message_with_type(self.system_prompt, message_type='init')
+ self._add_message_with_type(self.system_prompt)
placeholder_message = UserMessage(
content='\nHere is an example output of thinking and tool call. You can use it as a reference but do not copy it exactly.',
cache=True,
)
# placeholder_message = HumanMessage(content='Example output:')
- self._add_message_with_type(placeholder_message, message_type='init')
+ self._add_message_with_type(placeholder_message)
# Create base example content
example_content = {
@@ -173,18 +205,18 @@ After writing todo.md, I can also initialize a github.md file to accumulate the
The file system actions do not change the browser state, so I can also click on the bytedance/UI-TARS-desktop (index [4]) to start collecting information."""
example_tool_call_1 = AssistantMessage(content=json.dumps(example_content), cache=True)
- self._add_message_with_type(example_tool_call_1, message_type='init')
+ self._add_message_with_type(example_tool_call_1)
self._add_message_with_type(
UserMessage(
content='Data written to todo.md.\nData written to github.md.\nClicked element with index 4.\n',
cache=True,
),
- message_type='init',
)
def add_new_task(self, new_task: str) -> None:
+ """Replace the current task and record the change in the agent history."""
self.task = new_task
- self.state.agent_history_description += f'\nUser updated to: {new_task}\n'
+ # Record the update as a system-message history item appended to the state's history list.
+ task_update_item = HistoryItem(system_message=f'User updated to: {new_task}')
+ self.state.agent_history_items.append(task_update_item)
def _update_agent_history_description(
self,
@@ -196,7 +228,7 @@ The file system actions do not change the browser state, so I can also click on
if result is None:
result = []
- step_number = step_info.step_number if step_info else 'unknown'
+ step_number = step_info.step_number if step_info else None
self.state.read_state_description = ''
@@ -220,23 +252,23 @@ The file system actions do not change the browser state, so I can also click on
if action_results:
action_results = f'Action Results:\n{action_results}'
- action_results = action_results.strip('\n')
+ action_results = action_results.strip('\n') if action_results else None
- # Handle case where model_output is None (e.g., parsing failed)
+ # Build the history item
if model_output is None:
- if isinstance(step_number, int) and step_number > 0:
- self.state.agent_history_description += f"""
-Agent failed to output in the right format.
-
-"""
+ # Only add error history item if we have a valid step number
+ if step_number is not None and step_number > 0:
+ history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.')
+ self.state.agent_history_items.append(history_item)
else:
- self.state.agent_history_description += f"""
-Evaluation of Previous Step: {model_output.current_state.evaluation_previous_goal}
-Memory: {model_output.current_state.memory}
-Next Goal: {model_output.current_state.next_goal}
-{action_results}
-
-"""
+ history_item = HistoryItem(
+ step_number=step_number,
+ evaluation_previous_goal=model_output.current_state.evaluation_previous_goal,
+ memory=model_output.current_state.memory,
+ next_goal=model_output.current_state.next_goal,
+ action_results=action_results,
+ )
+ self.state.agent_history_items.append(history_item)
def _get_sensitive_data_description(self, current_page_url) -> str:
sensitive_data = self.sensitive_data
@@ -284,7 +316,7 @@ Next Goal: {model_output.current_state.next_goal}
state_message = AgentMessagePrompt(
browser_state_summary=browser_state_summary,
file_system=self.file_system,
- agent_history_description=self.state.agent_history_description,
+ agent_history_description=self.agent_history_description,
read_state_description=self.state.read_state_description,
task=self.task,
include_attributes=self.include_attributes,
@@ -346,16 +378,15 @@ Next Goal: {model_output.current_state.next_goal}
# Log message history for debugging
logger.debug(self._log_history_lines())
- self.last_input_messages = [m.message for m in self.state.history.messages]
+ self.last_input_messages = list(self.state.history.messages)
return self.last_input_messages
def _add_message_with_type(
self,
message: BaseMessage,
position: int | None = None,
- message_type: SupportedMessageTypes | None = None,
) -> None:
- """Add message with token count metadata
+ """Add message to history
position: None for last, -1 for second last, etc.
"""
@@ -363,8 +394,7 @@ Next Goal: {model_output.current_state.next_goal}
if self.sensitive_data:
message = self._filter_sensitive_data(message)
- metadata = MessageMetadata(message_type=message_type)
- self.state.history.add_message(message, metadata, position)
+ self.state.history.add_message(message, position)
@time_execution_sync('--filter_sensitive_data')
def _filter_sensitive_data(self, message: BaseMessage) -> BaseMessage:
diff --git a/browser_use/agent/message_manager/views.py b/browser_use/agent/message_manager/views.py
index 9e605fa34..351a938f2 100644
--- a/browser_use/agent/message_manager/views.py
+++ b/browser_use/agent/message_manager/views.py
@@ -1,6 +1,6 @@
from __future__ import annotations
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING
from pydantic import BaseModel, ConfigDict, Field
@@ -13,43 +13,74 @@ if TYPE_CHECKING:
pass
-SupportedMessageTypes = Literal['init', 'memory']
+class HistoryItem(BaseModel):
+    """A single entry of the agent history together with its text rendering.
+
+    `to_string` renders an item in one of three ways: as an error entry when
+    `error` is non-empty, as a system message when `system_message` is non-empty,
+    or otherwise as a regular step entry built from `evaluation_previous_goal`,
+    `memory`, `next_goal` and the optional `action_results`.
+    `error` and `system_message` may not both be provided.
+    """
-
-class MessageMetadata(BaseModel):
- """Metadata for a message"""
-
- message_type: SupportedMessageTypes | None = None
-
-
-class ManagedMessage(BaseModel):
- """A message with its metadata"""
-
- message: BaseMessage
- metadata: MessageMetadata = Field(default_factory=MessageMetadata)
-
-
-class MessageHistory(BaseModel):
- """History of messages with metadata"""
-
- messages: list[ManagedMessage] = Field(default_factory=list)
+    # Step this entry belongs to; rendered as `step_unknown` when None.
+    step_number: int | None = None
+    evaluation_previous_goal: str | None = None
+    memory: str | None = None
+    next_goal: str | None = None
+    # Already-formatted action results text; appended after the goal lines when non-empty.
+    action_results: str | None = None
+    error: str | None = None
+    system_message: str | None = None
model_config = ConfigDict(arbitrary_types_allowed=True)
- def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
- """Add message with metadata to history"""
- if position is None:
- self.messages.append(ManagedMessage(message=message, metadata=metadata))
+    def model_post_init(self, __context) -> None:
+        """Validate that error and system_message are not both provided.
+
+        Raises:
+            ValueError: If both `error` and `system_message` are set.
+        """
+        if self.error is not None and self.system_message is not None:
+            raise ValueError('Cannot have both error and system_message at the same time')
+
+    def to_string(self) -> str:
+        """Get the string representation of the history item.
+
+        Error and regular step entries are wrapped in a matching
+        `<step_N>` ... `</step_N>` pair (`step_unknown` when `step_number` is None).
+        """
+        step_str = f'step_{self.step_number}' if self.step_number is not None else 'step_unknown'
+
+        if self.error:
+            # The closing tag needs the leading `</` to balance the opening tag.
+            return f'<{step_str}>\n{self.error}\n</{step_str}>'
+        elif self.system_message:
+            # NOTE(review): the system prompt says system messages are wrapped in a tag,
+            # but none is emitted here - confirm the intended tag name before adding one.
+            return f'\n{self.system_message}\n'
else:
- self.messages.insert(position, ManagedMessage(message=message, metadata=metadata))
+            content_parts = [
+                f'Evaluation of Previous Step: {self.evaluation_previous_goal}',
+                f'Memory: {self.memory}',
+                f'Next Goal: {self.next_goal}',
+            ]
+
+            if self.action_results:
+                content_parts.append(self.action_results)
+
+            content = '\n'.join(content_parts)
+
+            return f'<{step_str}>\n{content}\n</{step_str}>'
+
+
+class MessageHistory(BaseModel):
+    """Ordered history of messages, stored directly as a list of `BaseMessage`."""
+
+    messages: list[BaseMessage] = Field(default_factory=list)
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def add_message(self, message: BaseMessage, position: int | None = None) -> None:
+        """Add message to history.
+
+        `position=None` appends; any other value is passed to `list.insert`.
+        """
+        if position is None:
+            self.messages.append(message)
+        else:
+            self.messages.insert(position, message)
def get_messages(self) -> list[BaseMessage]:
"""Get all messages"""
- return [m.message for m in self.messages]
+        # Return a copy, as the previous implementation did, so callers that
+        # mutate the result cannot alter the stored history.
+        return list(self.messages)
def remove_last_state_message(self) -> None:
"""Remove last state message from history"""
- if len(self.messages) > 2 and isinstance(self.messages[-1].message, UserMessage):
+        # Only pop when more than two messages exist and the last one is a UserMessage.
+        if len(self.messages) > 2 and isinstance(self.messages[-1], UserMessage):
self.messages.pop()
@@ -58,7 +89,9 @@ class MessageManagerState(BaseModel):
history: MessageHistory = Field(default_factory=MessageHistory)
tool_id: int = 1
- agent_history_description: str = 'Agent initialized\n'
+ agent_history_items: list[HistoryItem] = Field(
+ default_factory=lambda: [HistoryItem(step_number=0, system_message='Agent initialized')]
+ )
read_state_description: str = ''
model_config = ConfigDict(arbitrary_types_allowed=True)
diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py
index d5120923e..e78115fc4 100644
--- a/browser_use/agent/service.py
+++ b/browser_use/agent/service.py
@@ -169,6 +169,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
],
max_actions_per_step: int = 10,
use_thinking: bool = True,
+ max_history_items: int = 40,
page_extraction_llm: BaseChatModel | None = None,
planner_llm: BaseChatModel | None = None,
planner_interval: int = 1, # Run planner every N steps
@@ -235,12 +236,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
available_file_paths=available_file_paths,
include_attributes=include_attributes,
max_actions_per_step=max_actions_per_step,
+ use_thinking=use_thinking,
+ max_history_items=max_history_items,
page_extraction_llm=page_extraction_llm,
planner_llm=planner_llm,
planner_interval=planner_interval,
is_planner_reasoning=is_planner_reasoning,
extend_planner_system_message=extend_planner_system_message,
- use_thinking=use_thinking,
calculate_cost=calculate_cost,
)
@@ -318,6 +320,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
include_attributes=self.settings.include_attributes,
message_context=self.settings.message_context,
sensitive_data=sensitive_data,
+ max_history_items=self.settings.max_history_items,
)
if isinstance(browser, BrowserSession):
diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
index 6637facfc..08feb8cfa 100644
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -34,7 +34,7 @@ Next Goal: Your goal for this step
Action Results: Your actions and their results
-and system messages wrapped in tag.
+and system messages wrapped in tag.
@@ -56,12 +56,12 @@ Interactive Elements: All interactive elements will be provided in format as [in
Examples:
[33]
User form
-\t*[35]*
+\t[35]
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements with \* are new elements that were added after the previous step (if url has not changed)
+- Elements tagged with are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Pure text elements without [] are not interactive.
@@ -90,15 +90,12 @@ Strictly follow these rules while using the browser and navigating the web:
- You have access to a persistent file system which you can use to track progress, store results, and manage long tasks.
-- Your file system is initialized with two files:
- 1. `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). The contents of this file will be also visible in your state. ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
- 2. `results.md`: Use this to accumulate extracted or generated results for the user. Append each new finding clearly and avoid duplication. This file serves as your output log.
-- You can read, write, and append to files.
+- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
- Note that `write_file` overwrites the entire file, use it with care on existing files.
- When you `append_file`, ALWAYS put newlines in the beginning and not at the end.
-- If the file is too large, you are only given a preview of your file. Use read_file to see the full content if necessary.
-- Always use the file system as the source of truth. Do not rely on memory alone for tracking task state.
-- If exists, includes files you have downloaded or uploaded by the user. You DON'T HAVE write access to these files. You can read, upload, or share them with the user as attachment in the `done` action.
+- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary.
+- If exists, includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
+- If the task is really long, initialize a `results.md` file to accumulate your results.
- DO NOT use the file system if the task is less than 5 steps!
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md
index 36d2c8698..22d066d8a 100644
--- a/browser_use/agent/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompt_no_thinking.md
@@ -34,7 +34,7 @@ Next Goal: Your goal for this step
Action Results: Your actions and their results
-and system messages wrapped in tag.
+and system messages wrapped in tag.
@@ -56,12 +56,12 @@ Interactive Elements: All interactive elements will be provided in format as [in
Examples:
[33]