Merge branch 'main' into feat/evals-anchor-support

This commit is contained in:
Aitor
2025-07-02 14:23:45 +02:00
committed by GitHub
19 changed files with 307 additions and 163 deletions

View File

@@ -7,7 +7,9 @@ on:
jobs:
run_evaluation:
runs-on: ubuntu-latest
runs-on:
group: eval
labels: eval-2-core-500
timeout-minutes: 360
env:
IN_DOCKER: 'true'
@@ -104,6 +106,13 @@ jobs:
ps aux | wc -l
echo "================================="
- name: Construct GitHub Workflow URL
id: github_url
run: |
GITHUB_WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
echo "GITHUB_WORKFLOW_URL=$GITHUB_WORKFLOW_URL" >> $GITHUB_OUTPUT
echo "::notice title=Workflow URL::Workflow URL: $GITHUB_WORKFLOW_URL"
- name: Construct eval command
id: eval_command
run: |
@@ -216,6 +225,9 @@ jobs:
[[ -n "$TASK_TEXT" ]] && CMD_ARGS+=("--task-text" "$TASK_TEXT")
[[ -n "$TASK_WEBSITE" ]] && CMD_ARGS+=("--task-website" "$TASK_WEBSITE")
# Add GitHub workflow URL
[[ -n "${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}" ]] && CMD_ARGS+=("--github-workflow-url" "${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}")
# Convert array to command string with proper escaping
printf -v CMD_STRING '%q ' "${CMD_ARGS[@]}"

View File

@@ -4,8 +4,7 @@ import json
import logging
from browser_use.agent.message_manager.views import (
MessageMetadata,
SupportedMessageTypes,
HistoryItem,
)
from browser_use.agent.prompts import AgentMessagePrompt
from browser_use.agent.views import (
@@ -106,6 +105,7 @@ class MessageManager:
include_attributes: list[str] | None = None,
message_context: str | None = None,
sensitive_data: dict[str, str | dict[str, str]] | None = None,
max_history_items: int | None = None,
):
self.task = task
self.state = state
@@ -114,6 +114,9 @@ class MessageManager:
self.sensitive_data_description = ''
self.available_file_paths = available_file_paths
self.use_thinking = use_thinking
self.max_history_items = max_history_items
assert max_history_items is None or max_history_items > 5, 'max_history_items must be None or greater than 5'
# Store settings as direct attributes instead of in a settings object
self.include_attributes = include_attributes or []
@@ -124,16 +127,45 @@ class MessageManager:
if len(self.state.history.messages) == 0:
self._init_messages()
@property
def agent_history_description(self) -> str:
"""Build agent history description from list of items, respecting max_history_items limit"""
if self.max_history_items is None:
# Include all items
return '\n'.join(item.to_string() for item in self.state.agent_history_items)
total_items = len(self.state.agent_history_items)
# If we have fewer items than the limit, just return all items
if total_items <= self.max_history_items:
return '\n'.join(item.to_string() for item in self.state.agent_history_items)
# We have more items than the limit, so we need to omit some
omitted_count = total_items - self.max_history_items
# Show first item + omitted message + most recent (max_history_items - 1) items
# The omitted message doesn't count against the limit, only real history items do
recent_items_count = self.max_history_items - 1 # -1 for first item
items_to_include = [
self.state.agent_history_items[0].to_string(), # Keep first item (initialization)
f'<sys>[... {omitted_count} previous steps omitted...]</sys>',
]
# Add most recent items
items_to_include.extend([item.to_string() for item in self.state.agent_history_items[-recent_items_count:]])
return '\n'.join(items_to_include)
def _init_messages(self) -> None:
"""Initialize the message history with system message, context, task, and other initial messages"""
self._add_message_with_type(self.system_prompt, message_type='init')
self._add_message_with_type(self.system_prompt)
placeholder_message = UserMessage(
content='<example_1>\nHere is an example output of thinking and tool call. You can use it as a reference but do not copy it exactly.',
cache=True,
)
# placeholder_message = HumanMessage(content='Example output:')
self._add_message_with_type(placeholder_message, message_type='init')
self._add_message_with_type(placeholder_message)
# Create base example content
example_content = {
@@ -173,18 +205,18 @@ After writing todo.md, I can also initialize a github.md file to accumulate the
The file system actions do not change the browser state, so I can also click on the bytedance/UI-TARS-desktop (index [4]) to start collecting information."""
example_tool_call_1 = AssistantMessage(content=json.dumps(example_content), cache=True)
self._add_message_with_type(example_tool_call_1, message_type='init')
self._add_message_with_type(example_tool_call_1)
self._add_message_with_type(
UserMessage(
content='Data written to todo.md.\nData written to github.md.\nClicked element with index 4.\n</example_1>',
cache=True,
),
message_type='init',
)
def add_new_task(self, new_task: str) -> None:
self.task = new_task
self.state.agent_history_description += f'\n<s>User updated <user_request> to: {new_task}</s>\n'
task_update_item = HistoryItem(system_message=f'User updated <user_request> to: {new_task}')
self.state.agent_history_items.append(task_update_item)
def _update_agent_history_description(
self,
@@ -196,7 +228,7 @@ The file system actions do not change the browser state, so I can also click on
if result is None:
result = []
step_number = step_info.step_number if step_info else 'unknown'
step_number = step_info.step_number if step_info else None
self.state.read_state_description = ''
@@ -220,23 +252,23 @@ The file system actions do not change the browser state, so I can also click on
if action_results:
action_results = f'Action Results:\n{action_results}'
action_results = action_results.strip('\n')
action_results = action_results.strip('\n') if action_results else None
# Handle case where model_output is None (e.g., parsing failed)
# Build the history item
if model_output is None:
if isinstance(step_number, int) and step_number > 0:
self.state.agent_history_description += f"""<step_{step_number}>
Agent failed to output in the right format.
</step_{step_number}>
"""
# Only add error history item if we have a valid step number
if step_number is not None and step_number > 0:
history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.')
self.state.agent_history_items.append(history_item)
else:
self.state.agent_history_description += f"""<step_{step_number}>
Evaluation of Previous Step: {model_output.current_state.evaluation_previous_goal}
Memory: {model_output.current_state.memory}
Next Goal: {model_output.current_state.next_goal}
{action_results}
</step_{step_number}>
"""
history_item = HistoryItem(
step_number=step_number,
evaluation_previous_goal=model_output.current_state.evaluation_previous_goal,
memory=model_output.current_state.memory,
next_goal=model_output.current_state.next_goal,
action_results=action_results,
)
self.state.agent_history_items.append(history_item)
def _get_sensitive_data_description(self, current_page_url) -> str:
sensitive_data = self.sensitive_data
@@ -284,7 +316,7 @@ Next Goal: {model_output.current_state.next_goal}
state_message = AgentMessagePrompt(
browser_state_summary=browser_state_summary,
file_system=self.file_system,
agent_history_description=self.state.agent_history_description,
agent_history_description=self.agent_history_description,
read_state_description=self.state.read_state_description,
task=self.task,
include_attributes=self.include_attributes,
@@ -346,16 +378,15 @@ Next Goal: {model_output.current_state.next_goal}
# Log message history for debugging
logger.debug(self._log_history_lines())
self.last_input_messages = [m.message for m in self.state.history.messages]
self.last_input_messages = list(self.state.history.messages)
return self.last_input_messages
def _add_message_with_type(
self,
message: BaseMessage,
position: int | None = None,
message_type: SupportedMessageTypes | None = None,
) -> None:
"""Add message with token count metadata
"""Add message to history
position: None for last, -1 for second last, etc.
"""
@@ -363,8 +394,7 @@ Next Goal: {model_output.current_state.next_goal}
if self.sensitive_data:
message = self._filter_sensitive_data(message)
metadata = MessageMetadata(message_type=message_type)
self.state.history.add_message(message, metadata, position)
self.state.history.add_message(message, position)
@time_execution_sync('--filter_sensitive_data')
def _filter_sensitive_data(self, message: BaseMessage) -> BaseMessage:

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING
from pydantic import BaseModel, ConfigDict, Field
@@ -13,43 +13,74 @@ if TYPE_CHECKING:
pass
SupportedMessageTypes = Literal['init', 'memory']
class HistoryItem(BaseModel):
"""Represents a single agent history item with its data and string representation"""
class MessageMetadata(BaseModel):
"""Metadata for a message"""
message_type: SupportedMessageTypes | None = None
class ManagedMessage(BaseModel):
"""A message with its metadata"""
message: BaseMessage
metadata: MessageMetadata = Field(default_factory=MessageMetadata)
class MessageHistory(BaseModel):
"""History of messages with metadata"""
messages: list[ManagedMessage] = Field(default_factory=list)
step_number: int | None = None
evaluation_previous_goal: str | None = None
memory: str | None = None
next_goal: str | None = None
action_results: str | None = None
error: str | None = None
system_message: str | None = None
model_config = ConfigDict(arbitrary_types_allowed=True)
def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
"""Add message with metadata to history"""
if position is None:
self.messages.append(ManagedMessage(message=message, metadata=metadata))
def model_post_init(self, __context) -> None:
"""Validate that error and system_message are not both provided"""
if self.error is not None and self.system_message is not None:
raise ValueError('Cannot have both error and system_message at the same time')
def to_string(self) -> str:
"""Get string representation of the history item"""
step_str = f'step_{self.step_number}' if self.step_number is not None else 'step_unknown'
if self.error:
return f"""<{step_str}>
{self.error}
</{step_str}>"""
elif self.system_message:
return f"""<sys>
{self.system_message}
</sys>"""
else:
self.messages.insert(position, ManagedMessage(message=message, metadata=metadata))
content_parts = [
f'Evaluation of Previous Step: {self.evaluation_previous_goal}',
f'Memory: {self.memory}',
f'Next Goal: {self.next_goal}',
]
if self.action_results:
content_parts.append(self.action_results)
content = '\n'.join(content_parts)
return f"""<{step_str}>
{content}
</{step_str}>"""
class MessageHistory(BaseModel):
"""History of messages"""
messages: list[BaseMessage] = Field(default_factory=list)
model_config = ConfigDict(arbitrary_types_allowed=True)
def add_message(self, message: BaseMessage, position: int | None = None) -> None:
"""Add message to history"""
if position is None:
self.messages.append(message)
else:
self.messages.insert(position, message)
def get_messages(self) -> list[BaseMessage]:
"""Get all messages"""
return [m.message for m in self.messages]
return self.messages
def remove_last_state_message(self) -> None:
"""Remove last state message from history"""
if len(self.messages) > 2 and isinstance(self.messages[-1].message, UserMessage):
if len(self.messages) > 2 and isinstance(self.messages[-1], UserMessage):
self.messages.pop()
@@ -58,7 +89,9 @@ class MessageManagerState(BaseModel):
history: MessageHistory = Field(default_factory=MessageHistory)
tool_id: int = 1
agent_history_description: str = '<s>Agent initialized</s>\n'
agent_history_items: list[HistoryItem] = Field(
default_factory=lambda: [HistoryItem(step_number=0, system_message='Agent initialized')]
)
read_state_description: str = ''
model_config = ConfigDict(arbitrary_types_allowed=True)

View File

@@ -169,6 +169,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
],
max_actions_per_step: int = 10,
use_thinking: bool = True,
max_history_items: int = 40,
page_extraction_llm: BaseChatModel | None = None,
planner_llm: BaseChatModel | None = None,
planner_interval: int = 1, # Run planner every N steps
@@ -235,12 +236,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
available_file_paths=available_file_paths,
include_attributes=include_attributes,
max_actions_per_step=max_actions_per_step,
use_thinking=use_thinking,
max_history_items=max_history_items,
page_extraction_llm=page_extraction_llm,
planner_llm=planner_llm,
planner_interval=planner_interval,
is_planner_reasoning=is_planner_reasoning,
extend_planner_system_message=extend_planner_system_message,
use_thinking=use_thinking,
calculate_cost=calculate_cost,
)
@@ -318,6 +320,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
include_attributes=self.settings.include_attributes,
message_context=self.settings.message_context,
sensitive_data=sensitive_data,
max_history_items=self.settings.max_history_items,
)
if isinstance(browser, BrowserSession):

View File

@@ -34,7 +34,7 @@ Next Goal: Your goal for this step
Action Results: Your actions and their results
</step_{{step_number}}>
and system messages wrapped in <s> tag.
and system messages wrapped in <sys> tag.
</agent_history>
<user_request>
@@ -56,12 +56,12 @@ Interactive Elements: All interactive elements will be provided in format as [in
Examples:
[33]<div>User form</div>
\t*[35]*<button aria-label='Submit form'>Submit</button>
\t<new>[35]</new><button aria-label='Submit form'>Submit</button>
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements with \* are new elements that were added after the previous step (if url has not changed)
- Elements tagged with <new> are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Pure text elements without [] are not interactive.
</browser_state>
@@ -90,15 +90,12 @@ Strictly follow these rules while using the browser and navigating the web:
<file_system>
- You have access to a persistent file system which you can use to track progress, store results, and manage long tasks.
- Your file system is initialized with two files:
1. `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). The contents of this file will be also visible in your state. ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
2. `results.md`: Use this to accumulate extracted or generated results for the user. Append each new finding clearly and avoid duplication. This file serves as your output log.
- You can read, write, and append to files.
- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
- Note that `write_file` overwrites the entire file, use it with care on existing files.
- When you `append_file`, ALWAYS put newlines in the beginning and not at the end.
- If the file is too large, you are only given a preview of your file. Use read_file to see the full content if necessary.
- Always use the file system as the source of truth. Do not rely on memory alone for tracking task state.
- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You DON'T HAVE write access to these files. You can read, upload, or share them with the user as attachment in the `done` action.
- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary.
- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
- If the task is really long, initialize a `results.md` file to accumulate your results.
- DO NOT use the file system if the task is less than 5 steps!
</file_system>

View File

@@ -34,7 +34,7 @@ Next Goal: Your goal for this step
Action Results: Your actions and their results
</step_{{step_number}}>
and system messages wrapped in <s> tag.
and system messages wrapped in <sys> tag.
</agent_history>
<user_request>
@@ -56,12 +56,12 @@ Interactive Elements: All interactive elements will be provided in format as [in
Examples:
[33]<div>User form</div>
\t*[35]*<button aria-label='Submit form'>Submit</button>
\t<new>[35]</new><button aria-label='Submit form'>Submit</button>
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements with \* are new elements that were added after the previous step (if url has not changed)
- Elements tagged with <new> are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Pure text elements without [] are not interactive.
</browser_state>
@@ -90,15 +90,12 @@ Strictly follow these rules while using the browser and navigating the web:
<file_system>
- You have access to a persistent file system which you can use to track progress, store results, and manage long tasks.
- Your file system is initialized with two files:
1. `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). The contents of this file will be also visible in your state. ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
2. `results.md`: Use this to accumulate extracted or generated results for the user. Append each new finding clearly and avoid duplication. This file serves as your output log.
- You can read, write, and append to files.
- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
- Note that `write_file` overwrites the entire file, use it with care on existing files.
- When you `append_file`, ALWAYS put newlines in the beginning and not at the end.
- If the file is too large, you are only given a preview of your file. Use read_file to see the full content if necessary.
- Always use the file system as the source of truth. Do not rely on memory alone for tracking task state.
- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary.
- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
- If the task is really long, initialize a `results.md` file to accumulate your results.
- DO NOT use the file system if the task is less than 5 steps!
</file_system>

View File

@@ -54,6 +54,7 @@ class AgentSettings(BaseModel):
]
max_actions_per_step: int = 10
use_thinking: bool = True
max_history_items: int = 40
page_extraction_llm: BaseChatModel | None = None
planner_llm: BaseChatModel | None = None

View File

@@ -25,7 +25,7 @@ os.environ['PW_TEST_SCREENSHOT_NO_FONTS_READY'] = '1' # https://github.com/micr
import anyio
import psutil
from playwright._impl._api_structures import FloatRect, ViewportSize
from playwright._impl._api_structures import ViewportSize
from pydantic import AliasChoices, BaseModel, ConfigDict, Field, InstanceOf, PrivateAttr, model_validator
from uuid_extensions import uuid7str
@@ -248,12 +248,12 @@ class BrowserSession(BaseModel):
return self._logger
def __repr__(self) -> str:
is_copy = '©' if self._original_browser_session else '1'
return f'BrowserSession🆂 {self.id[-4:]}{is_copy}{str(id(self))[-2:]} ({self._connection_str}, profile={self.browser_profile})'
is_copy = '©' if self._original_browser_session else '#'
return f'BrowserSession🆂 {self.id[-4:]} {is_copy}{str(id(self))[-2:]} ({self._connection_str}, profile={self.browser_profile})'
def __str__(self) -> str:
is_copy = '©' if self._original_browser_session else '1'
return f'BrowserSession🆂 {self.id[-4:]}{is_copy}{str(id(self))[-2:]} 🅟 {str(id(self.agent_current_page))[-2:]}'
is_copy = '©' if self._original_browser_session else '#'
return f'BrowserSession🆂 {self.id[-4:]} {is_copy}{str(id(self))[-2:]} 🅟 {str(id(self.agent_current_page))[-2:]}'
# better to force people to get it from the right object, "only one way to do it" is better python
# def __getattr__(self, key: str) -> Any:
@@ -725,7 +725,7 @@ class BrowserSession(BaseModel):
full_page=False,
# scale='css',
timeout=self.browser_profile.default_timeout or 30000,
clip=FloatRect(**clip) if clip else None,
# clip=FloatRect(**clip) if clip else None,
animations='allow',
caret='initial',
)
@@ -2714,27 +2714,28 @@ class BrowserSession(BaseModel):
# This prevents timeouts on very long pages
# 1. Get current viewport and page dimensions including scroll position
dimensions = await page.evaluate("""() => {
return {
width: window.innerWidth,
height: window.innerHeight,
pageHeight: document.documentElement.scrollHeight,
devicePixelRatio: window.devicePixelRatio || 1,
scrollX: window.pageXOffset || document.documentElement.scrollLeft || 0,
scrollY: window.pageYOffset || document.documentElement.scrollTop || 0
};
}""")
# dimensions = await page.evaluate("""() => {
# return {
# width: window.innerWidth,
# height: window.innerHeight,
# pageWidth: document.documentElement.scrollWidth,
# pageHeight: document.documentElement.scrollHeight,
# devicePixelRatio: window.devicePixelRatio || 1,
# scrollX: window.pageXOffset || document.documentElement.scrollLeft || 0,
# scrollY: window.pageYOffset || document.documentElement.scrollTop || 0
# };
# }""")
# When full_page=False, screenshot captures the current viewport
# The clip parameter uses viewport coordinates (0,0 is top-left of viewport)
# We just need to ensure the clip dimensions don't exceed our maximums
# clip_width = min(dimensions['width'], MAX_SCREENSHOT_WIDTH)
# clip_height = min(dimensions['height'], MAX_SCREENSHOT_HEIGHT)
# Take screenshot using our retry-decorated method
return await self._take_screenshot_hybrid(
page,
clip={
'x': dimensions['scrollX'],
'y': dimensions['scrollY'],
'width': min(dimensions['width'], MAX_SCREENSHOT_WIDTH),
'height': min(dimensions['height'], MAX_SCREENSHOT_HEIGHT),
},
)
# Don't pass clip parameter - let Playwright capture the full viewport
# It will automatically handle cases where viewport extends beyond page content
return await self._take_screenshot_hybrid(page)
except Exception as e:
self.logger.error(f'❌ Failed to take screenshot after retries: {type(e).__name__}: {e}')
raise

View File

@@ -855,8 +855,8 @@ class BrowserUseApp(App):
# Extract original task(s)
original_tasks = []
for msg in message_history:
if hasattr(msg, 'message') and hasattr(msg.message, 'content'):
content = msg.message.content
if hasattr(msg, 'content'):
content = msg.content
if isinstance(content, str) and 'Your ultimate task is:' in content:
task_text = content.split('"""')[1].strip()
original_tasks.append(task_text)

View File

@@ -331,10 +331,12 @@ class Controller(Generic[Context]):
@self.registry.action(
"""Extract structured, semantic data (e.g. product description, price, all information about XYZ) from the current webpage based on a textual query.
Only use this for extracting info from a single product/article page, not for entire listings or search results pages.
Set extract_links=True ONLY if your query requires extracting links/URLs from the page.
""",
)
async def extract_structured_data(
query: str,
extract_links: bool,
page: Page,
page_extraction_llm: BaseChatModel,
file_system: FileSystem,
@@ -344,13 +346,8 @@ Only use this for extracting info from a single product/article page, not for en
import markdownify
strip = []
include_links = False
lower_query = query.lower()
url_keywords = ['url', 'links']
if any(keyword in lower_query for keyword in url_keywords):
include_links = True
if not include_links:
if not extract_links:
strip = ['a', 'img']
# Run markdownify in a thread pool to avoid blocking the event loop

View File

@@ -15,13 +15,11 @@ class GoToUrlAction(BaseModel):
class ClickElementAction(BaseModel):
index: int
xpath: str | None = None
class InputTextAction(BaseModel):
index: int
text: str
xpath: str | None = None
class DoneAction(BaseModel):

View File

@@ -195,7 +195,7 @@ class DOMElementNode(DOMBaseNode):
# Build the line
if node.is_new:
highlight_indicator = f'*[{node.highlight_index}]*'
highlight_indicator = f'<new>[{node.highlight_index}]</new>'
else:
highlight_indicator = f'[{node.highlight_index}]'

View File

@@ -124,7 +124,7 @@ class FileSystem:
self.files = {}
if create_default_files:
self.default_files = ['results.md', 'todo.md']
self.default_files = ['todo.md']
self._create_default_files()
self.extracted_content_count = 0

View File

@@ -6,6 +6,7 @@ import httpx
from openai import APIConnectionError, APIStatusError, AsyncOpenAI, RateLimitError
from openai.types.chat.chat_completion import ChatCompletion
from openai.types.shared.chat_model import ChatModel
from openai.types.shared_params.reasoning_effort import ReasoningEffort
from openai.types.shared_params.response_format_json_schema import JSONSchema, ResponseFormatJSONSchema
from pydantic import BaseModel
@@ -18,6 +19,8 @@ from browser_use.llm.views import ChatInvokeCompletion, ChatInvokeUsage
T = TypeVar('T', bound=BaseModel)
ReasoningModels: list[ChatModel | str] = ['o4-mini', 'o3', 'o3-mini', 'o1', 'o1-pro', 'o3-pro']
@dataclass
class ChatOpenAI(BaseChatModel):
@@ -33,6 +36,7 @@ class ChatOpenAI(BaseChatModel):
# Model params
temperature: float | None = None
reasoning_effort: ReasoningEffort = 'low'
# Client initialization parameters
api_key: str | None = None
@@ -132,10 +136,19 @@ class ChatOpenAI(BaseChatModel):
openai_messages = OpenAIMessageSerializer.serialize_messages(messages)
try:
reasoning_effort_dict: dict = {}
if self.model in ReasoningModels:
reasoning_effort_dict = {
'reasoning_effort': self.reasoning_effort,
}
if output_format is None:
# Return string response
response = await self.get_client().chat.completions.create(
model=self.model, messages=openai_messages, temperature=self.temperature
model=self.model,
messages=openai_messages,
temperature=self.temperature,
**reasoning_effort_dict,
)
usage = self._get_usage(response)
@@ -157,6 +170,7 @@ class ChatOpenAI(BaseChatModel):
messages=openai_messages,
temperature=self.temperature,
response_format=ResponseFormatJSONSchema(json_schema=response_format, type='json_schema'),
**reasoning_effort_dict,
)
if response.choices[0].message.content is None:

View File

@@ -578,6 +578,7 @@ class TaskResult:
task: Any
max_steps: int
laminar_link: str | None = None
github_workflow_url: str | None = None
completed_stages: set[Stage] = field(default_factory=set)
stage_data: dict[Stage, Any] = field(default_factory=dict)
errors: list = field(default_factory=list)
@@ -619,6 +620,7 @@ class TaskResult:
'critical_error': self.critical_error,
'server_save_failed': self.server_save_failed,
'laminarTaskLink': self.laminar_link,
'githubWorkflowUrl': self.github_workflow_url,
}
# Add task execution data if available
@@ -759,8 +761,8 @@ SUPPORTED_MODELS = {
'gemini-1.5-flash': {'provider': 'google', 'model_name': 'gemini-1.5-flash-latest', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.0-flash-lite': {'provider': 'google', 'model_name': 'gemini-2.0-flash-lite', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.0-flash': {'provider': 'google', 'model_name': 'gemini-2.0-flash', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.5-pro': {'provider': 'google', 'model_name': 'gemini-2.5-pro-preview-03-25', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.5-flash': {'provider': 'google', 'model_name': 'gemini-2.5-flash-latest', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.5-pro': {'provider': 'google', 'model_name': 'gemini-2.5-pro', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.5-flash': {'provider': 'google', 'model_name': 'gemini-2.5-flash', 'api_key_env': 'GEMINI_API_KEY'},
'gemini-2.5-pro-preview-05-06': {
'provider': 'google',
'model_name': 'gemini-2.5-pro-preview-05-06',
@@ -774,6 +776,7 @@ SUPPORTED_MODELS = {
# OpenAI
'gpt-4.1': {'provider': 'openai', 'model_name': 'gpt-4.1-2025-04-14', 'api_key_env': 'OPENAI_API_KEY'},
'gpt-4.1-mini': {'provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'api_key_env': 'OPENAI_API_KEY'},
'gpt-o3': {'provider': 'openai', 'model_name': 'o3-2025-04-16', 'api_key_env': 'OPENAI_API_KEY'},
'gpt-4.1-nano': {'provider': 'openai', 'model_name': 'gpt-4.1-nano-2025-04-14', 'api_key_env': 'OPENAI_API_KEY'},
'gpt-4o': {'provider': 'openai', 'model_name': 'gpt-4o', 'api_key_env': 'OPENAI_API_KEY'},
'gpt-4o-mini': {'provider': 'openai', 'model_name': 'gpt-4o-mini', 'api_key_env': 'OPENAI_API_KEY'},
@@ -924,7 +927,7 @@ def get_llm(model_name: str):
case 'openai':
kwargs = {'model': config['model_name'], 'temperature': 0.0}
# Must set temperatue=1 if model is gpt-o4-mini
if model_name == 'gpt-o4-mini':
if model_name in ['gpt-o4-mini', 'gpt-o3']:
kwargs['temperature'] = 1
if api_key:
kwargs['api_key'] = api_key
@@ -1289,8 +1292,9 @@ async def judge_task_result(model, task_folder: Path, score_threshold: float = 3
try:
# Run comprehensive judge evaluation
comprehensive_result = await evaluate_task_with_comprehensive_judge(
task_folder=task_folder, model=model, max_images=10
comprehensive_result = await asyncio.wait_for(
evaluate_task_with_comprehensive_judge(task_folder=task_folder, model=model, max_images=10),
timeout=180, # 3 minutes max for evaluation
)
if comprehensive_result.get('error'):
@@ -1649,6 +1653,7 @@ async def run_task_with_semaphore(
headless: bool,
use_vision: bool,
semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument
github_workflow_url: str | None = None,
use_serp: bool = False,
use_anchor: bool = False,
enable_memory: bool = False,
@@ -1721,7 +1726,9 @@ async def run_task_with_semaphore(
logger.debug(f'Task {task.task_id}: No Laminar run ID available, skipping datapoint creation')
# Initialize task result and basic setup
task_result = TaskResult(task.task_id, run_id, task.confirmed_task, task, max_steps_per_task, laminar_task_link)
task_result = TaskResult(
task.task_id, run_id, task.confirmed_task, task, max_steps_per_task, laminar_task_link, github_workflow_url
)
task_folder = Path(f'saved_trajectories/{task.task_id}')
@@ -1917,7 +1924,13 @@ async def run_task_with_semaphore(
# Create minimal task result for server reporting
try:
task_result = TaskResult(
task.task_id, run_id, task.confirmed_task, task, max_steps_per_task, laminar_task_link
task.task_id,
run_id,
task.confirmed_task,
task,
max_steps_per_task,
laminar_task_link,
github_workflow_url,
)
task_result.mark_critical_error(f'Initialization failed: {str(init_error)}')
except Exception as result_error:
@@ -1978,6 +1991,7 @@ async def run_multiple_tasks(
convex_url: str,
secret_key: str,
eval_model: BaseChatModel,
github_workflow_url: str | None = None,
max_parallel_runs: int = 3,
max_steps_per_task: int = 25,
start_index: int = 0,
@@ -2063,6 +2077,7 @@ async def run_multiple_tasks(
headless=headless,
use_vision=use_vision,
semaphore_runs=semaphore_runs, # Pass the semaphore
github_workflow_url=github_workflow_url,
use_serp=use_serp,
use_anchor=use_anchor,
enable_memory=enable_memory,
@@ -2326,6 +2341,7 @@ async def run_evaluation_pipeline(
convex_url: str,
secret_key: str,
eval_model: BaseChatModel,
github_workflow_url: str | None = None,
max_parallel_runs: int = 3,
max_steps_per_task: int = 25,
start_index: int = 0,
@@ -2379,6 +2395,7 @@ async def run_evaluation_pipeline(
convex_url=convex_url,
secret_key=secret_key,
eval_model=eval_model,
github_workflow_url=github_workflow_url,
max_parallel_runs=max_parallel_runs,
max_steps_per_task=max_steps_per_task,
start_index=start_index,
@@ -2463,6 +2480,7 @@ if __name__ == '__main__':
parser.add_argument('--use-mind2web-judge', action='store_true', help='Use original judge')
parser.add_argument('--no-thinking', action='store_true', help='Disable thinking in agent system prompt')
parser.add_argument('--use-anchor', action='store_true', help='Use anchor to navigate to the page')
parser.add_argument('--github-workflow-url', type=str, default=None, help='GitHub workflow URL for tracking')
# Single task mode arguments
parser.add_argument('--task-text', type=str, default=None, help='Task description for single task mode')
@@ -2705,6 +2723,7 @@ if __name__ == '__main__':
convex_url=convex_url,
secret_key=secret_key,
eval_model=eval_model,
github_workflow_url=args.github_workflow_url,
max_parallel_runs=parallel_runs,
max_steps_per_task=args.max_steps,
start_index=start_index,

View File

@@ -12,9 +12,16 @@ from browser_use import Agent
from browser_use.browser import BrowserProfile
from browser_use.llm import ChatOpenAI
try:
from lmnr import Laminar
Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY'))
except Exception as e:
print(f'Error initializing Laminar: {e}')
# Initialize the model
llm = ChatOpenAI(
model='gpt-4o',
model='gpt-4.1',
temperature=0.0,
)
# Simple case: the model will see x_name and x_password, but never the actual values.
@@ -35,7 +42,7 @@ sensitive_data: dict[str, str | dict[str, str]] = {
'https://*.google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'},
}
# Update task to use one of the credentials above
task = 'Go to example.com and login with company_username and company_password'
task = 'Go to google.com and put the login information in the search bar.'
# Always set allowed_domains when using sensitive_data for security
from browser_use.browser.session import BrowserSession

View File

@@ -17,7 +17,6 @@ dependencies = [
"google-api-core>=2.25.0",
"httpx>=0.28.1",
"markdownify==1.1.0",
"mem0ai>=0.1.106",
"patchright>=1.52.5",
"playwright>=1.52.0",
"portalocker>=2.7.0,<3.0.0",

View File

@@ -2,6 +2,7 @@
Test that screenshots work correctly in headless browser mode.
"""
import asyncio
import base64
from browser_use.browser import BrowserProfile, BrowserSession
@@ -193,7 +194,7 @@ class TestHeadlessScreenshots:
# Take screenshots from all sessions at the same time
print('Taking screenshots from all 10 sessions simultaneously...')
screenshot_tasks = [session.take_screenshot(full_page=True) for session in browser_sessions]
screenshot_tasks = [session.take_screenshot() for session in browser_sessions]
screenshots = await asyncio.gather(*screenshot_tasks)
# Verify all screenshots are valid
@@ -221,9 +222,7 @@ class TestHeadlessScreenshots:
# Also test taking regular (viewport) screenshots in parallel
print('Taking viewport screenshots from all sessions simultaneously...')
viewport_screenshots = await asyncio.gather(
*[session.take_screenshot(full_page=False) for session in browser_sessions]
)
viewport_screenshots = await asyncio.gather(*[session.take_screenshot() for session in browser_sessions])
# Verify viewport screenshots
for i, screenshot in enumerate(viewport_screenshots):
@@ -244,3 +243,69 @@ class TestHeadlessScreenshots:
for i, result in enumerate(results):
if isinstance(result, Exception):
print(f'Warning: Session {i} kill raised exception: {type(result).__name__}: {result}')
async def test_screenshot_at_bottom_of_page(self, httpserver):
    """Test screenshot capture when scrolled to bottom of page (regression test for clipping issue).

    Previously, taking a viewport screenshot while the page was scrolled to (or
    past) the bottom of a tall document could fail with "Clipped area is either
    empty or outside the resulting image". This test captures screenshots at the
    top, middle, bottom, and beyond-bottom scroll positions and asserts each one
    decodes to a non-trivial image.
    """
    # Fresh, ephemeral headless session: no user_data_dir so nothing persists,
    # keep_alive=False so the browser is torn down with the session.
    browser_session = BrowserSession(
        browser_profile=BrowserProfile(
            headless=True,
            user_data_dir=None,
            keep_alive=False,
        )
    )
    try:
        await browser_session.start()

        # Create a page with scrollable content: 3000px tall gradient with
        # text markers at the top, middle, and bottom so positions are distinguishable.
        httpserver.expect_request('/scrollable').respond_with_data(
            """<html>
<head><title>Scrollable Page Test</title></head>
<body style="margin: 0; padding: 0;">
<div style="height: 3000px; background: linear-gradient(to bottom, red, yellow, green, blue);">
<div style="position: absolute; top: 0; left: 10px; font-size: 24px;">Top of page</div>
<div style="position: absolute; top: 50%; left: 10px; font-size: 24px;">Middle of page</div>
<div style="position: absolute; bottom: 10px; left: 10px; font-size: 24px;">Bottom of page</div>
</div>
</body>
</html>""",
            content_type='text/html',
        )

        # Navigate to test page
        await browser_session.navigate(httpserver.url_for('/scrollable'))
        page = browser_session.agent_current_page
        assert page is not None

        # Test 1: Screenshot at top of page (should work)
        # The >5000-byte check on the decoded PNG guards against blank/degenerate captures.
        screenshot_top = await browser_session.take_screenshot()
        assert screenshot_top is not None
        assert len(base64.b64decode(screenshot_top)) > 5000

        # Test 2: Screenshot at middle of page
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight / 2)')
        await asyncio.sleep(0.1)  # Wait for scroll
        screenshot_middle = await browser_session.take_screenshot()
        assert screenshot_middle is not None
        assert len(base64.b64decode(screenshot_middle)) > 5000

        # Test 3: Screenshot at bottom of page (this was failing with clipping error)
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
        await asyncio.sleep(0.1)  # Wait for scroll

        # This should not raise "Clipped area is either empty or outside the resulting image" error
        screenshot_bottom = await browser_session.take_screenshot()
        assert screenshot_bottom is not None
        assert len(base64.b64decode(screenshot_bottom)) > 5000

        # Test 4: Screenshot when scrolled beyond page bottom (edge case —
        # browsers clamp the scroll, but the capture path must still cope)
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight + 1000)')
        await asyncio.sleep(0.1)
        screenshot_beyond = await browser_session.take_screenshot()
        assert screenshot_beyond is not None
        assert len(base64.b64decode(screenshot_beyond)) > 5000

        print('✅ All screenshot positions tested successfully!')
    finally:
        # Always release the browser, even when an assertion above fails.
        await browser_session.stop()

View File

@@ -138,14 +138,11 @@ class TestFileSystem:
assert fs.data_dir.name == DEFAULT_FILE_SYSTEM_PATH
# Check default files are created
assert 'results.md' in fs.files
assert 'todo.md' in fs.files
assert len(fs.files) == 2
assert len(fs.files) == 1
# Check files exist on disk
results_path = fs.data_dir / 'results.md'
todo_path = fs.data_dir / 'todo.md'
assert results_path.exists()
assert todo_path.exists()
def test_filesystem_without_default_files(self, empty_filesystem):
@@ -199,12 +196,6 @@ class TestFileSystem:
"""Test getting files from the filesystem."""
fs = temp_filesystem
# Get existing file
results_file = fs.get_file('results.md')
assert results_file is not None
assert isinstance(results_file, MarkdownFile)
assert results_file.name == 'results'
# Get non-existent file
non_existent = fs.get_file('nonexistent.md')
assert non_existent is None
@@ -218,16 +209,15 @@ class TestFileSystem:
fs = temp_filesystem
files = fs.list_files()
assert 'results.md' in files
assert 'todo.md' in files
assert len(files) == 2
assert len(files) == 1
def test_display_file(self, temp_filesystem):
"""Test displaying file content."""
fs = temp_filesystem
# Display existing file
content = fs.display_file('results.md')
content = fs.display_file('todo.md')
assert content == '' # Default files are empty
# Display non-existent file
@@ -243,8 +233,8 @@ class TestFileSystem:
fs = temp_filesystem
# Read existing empty file
result = fs.read_file('results.md')
expected = 'Read from file results.md.\n<content>\n\n</content>'
result = fs.read_file('todo.md')
expected = 'Read from file todo.md.\n<content>\n\n</content>'
assert result == expected
# Read non-existent file
@@ -326,17 +316,6 @@ class TestFileSystem:
assert content1 == 'First extracted content'
assert content2 == 'Second extracted content'
async def test_describe_empty_files(self, temp_filesystem):
"""Test describing filesystem with empty files."""
fs = temp_filesystem
description = fs.describe()
# Should contain results.md but not todo.md (excluded from description)
assert 'results.md' in description
assert 'todo.md' not in description
assert '[empty file]' in description
async def test_describe_with_content(self, temp_filesystem):
"""Test describing filesystem with files containing content."""
fs = temp_filesystem
@@ -392,15 +371,8 @@ class TestFileSystem:
assert isinstance(state, FileSystemState)
assert state.base_dir == str(fs.base_dir)
assert state.extracted_content_count == 0
assert 'results.md' in state.files
assert 'todo.md' in state.files
# Check file data structure
results_data = state.files['results.md']
assert results_data['type'] == 'MarkdownFile'
assert 'data' in results_data
assert results_data['data']['name'] == 'results'
async def test_from_state(self, temp_filesystem):
"""Test restoring filesystem from state."""
fs = temp_filesystem
@@ -503,7 +475,6 @@ class TestFileSystemEdgeCases:
# Custom file should be gone, default files should exist
assert not custom_file.exists()
assert (fs2.data_dir / 'results.md').exists()
assert (fs2.data_dir / 'todo.md').exists()
fs2.nuke()