From 385dedde77482e444fc03bcf7dd69dc1b9011247 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Mon, 30 Jun 2025 23:41:21 -0700
Subject: [PATCH 01/37] fix screenshot clip errors at end of scroll

---
 browser_use/browser/session.py | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py
index 783a0afe1..480c21e43 100644
--- a/browser_use/browser/session.py
+++ b/browser_use/browser/session.py
@@ -2718,6 +2718,7 @@ class BrowserSession(BaseModel):
 				return {
 					width: window.innerWidth,
 					height: window.innerHeight,
+					pageWidth: document.documentElement.scrollWidth,
 					pageHeight: document.documentElement.scrollHeight,
 					devicePixelRatio: window.devicePixelRatio || 1,
 					scrollX: window.pageXOffset || document.documentElement.scrollLeft || 0,
@@ -2725,14 +2726,38 @@ class BrowserSession(BaseModel):
 				};
 			}""")
 
+			# Calculate clip region that doesn't exceed page bounds
+			# The clip coordinates are in page coordinates, not viewport coordinates
+			clip_x = dimensions['scrollX']
+			clip_y = dimensions['scrollY']
+			clip_width = min(dimensions['width'], MAX_SCREENSHOT_WIDTH)
+			clip_height = min(dimensions['height'], MAX_SCREENSHOT_HEIGHT)
+
+			# Ensure clip region doesn't exceed page boundaries
+			max_x = max(0, dimensions['pageWidth'] - clip_x)
+			max_y = max(0, dimensions['pageHeight'] - clip_y)
+			clip_width = min(clip_width, max_x)
+			clip_height = min(clip_height, max_y)
+
+			# Don't try to screenshot if the clip area would be empty
+			if clip_width <= 0 or clip_height <= 0:
+				self.logger.warning(
+					f'Screenshot clip area would be empty: width={clip_width}, height={clip_height}, scrollY={clip_y}, pageHeight={dimensions["pageHeight"]}'
+				)
+				# Fall back to capturing from top of viewport if we're scrolled past the page
+				clip_x = 0
+				clip_y = max(0, dimensions['pageHeight'] - dimensions['height'])
+				clip_width = min(dimensions['width'], MAX_SCREENSHOT_WIDTH, dimensions['pageWidth'])
+				clip_height = min(dimensions['height'], MAX_SCREENSHOT_HEIGHT, dimensions['pageHeight'] - clip_y)
+
 			# Take screenshot using our retry-decorated method
 			return await self._take_screenshot_hybrid(
 				page,
 				clip={
-					'x': dimensions['scrollX'],
-					'y': dimensions['scrollY'],
-					'width': min(dimensions['width'], MAX_SCREENSHOT_WIDTH),
-					'height': min(dimensions['height'], MAX_SCREENSHOT_HEIGHT),
+					'x': clip_x,
+					'y': clip_y,
+					'width': clip_width,
+					'height': clip_height,
 				},
 			)
 		except Exception as e:

From 6170953b5e0c17aa264119f066bc2105f9711a4f Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Mon, 30 Jun 2025 23:47:20 -0700
Subject: [PATCH 02/37] disable screenshot clipping entirely for now

---
 browser_use/browser/session.py               | 40 +++---------
 tests/ci/test_browser_session_screenshots.py | 67 ++++++++++++++++++++
 2 files changed, 75 insertions(+), 32 deletions(-)

diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py
index 480c21e43..828222016 100644
--- a/browser_use/browser/session.py
+++ b/browser_use/browser/session.py
@@ -25,7 +25,7 @@ os.environ['PW_TEST_SCREENSHOT_NO_FONTS_READY'] = '1'  # https://github.com/micr
 
 import anyio
 import psutil
-from playwright._impl._api_structures import FloatRect, ViewportSize
+from playwright._impl._api_structures import ViewportSize
 from pydantic import AliasChoices, BaseModel, ConfigDict, Field, InstanceOf, PrivateAttr, model_validator
 from uuid_extensions import uuid7str
 
@@ -725,7 +725,7 @@ class BrowserSession(BaseModel):
 				full_page=False,
 				# scale='css',
 				timeout=self.browser_profile.default_timeout or 30000,
-				clip=FloatRect(**clip) if clip else None,
+				# clip=FloatRect(**clip) if clip else None,
 				animations='allow',
 				caret='initial',
 			)
@@ -2726,40 +2726,16 @@ class BrowserSession(BaseModel):
 				};
 			}""")
 
-			# Calculate clip region that doesn't exceed page bounds
-			# The clip coordinates are in page coordinates, not viewport coordinates
-			clip_x = dimensions['scrollX']
-			clip_y = dimensions['scrollY']
+			# When full_page=False, screenshot captures the current viewport
+			# The clip parameter uses viewport coordinates (0,0 is top-left of viewport)
+			# We just need to ensure the clip dimensions don't exceed our maximums
 			clip_width = min(dimensions['width'], MAX_SCREENSHOT_WIDTH)
 			clip_height = min(dimensions['height'], MAX_SCREENSHOT_HEIGHT)
 
-			# Ensure clip region doesn't exceed page boundaries
-			max_x = max(0, dimensions['pageWidth'] - clip_x)
-			max_y = max(0, dimensions['pageHeight'] - clip_y)
-			clip_width = min(clip_width, max_x)
-			clip_height = min(clip_height, max_y)
-
-			# Don't try to screenshot if the clip area would be empty
-			if clip_width <= 0 or clip_height <= 0:
-				self.logger.warning(
-					f'Screenshot clip area would be empty: width={clip_width}, height={clip_height}, scrollY={clip_y}, pageHeight={dimensions["pageHeight"]}'
-				)
-				# Fall back to capturing from top of viewport if we're scrolled past the page
-				clip_x = 0
-				clip_y = max(0, dimensions['pageHeight'] - dimensions['height'])
-				clip_width = min(dimensions['width'], MAX_SCREENSHOT_WIDTH, dimensions['pageWidth'])
-				clip_height = min(dimensions['height'], MAX_SCREENSHOT_HEIGHT, dimensions['pageHeight'] - clip_y)
-
 			# Take screenshot using our retry-decorated method
-			return await self._take_screenshot_hybrid(
-				page,
-				clip={
-					'x': clip_x,
-					'y': clip_y,
-					'width': clip_width,
-					'height': clip_height,
-				},
-			)
+			# Don't pass clip parameter - let Playwright capture the full viewport
+			# It will automatically handle cases where viewport extends beyond page content
+			return await self._take_screenshot_hybrid(page)
 		except Exception as e:
 			self.logger.error(f'❌ Failed to take screenshot after retries: {type(e).__name__}: {e}')
 			raise
diff --git a/tests/ci/test_browser_session_screenshots.py b/tests/ci/test_browser_session_screenshots.py
index b1c15c8c9..7c3428887 100644
--- a/tests/ci/test_browser_session_screenshots.py
+++ b/tests/ci/test_browser_session_screenshots.py
@@ -2,6 +2,7 @@
 Test that screenshots work correctly in headless browser mode.
 """
 
+import asyncio
 import base64
 
 from browser_use.browser import BrowserProfile, BrowserSession
@@ -244,3 +245,69 @@ class TestHeadlessScreenshots:
 			for i, result in enumerate(results):
 				if isinstance(result, Exception):
 					print(f'Warning: Session {i} kill raised exception: {type(result).__name__}: {result}')
+
+	async def test_screenshot_at_bottom_of_page(self, httpserver):
+		"""Test screenshot capture when scrolled to bottom of page (regression test for clipping issue)"""
+		browser_session = BrowserSession(
+			browser_profile=BrowserProfile(
+				headless=True,
+				user_data_dir=None,
+				keep_alive=False,
+			)
+		)
+
+		try:
+			await browser_session.start()
+
+			# Create a page with scrollable content
+			httpserver.expect_request('/scrollable').respond_with_data(
+				"""<html>
+				<head><title>Scrollable Page Test</title></head>
+				<body style="margin: 0; padding: 0;">
+					<div style="height: 3000px; background: linear-gradient(to bottom, red, yellow, green, blue);">
+						<div style="position: absolute; top: 0; left: 10px; font-size: 24px;">Top of page</div>
+						<div style="position: absolute; top: 50%; left: 10px; font-size: 24px;">Middle of page</div>
+						<div style="position: absolute; bottom: 10px; left: 10px; font-size: 24px;">Bottom of page</div>
+					</div>
+				</body>
+				</html>""",
+				content_type='text/html',
+			)
+
+			# Navigate to test page
+			await browser_session.navigate(httpserver.url_for('/scrollable'))
+			page = browser_session.agent_current_page
+			assert page is not None
+
+			# Test 1: Screenshot at top of page (should work)
+			screenshot_top = await browser_session.take_screenshot()
+			assert screenshot_top is not None
+			assert len(base64.b64decode(screenshot_top)) > 5000
+
+			# Test 2: Screenshot at middle of page
+			await page.evaluate('window.scrollTo(0, document.body.scrollHeight / 2)')
+			await asyncio.sleep(0.1)  # Wait for scroll
+			screenshot_middle = await browser_session.take_screenshot()
+			assert screenshot_middle is not None
+			assert len(base64.b64decode(screenshot_middle)) > 5000
+
+			# Test 3: Screenshot at bottom of page (this was failing with clipping error)
+			await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
+			await asyncio.sleep(0.1)  # Wait for scroll
+
+			# This should not raise "Clipped area is either empty or outside the resulting image" error
+			screenshot_bottom = await browser_session.take_screenshot()
+			assert screenshot_bottom is not None
+			assert len(base64.b64decode(screenshot_bottom)) > 5000
+
+			# Test 4: Screenshot when scrolled beyond page bottom (edge case)
+			await page.evaluate('window.scrollTo(0, document.body.scrollHeight + 1000)')
+			await asyncio.sleep(0.1)
+			screenshot_beyond = await browser_session.take_screenshot()
+			assert screenshot_beyond is not None
+			assert len(base64.b64decode(screenshot_beyond)) > 5000
+
+			print('✅ All screenshot positions tested successfully!')
+
+		finally:
+			await browser_session.stop()

From d9943ef33646ede9e3ff5cf9ecef6cb87fc13eeb Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Tue, 1 Jul 2025 00:41:32 -0700
Subject: [PATCH 03/37] tweak emoji

---
 browser_use/browser/session.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py
index 828222016..a0a12c330 100644
--- a/browser_use/browser/session.py
+++ b/browser_use/browser/session.py
@@ -248,12 +248,12 @@ class BrowserSession(BaseModel):
 		return self._logger
 
 	def __repr__(self) -> str:
-		is_copy = '©' if self._original_browser_session else '1️⃣ '
-		return f'BrowserSession🆂 {self.id[-4:]}{is_copy}{str(id(self))[-2:]} ({self._connection_str}, profile={self.browser_profile})'
+		is_copy = '©' if self._original_browser_session else '#'
+		return f'BrowserSession🆂 {self.id[-4:]} {is_copy}{str(id(self))[-2:]} ({self._connection_str}, profile={self.browser_profile})'
 
 	def __str__(self) -> str:
-		is_copy = '©' if self._original_browser_session else '1️⃣ '
-		return f'BrowserSession🆂 {self.id[-4:]}{is_copy}{str(id(self))[-2:]} 🅟 {str(id(self.agent_current_page))[-2:]}'
+		is_copy = '©' if self._original_browser_session else '#'
+		return f'BrowserSession🆂 {self.id[-4:]} {is_copy}{str(id(self))[-2:]} 🅟 {str(id(self.agent_current_page))[-2:]}'
 
 	# better to force people to get it from the right object, "only one way to do it" is better python
 	# def __getattr__(self, key: str) -> Any:

From 81765de5ec91e23d00f529792a3ed25ae4eb9085 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Tue, 1 Jul 2025 10:09:04 +0200
Subject: [PATCH 04/37] Update evaluation workflow to specify Ubuntu runner
 with 8 cores

---
 .github/workflows/eval.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml
index f799e6b54..76af6c32e 100644
--- a/.github/workflows/eval.yaml
+++ b/.github/workflows/eval.yaml
@@ -7,7 +7,12 @@ on:
 
 jobs:
   run_evaluation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-latest-8-cores  
+    # Other options:
+    # ubuntu-latest          
+    # ubuntu-latest-4-cores  
+    # ubuntu-latest-8-cores  
+    # ubuntu-latest-16-cores 
     timeout-minutes: 360
     env:
       IN_DOCKER: 'true'

From 35120fc2531062fd3783bb2122dcc9e3221306fc Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Tue, 1 Jul 2025 10:29:11 +0200
Subject: [PATCH 05/37] don't use fs for short tasks, don't initialize results.md

---
 browser_use/agent/system_prompt.md            | 11 ++----
 .../agent/system_prompt_no_thinking.md        |  9 ++---
 browser_use/filesystem/file_system.py         |  2 +-
 tests/ci/test_filesystem.py                   | 39 +++----------------
 4 files changed, 13 insertions(+), 48 deletions(-)

diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
index 6637facfc..3d0fe3fd9 100644
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -90,15 +90,12 @@ Strictly follow these rules while using the browser and navigating the web:
 
 <file_system>
 - You have access to a persistent file system which you can use to track progress, store results, and manage long tasks.
-- Your file system is initialized with two files:
-  1. `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). The contents of this file will be also visible in your state. ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
-  2. `results.md`: Use this to accumulate extracted or generated results for the user. Append each new finding clearly and avoid duplication. This file serves as your output log.
-- You can read, write, and append to files.
+- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
 - Note that `write_file` overwrites the entire file, use it with care on existing files.
 - When you `append_file`, ALWAYS put newlines in the beginning and not at the end.
-- If the file is too large, you are only given a preview of your file. Use read_file to see the full content if necessary.
-- Always use the file system as the source of truth. Do not rely on memory alone for tracking task state.
-- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You DON'T HAVE write access to these files. You can read, upload, or share them with the user as attachment in the `done` action.
+- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary.
+- If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
+- If the task is really long, initialize a `results.md` file to accumulate your results.
 - DO NOT use the file system if the task is less than 5 steps!
 </file_system>
 
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md
index 36d2c8698..60a470a61 100644
--- a/browser_use/agent/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompt_no_thinking.md
@@ -90,15 +90,12 @@ Strictly follow these rules while using the browser and navigating the web:
 
 <file_system>
 - You have access to a persistent file system which you can use to track progress, store results, and manage long tasks.
-- Your file system is initialized with two files:
-  1. `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). The contents of this file will be also visible in your state. ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
-  2. `results.md`: Use this to accumulate extracted or generated results for the user. Append each new finding clearly and avoid duplication. This file serves as your output log.
-- You can read, write, and append to files.
+- Your file system is initialized with a `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context.
 - Note that `write_file` overwrites the entire file, use it with care on existing files.
 - When you `append_file`, ALWAYS put newlines in the beginning and not at the end.
-- If the file is too large, you are only given a preview of your file. Use read_file to see the full content if necessary.
-- Always use the file system as the source of truth. Do not rely on memory alone for tracking task state.
+- If the file is too large, you are only given a preview of your file. Use `read_file` to see the full content if necessary.
 - If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
+- If the task is really long, initialize a `results.md` file to accumulate your results.
 - DO NOT use the file system if the task is less than 5 steps!
 </file_system>
 
diff --git a/browser_use/filesystem/file_system.py b/browser_use/filesystem/file_system.py
index c4a97e18a..eb823e9fd 100644
--- a/browser_use/filesystem/file_system.py
+++ b/browser_use/filesystem/file_system.py
@@ -124,7 +124,7 @@ class FileSystem:
 
 		self.files = {}
 		if create_default_files:
-			self.default_files = ['results.md', 'todo.md']
+			self.default_files = ['todo.md']
 			self._create_default_files()
 
 		self.extracted_content_count = 0
diff --git a/tests/ci/test_filesystem.py b/tests/ci/test_filesystem.py
index 5f9a9b017..b005c432d 100644
--- a/tests/ci/test_filesystem.py
+++ b/tests/ci/test_filesystem.py
@@ -138,14 +138,11 @@ class TestFileSystem:
 		assert fs.data_dir.name == DEFAULT_FILE_SYSTEM_PATH
 
 		# Check default files are created
-		assert 'results.md' in fs.files
 		assert 'todo.md' in fs.files
-		assert len(fs.files) == 2
+		assert len(fs.files) == 1
 
 		# Check files exist on disk
-		results_path = fs.data_dir / 'results.md'
 		todo_path = fs.data_dir / 'todo.md'
-		assert results_path.exists()
 		assert todo_path.exists()
 
 	def test_filesystem_without_default_files(self, empty_filesystem):
@@ -199,12 +196,6 @@ class TestFileSystem:
 		"""Test getting files from the filesystem."""
 		fs = temp_filesystem
 
-		# Get existing file
-		results_file = fs.get_file('results.md')
-		assert results_file is not None
-		assert isinstance(results_file, MarkdownFile)
-		assert results_file.name == 'results'
-
 		# Get non-existent file
 		non_existent = fs.get_file('nonexistent.md')
 		assert non_existent is None
@@ -218,16 +209,15 @@ class TestFileSystem:
 		fs = temp_filesystem
 		files = fs.list_files()
 
-		assert 'results.md' in files
 		assert 'todo.md' in files
-		assert len(files) == 2
+		assert len(files) == 1
 
 	def test_display_file(self, temp_filesystem):
 		"""Test displaying file content."""
 		fs = temp_filesystem
 
 		# Display existing file
-		content = fs.display_file('results.md')
+		content = fs.display_file('todo.md')
 		assert content == ''  # Default files are empty
 
 		# Display non-existent file
@@ -243,8 +233,8 @@ class TestFileSystem:
 		fs = temp_filesystem
 
 		# Read existing empty file
-		result = fs.read_file('results.md')
-		expected = 'Read from file results.md.\n<content>\n\n</content>'
+		result = fs.read_file('todo.md')
+		expected = 'Read from file todo.md.\n<content>\n\n</content>'
 		assert result == expected
 
 		# Read non-existent file
@@ -326,17 +316,6 @@ class TestFileSystem:
 		assert content1 == 'First extracted content'
 		assert content2 == 'Second extracted content'
 
-	async def test_describe_empty_files(self, temp_filesystem):
-		"""Test describing filesystem with empty files."""
-		fs = temp_filesystem
-
-		description = fs.describe()
-
-		# Should contain results.md but not todo.md (excluded from description)
-		assert 'results.md' in description
-		assert 'todo.md' not in description
-		assert '[empty file]' in description
-
 	async def test_describe_with_content(self, temp_filesystem):
 		"""Test describing filesystem with files containing content."""
 		fs = temp_filesystem
@@ -392,15 +371,8 @@ class TestFileSystem:
 		assert isinstance(state, FileSystemState)
 		assert state.base_dir == str(fs.base_dir)
 		assert state.extracted_content_count == 0
-		assert 'results.md' in state.files
 		assert 'todo.md' in state.files
 
-		# Check file data structure
-		results_data = state.files['results.md']
-		assert results_data['type'] == 'MarkdownFile'
-		assert 'data' in results_data
-		assert results_data['data']['name'] == 'results'
-
 	async def test_from_state(self, temp_filesystem):
 		"""Test restoring filesystem from state."""
 		fs = temp_filesystem
@@ -503,7 +475,6 @@ class TestFileSystemEdgeCases:
 
 			# Custom file should be gone, default files should exist
 			assert not custom_file.exists()
-			assert (fs2.data_dir / 'results.md').exists()
 			assert (fs2.data_dir / 'todo.md').exists()
 
 			fs2.nuke()

From bd4066354a5e9dc9cc2e19dd22b5d3c245f98933 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Tue, 1 Jul 2025 10:31:33 +0200
Subject: [PATCH 06/37] eval-runner-16-core

---
 .github/workflows/eval.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml
index 76af6c32e..dde86edb4 100644
--- a/.github/workflows/eval.yaml
+++ b/.github/workflows/eval.yaml
@@ -7,7 +7,7 @@ on:
 
 jobs:
   run_evaluation:
-    runs-on: ubuntu-latest-8-cores  
+    runs-on: ubuntu-latest-16-cores 
     # Other options:
     # ubuntu-latest          
     # ubuntu-latest-4-cores  

From ef30980fe97267b80f4cd8fbe5533a1d0a974b1b Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Tue, 1 Jul 2025 10:34:23 +0200
Subject: [PATCH 07/37] add extract_links to structured data

---
 browser_use/controller/service.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py
index 782e8ae62..919b61b60 100644
--- a/browser_use/controller/service.py
+++ b/browser_use/controller/service.py
@@ -331,10 +331,12 @@ class Controller(Generic[Context]):
 		@self.registry.action(
 			"""Extract structured, semantic data (e.g. product description, price, all information about XYZ) from the current webpage based on a textual query.
 Only use this for extracting info from a single product/article page, not for entire listings or search results pages.
+Set extract_links=True ONLY if your query requires extracting links/URLs from the page.
 """,
 		)
 		async def extract_structured_data(
 			query: str,
+			extract_links: bool,
 			page: Page,
 			page_extraction_llm: BaseChatModel,
 			file_system: FileSystem,
@@ -344,13 +346,8 @@ Only use this for extracting info from a single product/article page, not for en
 			import markdownify
 
 			strip = []
-			include_links = False
-			lower_query = query.lower()
-			url_keywords = ['url', 'links']
-			if any(keyword in lower_query for keyword in url_keywords):
-				include_links = True
 
-			if not include_links:
+			if not extract_links:
 				strip = ['a', 'img']
 
 			# Run markdownify in a thread pool to avoid blocking the event loop

From bd45b185085bfb183668122683210c57a4d3d508 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Tue, 1 Jul 2025 10:36:38 +0200
Subject: [PATCH 08/37] Update evaluation workflow to maintain Ubuntu runner
 with 16 cores

---
 .github/workflows/eval.yaml | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml
index dde86edb4..5eaba9c30 100644
--- a/.github/workflows/eval.yaml
+++ b/.github/workflows/eval.yaml
@@ -7,12 +7,7 @@ on:
 
 jobs:
   run_evaluation:
-    runs-on: ubuntu-latest-16-cores 
-    # Other options:
-    # ubuntu-latest          
-    # ubuntu-latest-4-cores  
-    # ubuntu-latest-8-cores  
-    # ubuntu-latest-16-cores 
+    runs-on: ubuntu-latest-16-cores
     timeout-minutes: 360
     env:
       IN_DOCKER: 'true'

From 6a5e86204b490330b8d4ac39c3f5e9664d3477b3 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Tue, 1 Jul 2025 01:42:25 -0700
Subject: [PATCH 09/37] speed up screenshot tweaks

---
 browser_use/browser/session.py               | 26 ++++++++++----------
 tests/ci/test_browser_session_screenshots.py |  6 ++---
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py
index a0a12c330..fa4cfceab 100644
--- a/browser_use/browser/session.py
+++ b/browser_use/browser/session.py
@@ -2714,23 +2714,23 @@ class BrowserSession(BaseModel):
 			# This prevents timeouts on very long pages
 
 			# 1. Get current viewport and page dimensions including scroll position
-			dimensions = await page.evaluate("""() => {
-				return {
-					width: window.innerWidth,
-					height: window.innerHeight,
-					pageWidth: document.documentElement.scrollWidth,
-					pageHeight: document.documentElement.scrollHeight,
-					devicePixelRatio: window.devicePixelRatio || 1,
-					scrollX: window.pageXOffset || document.documentElement.scrollLeft || 0,
-					scrollY: window.pageYOffset || document.documentElement.scrollTop || 0
-				};
-			}""")
+			# dimensions = await page.evaluate("""() => {
+			# 	return {
+			# 		width: window.innerWidth,
+			# 		height: window.innerHeight,
+			# 		pageWidth: document.documentElement.scrollWidth,
+			# 		pageHeight: document.documentElement.scrollHeight,
+			# 		devicePixelRatio: window.devicePixelRatio || 1,
+			# 		scrollX: window.pageXOffset || document.documentElement.scrollLeft || 0,
+			# 		scrollY: window.pageYOffset || document.documentElement.scrollTop || 0
+			# 	};
+			# }""")
 
 			# When full_page=False, screenshot captures the current viewport
 			# The clip parameter uses viewport coordinates (0,0 is top-left of viewport)
 			# We just need to ensure the clip dimensions don't exceed our maximums
-			clip_width = min(dimensions['width'], MAX_SCREENSHOT_WIDTH)
-			clip_height = min(dimensions['height'], MAX_SCREENSHOT_HEIGHT)
+			# clip_width = min(dimensions['width'], MAX_SCREENSHOT_WIDTH)
+			# clip_height = min(dimensions['height'], MAX_SCREENSHOT_HEIGHT)
 
 			# Take screenshot using our retry-decorated method
 			# Don't pass clip parameter - let Playwright capture the full viewport
diff --git a/tests/ci/test_browser_session_screenshots.py b/tests/ci/test_browser_session_screenshots.py
index 7c3428887..e07fdda8f 100644
--- a/tests/ci/test_browser_session_screenshots.py
+++ b/tests/ci/test_browser_session_screenshots.py
@@ -194,7 +194,7 @@ class TestHeadlessScreenshots:
 
 			# Take screenshots from all sessions at the same time
 			print('Taking screenshots from all 10 sessions simultaneously...')
-			screenshot_tasks = [session.take_screenshot(full_page=True) for session in browser_sessions]
+			screenshot_tasks = [session.take_screenshot() for session in browser_sessions]
 			screenshots = await asyncio.gather(*screenshot_tasks)
 
 			# Verify all screenshots are valid
@@ -222,9 +222,7 @@ class TestHeadlessScreenshots:
 
 			# Also test taking regular (viewport) screenshots in parallel
 			print('Taking viewport screenshots from all sessions simultaneously...')
-			viewport_screenshots = await asyncio.gather(
-				*[session.take_screenshot(full_page=False) for session in browser_sessions]
-			)
+			viewport_screenshots = await asyncio.gather(*[session.take_screenshot() for session in browser_sessions])
 
 			# Verify viewport screenshots
 			for i, screenshot in enumerate(viewport_screenshots):

From 4e2b5b2f5c18ab9f6f3d1d57980eedc2ac7fbb30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Tue, 1 Jul 2025 10:45:42 +0200
Subject: [PATCH 10/37] Fix typo in evaluation workflow runner name from
 '16-cores' to '16-core'

---
 .github/workflows/eval.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml
index 5eaba9c30..df624b6cf 100644
--- a/.github/workflows/eval.yaml
+++ b/.github/workflows/eval.yaml
@@ -7,7 +7,7 @@ on:
 
 jobs:
   run_evaluation:
-    runs-on: ubuntu-latest-16-cores
+    runs-on: ubuntu-latest-16-core
     timeout-minutes: 360
     env:
       IN_DOCKER: 'true'

From 31b503fb4296594705510cf32db7fcbf392da0ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Tue, 1 Jul 2025 11:14:07 +0200
Subject: [PATCH 11/37] Name group in eval

---
 .github/workflows/eval.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml
index df624b6cf..0587dd6b9 100644
--- a/.github/workflows/eval.yaml
+++ b/.github/workflows/eval.yaml
@@ -7,7 +7,9 @@ on:
 
 jobs:
   run_evaluation:
-    runs-on: ubuntu-latest-16-core
+    runs-on: 
+      group: eval
+      labels: ubuntu-latest-16-core
     timeout-minutes: 360
     env:
       IN_DOCKER: 'true'

From 2c465a898a2cbc93abc94cb81e649933182efdfe Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Tue, 1 Jul 2025 12:04:27 +0200
Subject: [PATCH 12/37] - Introduce a max_history_items parameter to limit the
 memory of the model - Change the system messages to use a <sys> tag instead
 of <s> to avoid confusion with HTML - Get rid of MessageMetadata and
 SupportedMessageTypes and implement a cleaner MessageManagerState -
 Implement a HistoryItem class to cleanly reconstruct the agent history
 description

---
 browser_use/agent/message_manager/service.py  | 84 ++++++++++++-------
 browser_use/agent/message_manager/views.py    | 76 +++++++++++++----
 browser_use/agent/service.py                  |  5 +-
 browser_use/agent/system_prompt.md            |  2 +-
 .../agent/system_prompt_no_thinking.md        |  2 +-
 browser_use/agent/views.py                    |  1 +
 6 files changed, 121 insertions(+), 49 deletions(-)

diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py
index 32c541d97..504b842d7 100644
--- a/browser_use/agent/message_manager/service.py
+++ b/browser_use/agent/message_manager/service.py
@@ -4,8 +4,7 @@ import json
 import logging
 
 from browser_use.agent.message_manager.views import (
-	MessageMetadata,
-	SupportedMessageTypes,
+	HistoryItem,
 )
 from browser_use.agent.prompts import AgentMessagePrompt
 from browser_use.agent.views import (
@@ -106,6 +105,7 @@ class MessageManager:
 		include_attributes: list[str] | None = None,
 		message_context: str | None = None,
 		sensitive_data: dict[str, str | dict[str, str]] | None = None,
+		max_history_items: int | None = None,
 	):
 		self.task = task
 		self.state = state
@@ -114,6 +114,7 @@ class MessageManager:
 		self.sensitive_data_description = ''
 		self.available_file_paths = available_file_paths
 		self.use_thinking = use_thinking
+		self.max_history_items = max_history_items
 
 		# Store settings as direct attributes instead of in a settings object
 		self.include_attributes = include_attributes or []
@@ -124,16 +125,45 @@ class MessageManager:
 		if len(self.state.history.messages) == 0:
 			self._init_messages()
 
+	@property
+	def agent_history_description(self) -> str:
+		"""Build agent history description from list of items, respecting max_history_items limit"""
+		if self.max_history_items is None:
+			# Include all items
+			return '\n'.join(item.to_string() for item in self.state.agent_history_items)
+
+		total_items = len(self.state.agent_history_items)
+
+		# If we have fewer items than the limit, just return all items
+		if total_items <= self.max_history_items:
+			return '\n'.join(item.to_string() for item in self.state.agent_history_items)
+
+		# We have more items than the limit, so we need to omit some
+		omitted_count = total_items - self.max_history_items
+
+		# Show first item + omitted message + most recent (max_history_items - 1) items
+		# The omitted message doesn't count against the limit, only real history items do
+		recent_items_count = self.max_history_items - 1  # -1 for first item
+
+		items_to_include = [
+			self.state.agent_history_items[0].to_string(),  # Keep first item (initialization)
+			f'<sys>[... {omitted_count} previous steps omitted...]</sys>',
+		]
+		# Add most recent items
+		items_to_include.extend([item.to_string() for item in self.state.agent_history_items[-recent_items_count:]])
+
+		return '\n'.join(items_to_include)
+
 	def _init_messages(self) -> None:
 		"""Initialize the message history with system message, context, task, and other initial messages"""
-		self._add_message_with_type(self.system_prompt, message_type='init')
+		self._add_message_with_type(self.system_prompt)
 
 		placeholder_message = UserMessage(
 			content='<example_1>\nHere is an example output of thinking and tool call. You can use it as a reference but do not copy it exactly.',
 			cache=True,
 		)
 		# placeholder_message = HumanMessage(content='Example output:')
-		self._add_message_with_type(placeholder_message, message_type='init')
+		self._add_message_with_type(placeholder_message)
 
 		# Create base example content
 		example_content = {
@@ -173,18 +203,18 @@ After writing todo.md, I can also initialize a github.md file to accumulate the
 The file system actions do not change the browser state, so I can also click on the bytedance/UI-TARS-desktop (index [4]) to start collecting information."""
 
 		example_tool_call_1 = AssistantMessage(content=json.dumps(example_content), cache=True)
-		self._add_message_with_type(example_tool_call_1, message_type='init')
+		self._add_message_with_type(example_tool_call_1)
 		self._add_message_with_type(
 			UserMessage(
 				content='Data written to todo.md.\nData written to github.md.\nClicked element with index 4.\n</example_1>',
 				cache=True,
 			),
-			message_type='init',
 		)
 
 	def add_new_task(self, new_task: str) -> None:
 		self.task = new_task
-		self.state.agent_history_description += f'\n<s>User updated <user_request> to: {new_task}</s>\n'
+		task_update_item = HistoryItem(system_message=f'User updated <user_request> to: {new_task}')
+		self.state.agent_history_items.append(task_update_item)
 
 	def _update_agent_history_description(
 		self,
@@ -196,7 +226,7 @@ The file system actions do not change the browser state, so I can also click on
 
 		if result is None:
 			result = []
-		step_number = step_info.step_number if step_info else 'unknown'
+		step_number = step_info.step_number if step_info else None
 
 		self.state.read_state_description = ''
 
@@ -220,23 +250,23 @@ The file system actions do not change the browser state, so I can also click on
 
 		if action_results:
 			action_results = f'Action Results:\n{action_results}'
-		action_results = action_results.strip('\n')
+		action_results = action_results.strip('\n') if action_results else None
 
-		# Handle case where model_output is None (e.g., parsing failed)
+		# Build the history item
 		if model_output is None:
-			if isinstance(step_number, int) and step_number > 0:
-				self.state.agent_history_description += f"""<step_{step_number}>
-Agent failed to output in the right format.
-</step_{step_number}>
-"""
+			# Only add error history item if we have a valid step number
+			if step_number is not None and step_number > 0:
+				history_item = HistoryItem(step_number=step_number, error='Agent failed to output in the right format.')
+				self.state.agent_history_items.append(history_item)
 		else:
-			self.state.agent_history_description += f"""<step_{step_number}>
-Evaluation of Previous Step: {model_output.current_state.evaluation_previous_goal}
-Memory: {model_output.current_state.memory}
-Next Goal: {model_output.current_state.next_goal}
-{action_results}
-</step_{step_number}>
-"""
+			history_item = HistoryItem(
+				step_number=step_number,
+				evaluation_previous_goal=model_output.current_state.evaluation_previous_goal,
+				memory=model_output.current_state.memory,
+				next_goal=model_output.current_state.next_goal,
+				action_results=action_results,
+			)
+			self.state.agent_history_items.append(history_item)
 
 	def _get_sensitive_data_description(self, current_page_url) -> str:
 		sensitive_data = self.sensitive_data
@@ -284,7 +314,7 @@ Next Goal: {model_output.current_state.next_goal}
 		state_message = AgentMessagePrompt(
 			browser_state_summary=browser_state_summary,
 			file_system=self.file_system,
-			agent_history_description=self.state.agent_history_description,
+			agent_history_description=self.agent_history_description,
 			read_state_description=self.state.read_state_description,
 			task=self.task,
 			include_attributes=self.include_attributes,
@@ -346,16 +376,15 @@ Next Goal: {model_output.current_state.next_goal}
 
 		# Log message history for debugging
 		logger.debug(self._log_history_lines())
-		self.last_input_messages = [m.message for m in self.state.history.messages]
+		self.last_input_messages = list(self.state.history.messages)
 		return self.last_input_messages
 
 	def _add_message_with_type(
 		self,
 		message: BaseMessage,
 		position: int | None = None,
-		message_type: SupportedMessageTypes | None = None,
 	) -> None:
-		"""Add message with token count metadata
+		"""Add message to history
 		position: None for last, -1 for second last, etc.
 		"""
 
@@ -363,8 +392,7 @@ Next Goal: {model_output.current_state.next_goal}
 		if self.sensitive_data:
 			message = self._filter_sensitive_data(message)
 
-		metadata = MessageMetadata(message_type=message_type)
-		self.state.history.add_message(message, metadata, position)
+		self.state.history.add_message(message, position)
 
 	@time_execution_sync('--filter_sensitive_data')
 	def _filter_sensitive_data(self, message: BaseMessage) -> BaseMessage:
diff --git a/browser_use/agent/message_manager/views.py b/browser_use/agent/message_manager/views.py
index 9e605fa34..5e9d91e99 100644
--- a/browser_use/agent/message_manager/views.py
+++ b/browser_use/agent/message_manager/views.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING
 
 from pydantic import BaseModel, ConfigDict, Field
 
@@ -13,43 +13,81 @@ if TYPE_CHECKING:
 	pass
 
 
-SupportedMessageTypes = Literal['init', 'memory']
+class HistoryItem:
+	"""Represents a single agent history item with its data and string representation"""
 
+	def __init__(
+		self,
+		step_number: int | None = None,
+		evaluation_previous_goal: str | None = None,
+		memory: str | None = None,
+		next_goal: str | None = None,
+		action_results: str | None = None,
+		error: str | None = None,
+		system_message: str | None = None,
+	):
+		# Validate that error and system_message are not both provided
+		if error is not None and system_message is not None:
+			raise ValueError('Cannot have both error and system_message at the same time')
 
-class MessageMetadata(BaseModel):
-	"""Metadata for a message"""
+		self.step_number = step_number
+		self.evaluation_previous_goal = evaluation_previous_goal
+		self.memory = memory
+		self.next_goal = next_goal
+		self.action_results = action_results
+		self.error = error
+		self.system_message = system_message
 
-	message_type: SupportedMessageTypes | None = None
+	def to_string(self) -> str:
+		"""Get string representation of the history item"""
+		step_str = f'step_{self.step_number}' if self.step_number is not None else 'step_unknown'
 
+		if self.error:
+			return f"""<{step_str}>
+{self.error}
+</{step_str}>"""
+		elif self.system_message:
+			return f"""<sys>
+{self.system_message}
+</sys>"""
+		else:
+			content_parts = [
+				f'Evaluation of Previous Step: {self.evaluation_previous_goal}',
+				f'Memory: {self.memory}',
+				f'Next Goal: {self.next_goal}',
+			]
 
-class ManagedMessage(BaseModel):
-	"""A message with its metadata"""
+			if self.action_results:
+				content_parts.append(self.action_results)
 
-	message: BaseMessage
-	metadata: MessageMetadata = Field(default_factory=MessageMetadata)
+			content = '\n'.join(content_parts)
+
+			return f"""<{step_str}>
+{content}
+</{step_str}>"""
 
 
 class MessageHistory(BaseModel):
-	"""History of messages with metadata"""
+	"""History of messages"""
 
-	messages: list[ManagedMessage] = Field(default_factory=list)
+	messages: list[BaseMessage] = Field(default_factory=list)
 
 	model_config = ConfigDict(arbitrary_types_allowed=True)
 
-	def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
-		"""Add message with metadata to history"""
+	def add_message(self, message: BaseMessage, position: int | None = None) -> None:
+		"""Add message to history"""
 		if position is None:
-			self.messages.append(ManagedMessage(message=message, metadata=metadata))
+			self.messages.append(message)
 		else:
-			self.messages.insert(position, ManagedMessage(message=message, metadata=metadata))
+			self.messages.insert(position, message)
 
 	def get_messages(self) -> list[BaseMessage]:
 		"""Get all messages"""
-		return [m.message for m in self.messages]
+		return self.messages
 
 	def remove_last_state_message(self) -> None:
 		"""Remove last state message from history"""
-		if len(self.messages) > 2 and isinstance(self.messages[-1].message, UserMessage):
+		if len(self.messages) > 2 and isinstance(self.messages[-1], UserMessage):
 			self.messages.pop()
 
 
@@ -58,7 +96,9 @@ class MessageManagerState(BaseModel):
 
 	history: MessageHistory = Field(default_factory=MessageHistory)
 	tool_id: int = 1
-	agent_history_description: str = '<s>Agent initialized</s>\n'
+	agent_history_items: list[HistoryItem] = Field(
+		default_factory=lambda: [HistoryItem(step_number=0, system_message='Agent initialized')]
+	)
 	read_state_description: str = ''
 
 	model_config = ConfigDict(arbitrary_types_allowed=True)
diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py
index d5120923e..e78115fc4 100644
--- a/browser_use/agent/service.py
+++ b/browser_use/agent/service.py
@@ -169,6 +169,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		],
 		max_actions_per_step: int = 10,
 		use_thinking: bool = True,
+		max_history_items: int = 40,
 		page_extraction_llm: BaseChatModel | None = None,
 		planner_llm: BaseChatModel | None = None,
 		planner_interval: int = 1,  # Run planner every N steps
@@ -235,12 +236,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 			available_file_paths=available_file_paths,
 			include_attributes=include_attributes,
 			max_actions_per_step=max_actions_per_step,
+			use_thinking=use_thinking,
+			max_history_items=max_history_items,
 			page_extraction_llm=page_extraction_llm,
 			planner_llm=planner_llm,
 			planner_interval=planner_interval,
 			is_planner_reasoning=is_planner_reasoning,
 			extend_planner_system_message=extend_planner_system_message,
-			use_thinking=use_thinking,
 			calculate_cost=calculate_cost,
 		)
 
@@ -318,6 +320,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 			include_attributes=self.settings.include_attributes,
 			message_context=self.settings.message_context,
 			sensitive_data=sensitive_data,
+			max_history_items=self.settings.max_history_items,
 		)
 
 		if isinstance(browser, BrowserSession):
diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
index 6637facfc..5728328a5 100644
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -34,7 +34,7 @@ Next Goal: Your goal for this step
 Action Results: Your actions and their results
 </step_{{step_number}}>
 
-and system messages wrapped in <s> tag.
+and system messages wrapped in <sys> tag.
 </agent_history>
 
 <user_request>
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md
index 36d2c8698..bd16670fe 100644
--- a/browser_use/agent/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompt_no_thinking.md
@@ -34,7 +34,7 @@ Next Goal: Your goal for this step
 Action Results: Your actions and their results
 </step_{{step_number}}>
 
-and system messages wrapped in <s> tag.
+and system messages wrapped in <sys> tag.
 </agent_history>
 
 <user_request>
diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py
index c29482110..6b717d5c1 100644
--- a/browser_use/agent/views.py
+++ b/browser_use/agent/views.py
@@ -54,6 +54,7 @@ class AgentSettings(BaseModel):
 	]
 	max_actions_per_step: int = 10
 	use_thinking: bool = True
+	max_history_items: int = 40
 
 	page_extraction_llm: BaseChatModel | None = None
 	planner_llm: BaseChatModel | None = None

From 3b53e8b495786bca978b073f37b4d2b412151329 Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Tue, 1 Jul 2025 12:23:48 +0200
Subject: [PATCH 13/37] fix linter

---
 browser_use/cli.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/browser_use/cli.py b/browser_use/cli.py
index 975ea8f25..17d5199bc 100644
--- a/browser_use/cli.py
+++ b/browser_use/cli.py
@@ -855,8 +855,8 @@ class BrowserUseApp(App):
 				# Extract original task(s)
 				original_tasks = []
 				for msg in message_history:
-					if hasattr(msg, 'message') and hasattr(msg.message, 'content'):
-						content = msg.message.content
+					if hasattr(msg, 'content'):
+						content = msg.content
 						if isinstance(content, str) and 'Your ultimate task is:' in content:
 							task_text = content.split('"""')[1].strip()
 							original_tasks.append(task_text)

From 63c0f5d0868a9e52c6c1ba4fdaaafd54d0125604 Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Tue, 1 Jul 2025 12:32:54 +0200
Subject: [PATCH 14/37] convert to pydantic

---
 browser_use/agent/message_manager/views.py | 35 +++++++++-------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/browser_use/agent/message_manager/views.py b/browser_use/agent/message_manager/views.py
index 5e9d91e99..351a938f2 100644
--- a/browser_use/agent/message_manager/views.py
+++ b/browser_use/agent/message_manager/views.py
@@ -13,30 +13,23 @@ if TYPE_CHECKING:
 	pass
 
 
-class HistoryItem:
+class HistoryItem(BaseModel):
 	"""Represents a single agent history item with its data and string representation"""
 
-	def __init__(
-		self,
-		step_number: int | None = None,
-		evaluation_previous_goal: str | None = None,
-		memory: str | None = None,
-		next_goal: str | None = None,
-		action_results: str | None = None,
-		error: str | None = None,
-		system_message: str | None = None,
-	):
-		# Validate that error and system_message are not both provided
-		if error is not None and system_message is not None:
-			raise ValueError('Cannot have both error and system_message at the same time')
+	step_number: int | None = None
+	evaluation_previous_goal: str | None = None
+	memory: str | None = None
+	next_goal: str | None = None
+	action_results: str | None = None
+	error: str | None = None
+	system_message: str | None = None
 
-		self.step_number = step_number
-		self.evaluation_previous_goal = evaluation_previous_goal
-		self.memory = memory
-		self.next_goal = next_goal
-		self.action_results = action_results
-		self.error = error
-		self.system_message = system_message
+	model_config = ConfigDict(arbitrary_types_allowed=True)
+
+	def model_post_init(self, __context) -> None:
+		"""Validate that error and system_message are not both provided"""
+		if self.error is not None and self.system_message is not None:
+			raise ValueError('Cannot have both error and system_message at the same time')
 
 	def to_string(self) -> str:
 		"""Get string representation of the history item"""

From b17e8f3f88f75e571d0ad738f44d508a61b0af60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Tue, 1 Jul 2025 12:36:17 +0200
Subject: [PATCH 15/37] Max 3 min for judge

---
 eval/service.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/eval/service.py b/eval/service.py
index 5407ee76b..07eaf61db 100644
--- a/eval/service.py
+++ b/eval/service.py
@@ -1246,8 +1246,9 @@ async def judge_task_result(model, task_folder: Path, score_threshold: float = 3
 
 			try:
 				# Run comprehensive judge evaluation
-				comprehensive_result = await evaluate_task_with_comprehensive_judge(
-					task_folder=task_folder, model=model, max_images=10
+				comprehensive_result = await asyncio.wait_for(
+					evaluate_task_with_comprehensive_judge(task_folder=task_folder, model=model, max_images=10),
+					timeout=180,  # 3 minutes max for evaluation
 				)
 
 				if comprehensive_result.get('error'):

From 729fc798ca826c94158a09f3513d932789dfff56 Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Tue, 1 Jul 2025 12:48:58 +0200
Subject: [PATCH 16/37] add assertion for safety

---
 browser_use/agent/message_manager/service.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py
index 504b842d7..b48d1aff1 100644
--- a/browser_use/agent/message_manager/service.py
+++ b/browser_use/agent/message_manager/service.py
@@ -116,6 +116,8 @@ class MessageManager:
 		self.use_thinking = use_thinking
 		self.max_history_items = max_history_items
 
+		assert max_history_items is None or max_history_items > 5, 'max_history_items must be None or greater than 5'
+
 		# Store settings as direct attributes instead of in a settings object
 		self.include_attributes = include_attributes or []
 		self.message_context = message_context

From 9376d9d91e8425aade92d4105a39417e813cfede Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Tue, 1 Jul 2025 12:54:18 +0200
Subject: [PATCH 17/37] Update evaluation workflow to use new runner label
 'eval-4-core-500'

---
 .github/workflows/eval.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml
index 0587dd6b9..c52edc4cb 100644
--- a/.github/workflows/eval.yaml
+++ b/.github/workflows/eval.yaml
@@ -9,7 +9,7 @@ jobs:
   run_evaluation:
     runs-on: 
       group: eval
-      labels: ubuntu-latest-16-core
+      labels: eval-4-core-500
     timeout-minutes: 360
     env:
       IN_DOCKER: 'true'

From a9488feeadfa8a7bcdcbe06bab47a42add654397 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Tue, 1 Jul 2025 12:56:36 +0200
Subject: [PATCH 18/37] Change to 2 core runners

---
 .github/workflows/eval.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml
index c52edc4cb..147983367 100644
--- a/.github/workflows/eval.yaml
+++ b/.github/workflows/eval.yaml
@@ -9,7 +9,7 @@ jobs:
   run_evaluation:
     runs-on: 
       group: eval
-      labels: eval-4-core-500
+      labels: eval-2-core-500
     timeout-minutes: 360
     env:
       IN_DOCKER: 'true'

From 4e7cf3a964dda4158e049f8f08411914c01bdaae Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Tue, 1 Jul 2025 13:40:05 +0200
Subject: [PATCH 19/37] Update gemini-2.5 model names to drop the preview and
 -latest suffixes in eval/service.py

---
 eval/service.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/eval/service.py b/eval/service.py
index 07eaf61db..eaad899c7 100644
--- a/eval/service.py
+++ b/eval/service.py
@@ -716,8 +716,8 @@ SUPPORTED_MODELS = {
 	'gemini-1.5-flash': {'provider': 'google', 'model_name': 'gemini-1.5-flash-latest', 'api_key_env': 'GEMINI_API_KEY'},
 	'gemini-2.0-flash-lite': {'provider': 'google', 'model_name': 'gemini-2.0-flash-lite', 'api_key_env': 'GEMINI_API_KEY'},
 	'gemini-2.0-flash': {'provider': 'google', 'model_name': 'gemini-2.0-flash', 'api_key_env': 'GEMINI_API_KEY'},
-	'gemini-2.5-pro': {'provider': 'google', 'model_name': 'gemini-2.5-pro-preview-03-25', 'api_key_env': 'GEMINI_API_KEY'},
-	'gemini-2.5-flash': {'provider': 'google', 'model_name': 'gemini-2.5-flash-latest', 'api_key_env': 'GEMINI_API_KEY'},
+	'gemini-2.5-pro': {'provider': 'google', 'model_name': 'gemini-2.5-pro', 'api_key_env': 'GEMINI_API_KEY'},
+	'gemini-2.5-flash': {'provider': 'google', 'model_name': 'gemini-2.5-flash', 'api_key_env': 'GEMINI_API_KEY'},
 	'gemini-2.5-pro-preview-05-06': {
 		'provider': 'google',
 		'model_name': 'gemini-2.5-pro-preview-05-06',

From 1885dabc484517ec714b511e9656afdea9f9e436 Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Tue, 1 Jul 2025 23:18:54 +0200
Subject: [PATCH 20/37] default reasoning effort to low

---
 browser_use/llm/openai/chat.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/browser_use/llm/openai/chat.py b/browser_use/llm/openai/chat.py
index fb4f83083..7cc299ee1 100644
--- a/browser_use/llm/openai/chat.py
+++ b/browser_use/llm/openai/chat.py
@@ -33,6 +33,7 @@ class ChatOpenAI(BaseChatModel):
 
 	# Model params
 	temperature: float | None = None
+	reasoning_effort: str | None = 'low'
 
 	# Client initialization parameters
 	api_key: str | None = None
@@ -135,7 +136,10 @@ class ChatOpenAI(BaseChatModel):
 			if output_format is None:
 				# Return string response
 				response = await self.get_client().chat.completions.create(
-					model=self.model, messages=openai_messages, temperature=self.temperature
+					model=self.model,
+					messages=openai_messages,
+					temperature=self.temperature,
+					reasoning_effort=self.reasoning_effort,
 				)
 
 				usage = self._get_usage(response)
@@ -156,6 +160,7 @@ class ChatOpenAI(BaseChatModel):
 					model=self.model,
 					messages=openai_messages,
 					temperature=self.temperature,
+					reasoning_effort=self.reasoning_effort,
 					response_format=ResponseFormatJSONSchema(json_schema=response_format, type='json_schema'),
 				)
 

From 6bdb4d59201c468bf69d4c903e0a9800329857a0 Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Tue, 1 Jul 2025 23:20:50 +0200
Subject: [PATCH 21/37] Refactor reasoning_effort parameter to be non-optional
 and default to 'low'

---
 browser_use/llm/openai/chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/browser_use/llm/openai/chat.py b/browser_use/llm/openai/chat.py
index 7cc299ee1..8119b41c3 100644
--- a/browser_use/llm/openai/chat.py
+++ b/browser_use/llm/openai/chat.py
@@ -33,7 +33,7 @@ class ChatOpenAI(BaseChatModel):
 
 	# Model params
 	temperature: float | None = None
-	reasoning_effort: str | None = 'low'
+	reasoning_effort: str = 'low'
 
 	# Client initialization parameters
 	api_key: str | None = None

From b290e64b86f81cb83b2a185c1480cbe4fd0bf889 Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Tue, 1 Jul 2025 23:24:20 +0200
Subject: [PATCH 22/37] fix types

---
 browser_use/llm/openai/chat.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/browser_use/llm/openai/chat.py b/browser_use/llm/openai/chat.py
index 8119b41c3..1067c48f9 100644
--- a/browser_use/llm/openai/chat.py
+++ b/browser_use/llm/openai/chat.py
@@ -6,6 +6,7 @@ import httpx
 from openai import APIConnectionError, APIStatusError, AsyncOpenAI, RateLimitError
 from openai.types.chat.chat_completion import ChatCompletion
 from openai.types.shared.chat_model import ChatModel
+from openai.types.shared_params.reasoning_effort import ReasoningEffort
 from openai.types.shared_params.response_format_json_schema import JSONSchema, ResponseFormatJSONSchema
 from pydantic import BaseModel
 
@@ -33,7 +34,7 @@ class ChatOpenAI(BaseChatModel):
 
 	# Model params
 	temperature: float | None = None
-	reasoning_effort: str = 'low'
+	reasoning_effort: ReasoningEffort = 'low'
 
 	# Client initialization parameters
 	api_key: str | None = None

From 05ef6dcc82d48881ba19267054f0273add236ba7 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Tue, 1 Jul 2025 21:47:57 +0000
Subject: [PATCH 23/37] Replace asterisk markers with <new> tags for new
 elements

- Update DOM element highlighting from *[index]* to <new>[index]</new>
- Update system prompts to reflect new <new> tag format
- Provides clearer semantic meaning for new elements in browser state
---
 browser_use/agent/system_prompt.md             | 4 ++--
 browser_use/agent/system_prompt_no_thinking.md | 4 ++--
 browser_use/dom/views.py                       | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
index 33819f357..6e1d272d2 100644
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -56,12 +56,12 @@ Interactive Elements: All interactive elements will be provided in format as [in
 
 Examples:
 [33]<div>User form</div>
-\t*[35]*<button aria-label='Submit form'>Submit</button>
+\t<new>[35]</new><button aria-label='Submit form'>Submit</button>
 
 Note that:
 - Only elements with numeric indexes in [] are interactive
 - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements with \* are new elements that were added after the previous step (if url has not changed)
+- Elements with <new> tags are new elements that were added after the previous step (if url has not changed)
 - Pure text elements without [] are not interactive.
 </browser_state>
 
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md
index 46ce35a39..b27294d19 100644
--- a/browser_use/agent/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompt_no_thinking.md
@@ -56,12 +56,12 @@ Interactive Elements: All interactive elements will be provided in format as [in
 
 Examples:
 [33]<div>User form</div>
-\t*[35]*<button aria-label='Submit form'>Submit</button>
+\t<new>[35]</new><button aria-label='Submit form'>Submit</button>
 
 Note that:
 - Only elements with numeric indexes in [] are interactive
 - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements with \* are new elements that were added after the previous step (if url has not changed)
+- Elements with <new> tags are new elements that were added after the previous step (if url has not changed)
 - Pure text elements without [] are not interactive.
 </browser_state>
 
diff --git a/browser_use/dom/views.py b/browser_use/dom/views.py
index 0964f37cc..c9b179e60 100644
--- a/browser_use/dom/views.py
+++ b/browser_use/dom/views.py
@@ -195,7 +195,7 @@ class DOMElementNode(DOMBaseNode):
 
 					# Build the line
 					if node.is_new:
-						highlight_indicator = f'*[{node.highlight_index}]*'
+						highlight_indicator = f'<new>[{node.highlight_index}]</new>'
 					else:
 						highlight_indicator = f'[{node.highlight_index}]'
 

From c24d5b4320ace6e51510b42378783a3ffd020108 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Tue, 1 Jul 2025 21:51:05 +0000
Subject: [PATCH 24/37] Remove optional xpath parameter from InputTextAction
 model

Co-authored-by: mamagnus00 <mamagnus00@gmail.com>
---
 browser_use/controller/views.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/browser_use/controller/views.py b/browser_use/controller/views.py
index fff18926a..83abadd4e 100644
--- a/browser_use/controller/views.py
+++ b/browser_use/controller/views.py
@@ -21,7 +21,6 @@ class ClickElementAction(BaseModel):
 class InputTextAction(BaseModel):
 	index: int
 	text: str
-	xpath: str | None = None
 
 
 class DoneAction(BaseModel):

From ee2c1d2ad0d2ccb03245d10ec1e1f0a7856e30e4 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 2 Jul 2025 07:43:21 +0000
Subject: [PATCH 25/37] Remove optional xpath from ClickElementAction model

Co-authored-by: mailmertunsal <mailmertunsal@gmail.com>
---
 browser_use/controller/views.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/browser_use/controller/views.py b/browser_use/controller/views.py
index 83abadd4e..50c4796ad 100644
--- a/browser_use/controller/views.py
+++ b/browser_use/controller/views.py
@@ -15,7 +15,6 @@ class GoToUrlAction(BaseModel):
 
 class ClickElementAction(BaseModel):
 	index: int
-	xpath: str | None = None
 
 
 class InputTextAction(BaseModel):

From fbd3a11737d61e3e2e8b0f0931de5f3fdf37b2f4 Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Wed, 2 Jul 2025 10:01:56 +0200
Subject: [PATCH 26/37] fix: only pass reasoning_effort for reasoning models

---
 browser_use/llm/openai/chat.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/browser_use/llm/openai/chat.py b/browser_use/llm/openai/chat.py
index 1067c48f9..e6747f70c 100644
--- a/browser_use/llm/openai/chat.py
+++ b/browser_use/llm/openai/chat.py
@@ -19,6 +19,8 @@ from browser_use.llm.views import ChatInvokeCompletion, ChatInvokeUsage
 
 T = TypeVar('T', bound=BaseModel)
 
+ReasoningModels: list[ChatModel | str] = ['o4-mini', 'o3', 'o3-mini', 'o1', 'o1-pro', 'o3-pro']
+
 
 @dataclass
 class ChatOpenAI(BaseChatModel):
@@ -134,13 +136,14 @@ class ChatOpenAI(BaseChatModel):
 		openai_messages = OpenAIMessageSerializer.serialize_messages(messages)
 
 		try:
+			reasoning_effort = self.reasoning_effort if self.model in ReasoningModels else None
 			if output_format is None:
 				# Return string response
 				response = await self.get_client().chat.completions.create(
 					model=self.model,
 					messages=openai_messages,
 					temperature=self.temperature,
-					reasoning_effort=self.reasoning_effort,
+					reasoning_effort=reasoning_effort,
 				)
 
 				usage = self._get_usage(response)
@@ -161,7 +164,7 @@ class ChatOpenAI(BaseChatModel):
 					model=self.model,
 					messages=openai_messages,
 					temperature=self.temperature,
-					reasoning_effort=self.reasoning_effort,
+					reasoning_effort=reasoning_effort,
 					response_format=ResponseFormatJSONSchema(json_schema=response_format, type='json_schema'),
 				)
 

From b37f5294c58ccbe9cb308370bedebd2318c49d18 Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Wed, 2 Jul 2025 10:08:11 +0200
Subject: [PATCH 27/37] fix: omit reasoning_effort kwarg for non-reasoning
 models instead of passing None

---
 browser_use/llm/openai/chat.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/browser_use/llm/openai/chat.py b/browser_use/llm/openai/chat.py
index e6747f70c..2b49ea20f 100644
--- a/browser_use/llm/openai/chat.py
+++ b/browser_use/llm/openai/chat.py
@@ -136,14 +136,19 @@ class ChatOpenAI(BaseChatModel):
 		openai_messages = OpenAIMessageSerializer.serialize_messages(messages)
 
 		try:
-			reasoning_effort = self.reasoning_effort if self.model in ReasoningModels else None
+			reasoning_effort_dict: dict = {}
+			if self.model in ReasoningModels:
+				reasoning_effort_dict = {
+					'reasoning_effort': self.reasoning_effort,
+				}
+
 			if output_format is None:
 				# Return string response
 				response = await self.get_client().chat.completions.create(
 					model=self.model,
 					messages=openai_messages,
 					temperature=self.temperature,
-					reasoning_effort=reasoning_effort,
+					**reasoning_effort_dict,
 				)
 
 				usage = self._get_usage(response)
@@ -164,8 +169,8 @@ class ChatOpenAI(BaseChatModel):
 					model=self.model,
 					messages=openai_messages,
 					temperature=self.temperature,
-					reasoning_effort=reasoning_effort,
 					response_format=ResponseFormatJSONSchema(json_schema=response_format, type='json_schema'),
+					**reasoning_effort_dict,
 				)
 
 				if response.choices[0].message.content is None:

From 4a0eab6fb8474396700ef71f0fd5e961b15b327a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Wed, 2 Jul 2025 10:36:41 +0200
Subject: [PATCH 28/37] test git automation functions

---
 eval/service.py  |   1 +
 git-functions.sh | 116 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)
 create mode 100755 git-functions.sh

diff --git a/eval/service.py b/eval/service.py
index eaad899c7..d13e9baa0 100644
--- a/eval/service.py
+++ b/eval/service.py
@@ -731,6 +731,7 @@ SUPPORTED_MODELS = {
 	# OpenAI
 	'gpt-4.1': {'provider': 'openai', 'model_name': 'gpt-4.1-2025-04-14', 'api_key_env': 'OPENAI_API_KEY'},
 	'gpt-4.1-mini': {'provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'api_key_env': 'OPENAI_API_KEY'},
+	'gpt-o3': {'provider': 'openai', 'model_name': 'o3-2025-04-16', 'api_key_env': 'OPENAI_API_KEY'},
 	'gpt-4.1-nano': {'provider': 'openai', 'model_name': 'gpt-4.1-nano-2025-04-14', 'api_key_env': 'OPENAI_API_KEY'},
 	'gpt-4o': {'provider': 'openai', 'model_name': 'gpt-4o', 'api_key_env': 'OPENAI_API_KEY'},
 	'gpt-4o-mini': {'provider': 'openai', 'model_name': 'gpt-4o-mini', 'api_key_env': 'OPENAI_API_KEY'},
diff --git a/git-functions.sh b/git-functions.sh
new file mode 100755
index 000000000..d0f2ee377
--- /dev/null
+++ b/git-functions.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# bcpp: Branch + Commit + Push + PR + Open
+bcpp() {
+    if [ $# -eq 0 ]; then
+        echo "❌ Usage: bcpp <commit message>"
+        echo "📝 Example: bcpp hello world"
+        return 1
+    fi
+    
+    # Check if gh CLI is installed
+    if ! command -v gh &> /dev/null; then
+        echo "❌ GitHub CLI (gh) is required. Install: https://cli.github.com/"
+        return 1
+    fi
+    
+    local commit_msg="$*"
+    local branch_name=$(echo "$commit_msg" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g' | sed 's/[^a-z0-9-]//g')
+    
+    # Check if branch already exists
+    if git show-ref --verify --quiet refs/heads/"$branch_name"; then
+        echo "❌ Branch '$branch_name' already exists!"
+        echo "💡 Try a different commit message or delete the existing branch"
+        return 1
+    fi
+    
+    echo "🌿 Creating branch: $branch_name"
+    git checkout -b "$branch_name" || return 1
+    
+    echo "📝 Adding all changes..."
+    git add . || return 1
+    
+    # Check if there are changes to commit
+    if git diff --staged --quiet; then
+        echo "ℹ️  No changes to commit"
+        git checkout - && git branch -d "$branch_name"
+        return 1
+    fi
+    
+    echo "💾 Committing: $commit_msg"
+    git commit -m "$commit_msg" || return 1
+    
+    echo "🚀 Pushing branch..."
+    git push -u origin "$branch_name" || return 1
+    
+    echo "�� Creating PR..."
+    gh pr create --title "$commit_msg" --body "Auto-generated PR for: $commit_msg" --head "$branch_name" || return 1
+    
+    echo "🌐 Opening PR in browser..."
+    gh pr view --web
+    
+    echo "✅ Done! Branch: $branch_name"
+}
+
+# cpp: Commit + Push + PR (no new branch)
+cpp() {
+    if [ $# -eq 0 ]; then
+        echo "❌ Usage: cpp <commit message>"
+        echo "📝 Example: cpp hello world"
+        return 1
+    fi
+    
+    local commit_msg="$*"
+    local current_branch=$(git branch --show-current)
+    
+    echo "📝 Adding all changes..."
+    git add . || return 1
+    
+    # Check if there are changes to commit
+    if git diff --staged --quiet; then
+        echo "ℹ️  No changes to commit"
+        return 1
+    fi
+    
+    echo "💾 Committing to $current_branch: $commit_msg"
+    git commit -m "$commit_msg" || return 1
+    
+    echo "🚀 Pushing to $current_branch..."
+    git push || return 1
+    
+    # Only create PR if not on main/master and gh CLI is available
+    if [[ "$current_branch" != "main" && "$current_branch" != "master" ]] && command -v gh &> /dev/null; then
+        echo "🔄 Creating/updating PR from $current_branch..."
+        
+        # Check if PR already exists
+        if gh pr view --json number &> /dev/null; then
+            echo "ℹ️  PR already exists, updated with new commit"
+            gh pr view --web
+        else
+            gh pr create --title "$commit_msg" --body "Auto-generated PR for: $commit_msg" --head "$current_branch"
+            echo "🌐 Opening PR in browser..."
+            gh pr view --web
+        fi
+    else
+        echo "ℹ️  On main branch or gh CLI not available, skipping PR creation"
+    fi
+    
+    echo "✅ Done!"
+}
+
+# Quick help function
+git-help() {
+    echo "🚀 Git Automation Functions:"
+    echo ""
+    echo "  bcpp <message>  - Branch + Commit + Push + PR"
+    echo "                    Creates new branch, commits, pushes, creates PR"
+    echo "                    Example: bcpp fix login bug"
+    echo ""
+    echo "  cpp <message>   - Commit + Push + PR (current branch)"
+    echo "                    Commits to current branch, pushes, creates/updates PR"
+    echo "                    Example: cpp update documentation"
+    echo ""
+    echo "💡 Both functions automatically open the PR in your browser"
+}
+
+echo "🎉 Git automation functions loaded! Type 'git-help' for usage info."

From 8880a2db37dde6e5694488a454f8e8b50851c742 Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Wed, 2 Jul 2025 10:40:58 +0200
Subject: [PATCH 29/37] add sensitive data example

---
 examples/features/sensitive_data.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/examples/features/sensitive_data.py b/examples/features/sensitive_data.py
index 33206cc7b..76184a457 100644
--- a/examples/features/sensitive_data.py
+++ b/examples/features/sensitive_data.py
@@ -12,9 +12,16 @@ from browser_use import Agent
 from browser_use.browser import BrowserProfile
 from browser_use.llm import ChatOpenAI
 
+try:
+	from lmnr import Laminar
+
+	Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY'))
+except Exception as e:
+	print(f'Error initializing Laminar: {e}')
+
 # Initialize the model
 llm = ChatOpenAI(
-	model='gpt-4o',
+	model='gpt-4.1',
 	temperature=0.0,
 )
 # Simple case: the model will see x_name and x_password, but never the actual values.
@@ -35,7 +42,7 @@ sensitive_data: dict[str, str | dict[str, str]] = {
 	'https://*.google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'},
 }
 # Update task to use one of the credentials above
-task = 'Go to example.com and login with company_username and company_password'
+task = 'Go to google.com and put the login information in the search bar.'
 
 # Always set allowed_domains when using sensitive_data for security
 from browser_use.browser.session import BrowserSession

From 18db8926bd6290de6ef6bf1f21a85a0dbca0a5be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Wed, 2 Jul 2025 10:44:24 +0200
Subject: [PATCH 30/37] remove git function helpers

---
 git-functions.sh | 116 -----------------------------------------------
 1 file changed, 116 deletions(-)
 delete mode 100755 git-functions.sh

diff --git a/git-functions.sh b/git-functions.sh
deleted file mode 100755
index d0f2ee377..000000000
--- a/git-functions.sh
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/bin/bash
-
-# bcpp: Branch + Commit + Push + PR + Open
-bcpp() {
-    if [ $# -eq 0 ]; then
-        echo "❌ Usage: bcpp <commit message>"
-        echo "📝 Example: bcpp hello world"
-        return 1
-    fi
-    
-    # Check if gh CLI is installed
-    if ! command -v gh &> /dev/null; then
-        echo "❌ GitHub CLI (gh) is required. Install: https://cli.github.com/"
-        return 1
-    fi
-    
-    local commit_msg="$*"
-    local branch_name=$(echo "$commit_msg" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g' | sed 's/[^a-z0-9-]//g')
-    
-    # Check if branch already exists
-    if git show-ref --verify --quiet refs/heads/"$branch_name"; then
-        echo "❌ Branch '$branch_name' already exists!"
-        echo "💡 Try a different commit message or delete the existing branch"
-        return 1
-    fi
-    
-    echo "🌿 Creating branch: $branch_name"
-    git checkout -b "$branch_name" || return 1
-    
-    echo "📝 Adding all changes..."
-    git add . || return 1
-    
-    # Check if there are changes to commit
-    if git diff --staged --quiet; then
-        echo "ℹ️  No changes to commit"
-        git checkout - && git branch -d "$branch_name"
-        return 1
-    fi
-    
-    echo "💾 Committing: $commit_msg"
-    git commit -m "$commit_msg" || return 1
-    
-    echo "🚀 Pushing branch..."
-    git push -u origin "$branch_name" || return 1
-    
-    echo "�� Creating PR..."
-    gh pr create --title "$commit_msg" --body "Auto-generated PR for: $commit_msg" --head "$branch_name" || return 1
-    
-    echo "🌐 Opening PR in browser..."
-    gh pr view --web
-    
-    echo "✅ Done! Branch: $branch_name"
-}
-
-# cpp: Commit + Push + PR (no new branch)
-cpp() {
-    if [ $# -eq 0 ]; then
-        echo "❌ Usage: cpp <commit message>"
-        echo "📝 Example: cpp hello world"
-        return 1
-    fi
-    
-    local commit_msg="$*"
-    local current_branch=$(git branch --show-current)
-    
-    echo "📝 Adding all changes..."
-    git add . || return 1
-    
-    # Check if there are changes to commit
-    if git diff --staged --quiet; then
-        echo "ℹ️  No changes to commit"
-        return 1
-    fi
-    
-    echo "💾 Committing to $current_branch: $commit_msg"
-    git commit -m "$commit_msg" || return 1
-    
-    echo "🚀 Pushing to $current_branch..."
-    git push || return 1
-    
-    # Only create PR if not on main/master and gh CLI is available
-    if [[ "$current_branch" != "main" && "$current_branch" != "master" ]] && command -v gh &> /dev/null; then
-        echo "🔄 Creating/updating PR from $current_branch..."
-        
-        # Check if PR already exists
-        if gh pr view --json number &> /dev/null; then
-            echo "ℹ️  PR already exists, updated with new commit"
-            gh pr view --web
-        else
-            gh pr create --title "$commit_msg" --body "Auto-generated PR for: $commit_msg" --head "$current_branch"
-            echo "🌐 Opening PR in browser..."
-            gh pr view --web
-        fi
-    else
-        echo "ℹ️  On main branch or gh CLI not available, skipping PR creation"
-    fi
-    
-    echo "✅ Done!"
-}
-
-# Quick help function
-git-help() {
-    echo "🚀 Git Automation Functions:"
-    echo ""
-    echo "  bcpp <message>  - Branch + Commit + Push + PR"
-    echo "                    Creates new branch, commits, pushes, creates PR"
-    echo "                    Example: bcpp fix login bug"
-    echo ""
-    echo "  cpp <message>   - Commit + Push + PR (current branch)"
-    echo "                    Commits to current branch, pushes, creates/updates PR"
-    echo "                    Example: cpp update documentation"
-    echo ""
-    echo "💡 Both functions automatically open the PR in your browser"
-}
-
-echo "🎉 Git automation functions loaded! Type 'git-help' for usage info."

From 695171f90b1861761e9301d9a577e630c139d17a Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 2 Jul 2025 09:00:45 +0000
Subject: [PATCH 31/37] Remove mem0ai dependency from project requirements

Co-authored-by: mailmertunsal <mailmertunsal@gmail.com>
---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f990f3056..ba63f8640 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,6 @@ dependencies = [
     "google-api-core>=2.25.0",
     "httpx>=0.28.1",
     "markdownify==1.1.0",
-    "mem0ai>=0.1.106",
     "patchright>=1.52.5",
     "playwright>=1.52.0",
     "portalocker>=2.7.0,<3.0.0",

From 3a5f43bb3fa8a9af0548fb35a2ea69b58c12091a Mon Sep 17 00:00:00 2001
From: mertunsall <mertunsal1905@gmail.com>
Date: Wed, 2 Jul 2025 11:07:58 +0200
Subject: [PATCH 32/37] eval should run with temperature 1 for o3

---
 eval/service.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval/service.py b/eval/service.py
index d13e9baa0..ae1c3d137 100644
--- a/eval/service.py
+++ b/eval/service.py
@@ -882,7 +882,7 @@ def get_llm(model_name: str):
 		case 'openai':
 			kwargs = {'model': config['model_name'], 'temperature': 0.0}
 			# Must set temperatue=1 if model is gpt-o4-mini
-			if model_name == 'gpt-o4-mini':
+			if model_name in ['gpt-o4-mini', 'gpt-o3']:
 				kwargs['temperature'] = 1
 			if api_key:
 				kwargs['api_key'] = api_key

From e13c4cee9cdec232d829bdb503263d93973848ac Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 2 Jul 2025 09:22:12 +0000
Subject: [PATCH 33/37] Clarify definition of <new> tag in browser state
 documentation

Co-authored-by: mailmertunsal <mailmertunsal@gmail.com>
---
 browser_use/agent/system_prompt.md             | 2 +-
 browser_use/agent/system_prompt_no_thinking.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
index 6e1d272d2..23bc83a92 100644
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -61,7 +61,7 @@ Examples:
 Note that:
 - Only elements with numeric indexes in [] are interactive
 - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements with <new> tags are new elements that were added after the previous step (if url has not changed)
+- Elements with <new> tags are elements that appeared on the website since the last step - an element is only tagged as "new" if it was not present in the previous step but is now visible on the page (if url has not changed)
 - Pure text elements without [] are not interactive.
 </browser_state>
 
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md
index b27294d19..aede9a858 100644
--- a/browser_use/agent/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompt_no_thinking.md
@@ -61,7 +61,7 @@ Examples:
 Note that:
 - Only elements with numeric indexes in [] are interactive
 - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements with <new> tags are new elements that were added after the previous step (if url has not changed)
+- Elements with <new> tags are elements that appeared on the website since the last step - an element is only tagged as "new" if it was not present in the previous step but is now visible on the page (if url has not changed)
 - Pure text elements without [] are not interactive.
 </browser_state>
 

From 755b96e6a0980f6a3e6b2c88976954c64dee0b9e Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 2 Jul 2025 09:23:26 +0000
Subject: [PATCH 34/37] Simplify note about <new> tag in browser state
 description

Co-authored-by: mailmertunsal <mailmertunsal@gmail.com>
---
 browser_use/agent/system_prompt.md             | 2 +-
 browser_use/agent/system_prompt_no_thinking.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
index 23bc83a92..c7d8480e6 100644
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -61,7 +61,7 @@ Examples:
 Note that:
 - Only elements with numeric indexes in [] are interactive
 - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements with <new> tags are elements that appeared on the website since the last step - an element is only tagged as "new" if it was not present in the previous step but is now visible on the page (if url has not changed)
+- Elements with <new> tags appeared on the website since the last step (if url has not changed)
 - Pure text elements without [] are not interactive.
 </browser_state>
 
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md
index aede9a858..3a6c9070d 100644
--- a/browser_use/agent/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompt_no_thinking.md
@@ -61,7 +61,7 @@ Examples:
 Note that:
 - Only elements with numeric indexes in [] are interactive
 - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements with <new> tags are elements that appeared on the website since the last step - an element is only tagged as "new" if it was not present in the previous step but is now visible on the page (if url has not changed)
+- Elements with <new> tags appeared on the website since the last step (if url has not changed)
 - Pure text elements without [] are not interactive.
 </browser_state>
 

From fd5bb775dcddfd2b51fd0f89dac610770afc7838 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 2 Jul 2025 09:25:07 +0000
Subject: [PATCH 35/37] Clarify description of <new> elements in browser state
 documentation

Co-authored-by: mailmertunsal <mailmertunsal@gmail.com>
---
 browser_use/agent/system_prompt.md             | 2 +-
 browser_use/agent/system_prompt_no_thinking.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
index c7d8480e6..f45d7d10f 100644
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -61,7 +61,7 @@ Examples:
 Note that:
 - Only elements with numeric indexes in [] are interactive
 - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements with <new> tags appeared on the website since the last step (if url has not changed)
+- Elements tagged with <new> are the new clickable elements that appeared on the website since the last step - if url has not changed
 - Pure text elements without [] are not interactive.
 </browser_state>
 
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md
index 3a6c9070d..be8e94785 100644
--- a/browser_use/agent/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompt_no_thinking.md
@@ -61,7 +61,7 @@ Examples:
 Note that:
 - Only elements with numeric indexes in [] are interactive
 - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements with <new> tags appeared on the website since the last step (if url has not changed)
+- Elements tagged with <new> are the new clickable elements that appeared on the website since the last step - if url has not changed
 - Pure text elements without [] are not interactive.
 </browser_state>
 

From 9dc85ee203e9206f99655e453e244c563caa1bfe Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 2 Jul 2025 09:25:18 +0000
Subject: [PATCH 36/37] Checkpoint before follow-up message

---
 browser_use/agent/system_prompt.md             | 2 +-
 browser_use/agent/system_prompt_no_thinking.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
index f45d7d10f..08feb8cfa 100644
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -61,7 +61,7 @@ Examples:
 Note that:
 - Only elements with numeric indexes in [] are interactive
 - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements tagged with <new> are the new clickable elements that appeared on the website since the last step - if url has not changed
+- Elements tagged with <new> are the new clickable elements that appeared on the website since the last step - if url has not changed.
 - Pure text elements without [] are not interactive.
 </browser_state>
 
diff --git a/browser_use/agent/system_prompt_no_thinking.md b/browser_use/agent/system_prompt_no_thinking.md
index be8e94785..22d066d8a 100644
--- a/browser_use/agent/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompt_no_thinking.md
@@ -61,7 +61,7 @@ Examples:
 Note that:
 - Only elements with numeric indexes in [] are interactive
 - (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
-- Elements tagged with <new> are the new clickable elements that appeared on the website since the last step - if url has not changed
+- Elements tagged with <new> are the new clickable elements that appeared on the website since the last step - if url has not changed.
 - Pure text elements without [] are not interactive.
 </browser_state>
 

From 239fd3f86b3504538379093913875c30587814f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?=
 <67061560+MagMueller@users.noreply.github.com>
Date: Wed, 2 Jul 2025 14:10:22 +0200
Subject: [PATCH 37/37] eval include runner link

---
 .github/workflows/eval.yaml | 10 ++++++++++
 eval/service.py             | 21 +++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml
index 147983367..0ce5a61e2 100644
--- a/.github/workflows/eval.yaml
+++ b/.github/workflows/eval.yaml
@@ -106,6 +106,13 @@ jobs:
           ps aux | wc -l
           echo "================================="
 
+      - name: Construct GitHub Workflow URL
+        id: github_url
+        run: |
+          GITHUB_WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          echo "GITHUB_WORKFLOW_URL=$GITHUB_WORKFLOW_URL" >> $GITHUB_OUTPUT
+          echo "::notice title=Workflow URL::Workflow URL: $GITHUB_WORKFLOW_URL"
+
       - name: Construct eval command
         id: eval_command
         run: |
@@ -218,6 +225,9 @@ jobs:
           [[ -n "$TASK_TEXT" ]] && CMD_ARGS+=("--task-text" "$TASK_TEXT")
           [[ -n "$TASK_WEBSITE" ]] && CMD_ARGS+=("--task-website" "$TASK_WEBSITE")
 
+          # Add GitHub workflow URL
+          [[ -n "${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}" ]] && CMD_ARGS+=("--github-workflow-url" "${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}")
+
           # Convert array to command string with proper escaping
           printf -v CMD_STRING '%q ' "${CMD_ARGS[@]}"
 
diff --git a/eval/service.py b/eval/service.py
index ae1c3d137..862acef10 100644
--- a/eval/service.py
+++ b/eval/service.py
@@ -535,6 +535,7 @@ class TaskResult:
 	task: Any
 	max_steps: int
 	laminar_link: str | None = None
+	github_workflow_url: str | None = None
 	completed_stages: set[Stage] = field(default_factory=set)
 	stage_data: dict[Stage, Any] = field(default_factory=dict)
 	errors: list = field(default_factory=list)
@@ -576,6 +577,7 @@ class TaskResult:
 			'critical_error': self.critical_error,
 			'server_save_failed': self.server_save_failed,
 			'laminarTaskLink': self.laminar_link,
+			'githubWorkflowUrl': self.github_workflow_url,
 		}
 
 		# Add task execution data if available
@@ -1582,6 +1584,7 @@ async def run_task_with_semaphore(
 	headless: bool,
 	use_vision: bool,
 	semaphore_runs: asyncio.Semaphore,  # Pass semaphore as argument
+	github_workflow_url: str | None = None,
 	use_serp: bool = False,
 	enable_memory: bool = False,
 	memory_interval: int = 10,
@@ -1653,7 +1656,9 @@ async def run_task_with_semaphore(
 				logger.debug(f'Task {task.task_id}: No Laminar run ID available, skipping datapoint creation')
 
 				# Initialize task result and basic setup
-			task_result = TaskResult(task.task_id, run_id, task.confirmed_task, task, max_steps_per_task, laminar_task_link)
+			task_result = TaskResult(
+				task.task_id, run_id, task.confirmed_task, task, max_steps_per_task, laminar_task_link, github_workflow_url
+			)
 
 			task_folder = Path(f'saved_trajectories/{task.task_id}')
 
@@ -1847,7 +1852,13 @@ async def run_task_with_semaphore(
 				# Create minimal task result for server reporting
 				try:
 					task_result = TaskResult(
-						task.task_id, run_id, task.confirmed_task, task, max_steps_per_task, laminar_task_link
+						task.task_id,
+						run_id,
+						task.confirmed_task,
+						task,
+						max_steps_per_task,
+						laminar_task_link,
+						github_workflow_url,
 					)
 					task_result.mark_critical_error(f'Initialization failed: {str(init_error)}')
 				except Exception as result_error:
@@ -1908,6 +1919,7 @@ async def run_multiple_tasks(
 	convex_url: str,
 	secret_key: str,
 	eval_model: BaseChatModel,
+	github_workflow_url: str | None = None,
 	max_parallel_runs: int = 3,
 	max_steps_per_task: int = 25,
 	start_index: int = 0,
@@ -1992,6 +2004,7 @@ async def run_multiple_tasks(
 					headless=headless,
 					use_vision=use_vision,
 					semaphore_runs=semaphore_runs,  # Pass the semaphore
+					github_workflow_url=github_workflow_url,
 					use_serp=use_serp,
 					enable_memory=enable_memory,
 					memory_interval=memory_interval,
@@ -2254,6 +2267,7 @@ async def run_evaluation_pipeline(
 	convex_url: str,
 	secret_key: str,
 	eval_model: BaseChatModel,
+	github_workflow_url: str | None = None,
 	max_parallel_runs: int = 3,
 	max_steps_per_task: int = 25,
 	start_index: int = 0,
@@ -2306,6 +2320,7 @@ async def run_evaluation_pipeline(
 		convex_url=convex_url,
 		secret_key=secret_key,
 		eval_model=eval_model,
+		github_workflow_url=github_workflow_url,
 		max_parallel_runs=max_parallel_runs,
 		max_steps_per_task=max_steps_per_task,
 		start_index=start_index,
@@ -2388,6 +2403,7 @@ if __name__ == '__main__':
 	parser.add_argument('--use-mind2web-judge', action='store_true', help='Use original judge')
 	parser.add_argument('--no-thinking', action='store_true', help='Disable thinking in agent system prompt')
 	parser.add_argument('--use-anchor', action='store_true', help='Use anchor to navigate to the page')
+	parser.add_argument('--github-workflow-url', type=str, default=None, help='GitHub workflow URL for tracking')
 
 	# Single task mode arguments
 	parser.add_argument('--task-text', type=str, default=None, help='Task description for single task mode')
@@ -2615,6 +2631,7 @@ if __name__ == '__main__':
 				convex_url=convex_url,
 				secret_key=secret_key,
 				eval_model=eval_model,
+				github_workflow_url=args.github_workflow_url,
 				max_parallel_runs=parallel_runs,
 				max_steps_per_task=args.max_steps,
 				start_index=start_index,