Updates scroll functionality to use page units

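Replaces pixel-based scrolling with page-based scrolling: the scroll action now takes a `num_pages` parameter (e.g., 0.5 for half a page, 2.0 for two pages) instead of a pixel `amount`, and the pixel distance is computed from the window height, defaulting to one page when `num_pages` is not given. This makes scroll actions more intuitive to specify. Adjusts the related system prompts, action description, example, and tests to reflect this change for improved code consistency and user experience.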
2026-05-06 17:52:15 +02:00 · 2025-07-07 18:21:37 +02:00
parent b5d0d6577c
commit 539274a7d4
6 changed files with 39 additions and 28 deletions
--- a/browser_use/agent/system_prompt.md
+++ b/browser_use/agent/system_prompt.md
@@ -77,7 +77,7 @@ Strictly follow these rules while using the browser and navigating the web:
 - If research is needed, open a **new tab** instead of reusing the current one.
 - If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
 - By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page. The extract content action gets the full loaded page content.
- You can scroll by a specific amount of pixels using the amount parameter, or scroll by one page height if no amount is specified.
+- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages), or scroll by one page if no num_pages is specified.
 - If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
 - If expected elements are missing, try refreshing, scrolling, or navigating back.
 - If the page is not fully loaded, use the wait action.
--- a/browser_use/agent/system_prompt_no_thinking.md
+++ b/browser_use/agent/system_prompt_no_thinking.md
@@ -77,7 +77,7 @@ Strictly follow these rules while using the browser and navigating the web:
 - If research is needed, open a **new tab** instead of reusing the current one.
 - If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
 - By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page. The extract content action gets the full loaded page content.
- You can scroll by a specific amount of pixels using the amount parameter, or scroll by one page height if no amount is specified.
+- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages), or scroll by one page if no num_pages is specified.
 - If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
 - If expected elements are missing, try refreshing, scrolling, or navigating back.
 - If the page is not fully loaded, use the wait action.
--- a/browser_use/controller/service.py
+++ b/browser_use/controller/service.py
@@ -462,7 +462,7 @@ Explain the content of the page and that the requested information is not availa
 		# 	)

 		@self.registry.action(
-			'Scroll the page by specified amount in pixels (set down=True to scroll down, down=False to scroll up, amount=pixels to scroll or None for one page)',
+			'Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 1.0 for one page, etc. or None for one page)',
 			param_model=ScrollAction,
 		)
 		async def scroll(params: ScrollAction, browser_session: BrowserSession):
@@ -472,17 +472,22 @@ Explain the content of the page and that the requested information is not availa
 			"""
 			page = await browser_session.get_current_page()

-			# Determine scroll amount
-			if params.amount is not None:
-				scroll_amount = params.amount
+			# Get window height with retries
+			dy_result, action_result = await retry_async_function(
+				lambda: page.evaluate('() => window.innerHeight'), 'Scroll failed due to an error.'
+			)
+			if action_result:
+				return action_result
+			window_height = dy_result or 0
+
+			# Determine scroll amount based on num_pages
+			if params.num_pages is not None:
+				scroll_amount = int(window_height * params.num_pages)
+				pages_scrolled = params.num_pages
 			else:
-				# Get window height with retries (default behavior)
-				dy_result, action_result = await retry_async_function(
-					lambda: page.evaluate('() => window.innerHeight'), 'Scroll failed due to an error.'
-				)
-				if action_result:
-					return action_result
-				scroll_amount = dy_result or 0
+				# Default to one page
+				scroll_amount = window_height
+				pages_scrolled = 1.0

 			# Set direction based on down parameter
 			dy = scroll_amount if params.down else -scroll_amount
@@ -495,9 +500,13 @@ Explain the content of the page and that the requested information is not availa
 				logger.debug('Smart scroll failed; used window.scrollBy fallback', exc_info=e)

 			direction = 'down' if params.down else 'up'
-			if params.amount is not None:
-				msg = f'🔍 Scrolled {direction} the page by {params.amount} pixels'
-				long_term_memory = f'Scrolled {direction} the page by {params.amount} pixels'
+			if params.num_pages is not None:
+				if pages_scrolled == 1.0:
+					msg = f'🔍 Scrolled {direction} the page by one page'
+					long_term_memory = f'Scrolled {direction} the page by one page'
+				else:
+					msg = f'🔍 Scrolled {direction} the page by {pages_scrolled} pages'
+					long_term_memory = f'Scrolled {direction} the page by {pages_scrolled} pages'
 			else:
 				msg = f'🔍 Scrolled {direction} the page by one page'
 				long_term_memory = f'Scrolled {direction} the page by one page'
--- a/browser_use/controller/views.py
+++ b/browser_use/controller/views.py
@@ -46,7 +46,9 @@ class CloseTabAction(BaseModel):

 class ScrollAction(BaseModel):
 	down: bool  # True to scroll down, False to scroll up
-	amount: int | None = None  # Number of pixels to scroll. If None, defaults to one page height
+	num_pages: float | None = (
+		None  # Number of pages to scroll (0.5 = half page, 1.0 = one page, etc.). If None, defaults to one page
+	)


 class SendKeysAction(BaseModel):
--- a/examples/use-cases/scrolling_page.py
+++ b/examples/use-cases/scrolling_page.py
@@ -18,12 +18,12 @@ if not os.getenv('OPENAI_API_KEY'):
 	raise ValueError('OPENAI_API_KEY is not set')

 """
-Example: Using the 'Scroll' action with custom amounts.
+Example: Using the 'Scroll' action with custom page amounts.

-This script demonstrates how the agent can navigate to a webpage and scroll by specific amounts.
+This script demonstrates how the agent can navigate to a webpage and scroll by specific page amounts.
 The scroll action now supports:
- Scrolling by a specific number of pixels using the 'amount' parameter
- Scrolling by one page height if no amount is specified (default behavior)
+- Scrolling by a specific number of pages using the 'num_pages' parameter (e.g., 0.5 for half page, 1.0 for one page, 2.0 for two pages)
+- Scrolling by one page height if no num_pages is specified (default behavior)
 - Scrolling up or down using the 'down' parameter
 """

@@ -33,7 +33,7 @@ browser_profile = BrowserProfile(headless=False)
 browser_session = BrowserSession(browser_profile=browser_profile)

 agent = Agent(
-	task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll down by one page - then scroll up by 100 pixels - then scroll down by 100 pixels - then scroll down by 10000 pixels.",
+	task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll down by one page - then scroll up by 0.5 pages - then scroll down by 0.25 pages - then scroll down by 2 pages.",
 	# Alternative task to demonstrate text-based scrolling:
 	# task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll to the string 'The vast majority of computer'",
 	llm=llm,
--- a/tests/ci/test_controller.py
+++ b/tests/ci/test_controller.py
@@ -144,8 +144,8 @@ class TestControllerIntegration:
 		assert result.extracted_content is not None
 		assert 'Scrolled down' in result.extracted_content

-		# Test 2: Custom scroll amount up
-		scroll_up_action = {'scroll': ScrollAction(down=False, amount=250)}
+		# Test 2: Custom scroll num_pages up (quarter page)
+		scroll_up_action = {'scroll': ScrollAction(down=False, num_pages=0.25)}

 		class ScrollUpActionModel(ActionModel):
 			scroll: ScrollAction | None = None
@@ -157,10 +157,10 @@ class TestControllerIntegration:
 		assert isinstance(result, ActionResult)
 		assert result.extracted_content is not None
 		assert 'Scrolled up' in result.extracted_content
-		assert '250 pixels' in result.extracted_content
+		assert '0.25 pages' in result.extracted_content

-		# Test 3: Custom scroll amount down
-		scroll_custom_action = {'scroll': ScrollAction(down=True, amount=500)}
+		# Test 3: Custom scroll num_pages down (half page)
+		scroll_custom_action = {'scroll': ScrollAction(down=True, num_pages=0.5)}

 		class ScrollCustomActionModel(ActionModel):
 			scroll: ScrollAction | None = None
@@ -172,7 +172,7 @@ class TestControllerIntegration:
 		assert isinstance(result, ActionResult)
 		assert result.extracted_content is not None
 		assert 'Scrolled down' in result.extracted_content
-		assert '500 pixels' in result.extracted_content
+		assert '0.5 pages' in result.extracted_content

 	async def test_registry_actions(self, controller, browser_session):
 		"""Test that the registry contains the expected default actions."""