Updates scroll functionality to use page units

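Replaces pixel-based scrolling with page-based scrolling: the scroll action now takes a num_pages parameter (fractional values such as 0.5 are allowed) instead of a pixel amount, and the scroll distance is computed from the window height, making scroll actions more intuitive. Adjusts related documentation, examples, and tests to reflect this change for improved code consistency and user experience.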
This commit is contained in:
Daniel T.
2025-07-07 18:21:37 +02:00
parent b5d0d6577c
commit 539274a7d4
6 changed files with 39 additions and 28 deletions

View File

@@ -77,7 +77,7 @@ Strictly follow these rules while using the browser and navigating the web:
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page. The extract content action gets the full loaded page content.
- You can scroll by a specific amount of pixels using the amount parameter, or scroll by one page height if no amount is specified.
- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages), or scroll by one page if no num_pages is specified.
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
- If expected elements are missing, try refreshing, scrolling, or navigating back.
- If the page is not fully loaded, use the wait action.

View File

@@ -77,7 +77,7 @@ Strictly follow these rules while using the browser and navigating the web:
- If research is needed, open a **new tab** instead of reusing the current one.
- If the page changes after, for example, an input text action, analyse if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page. The extract content action gets the full loaded page content.
- You can scroll by a specific amount of pixels using the amount parameter, or scroll by one page height if no amount is specified.
- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages), or scroll by one page if no num_pages is specified.
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
- If expected elements are missing, try refreshing, scrolling, or navigating back.
- If the page is not fully loaded, use the wait action.

View File

@@ -462,7 +462,7 @@ Explain the content of the page and that the requested information is not availa
# )
@self.registry.action(
'Scroll the page by specified amount in pixels (set down=True to scroll down, down=False to scroll up, amount=pixels to scroll or None for one page)',
'Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 1.0 for one page, etc. or None for one page)',
param_model=ScrollAction,
)
async def scroll(params: ScrollAction, browser_session: BrowserSession):
@@ -472,17 +472,22 @@ Explain the content of the page and that the requested information is not availa
"""
page = await browser_session.get_current_page()
# Determine scroll amount
if params.amount is not None:
scroll_amount = params.amount
# Get window height with retries
dy_result, action_result = await retry_async_function(
lambda: page.evaluate('() => window.innerHeight'), 'Scroll failed due to an error.'
)
if action_result:
return action_result
window_height = dy_result or 0
# Determine scroll amount based on num_pages
if params.num_pages is not None:
scroll_amount = int(window_height * params.num_pages)
pages_scrolled = params.num_pages
else:
# Get window height with retries (default behavior)
dy_result, action_result = await retry_async_function(
lambda: page.evaluate('() => window.innerHeight'), 'Scroll failed due to an error.'
)
if action_result:
return action_result
scroll_amount = dy_result or 0
# Default to one page
scroll_amount = window_height
pages_scrolled = 1.0
# Set direction based on down parameter
dy = scroll_amount if params.down else -scroll_amount
@@ -495,9 +500,13 @@ Explain the content of the page and that the requested information is not availa
logger.debug('Smart scroll failed; used window.scrollBy fallback', exc_info=e)
direction = 'down' if params.down else 'up'
if params.amount is not None:
msg = f'🔍 Scrolled {direction} the page by {params.amount} pixels'
long_term_memory = f'Scrolled {direction} the page by {params.amount} pixels'
if params.num_pages is not None:
if pages_scrolled == 1.0:
msg = f'🔍 Scrolled {direction} the page by one page'
long_term_memory = f'Scrolled {direction} the page by one page'
else:
msg = f'🔍 Scrolled {direction} the page by {pages_scrolled} pages'
long_term_memory = f'Scrolled {direction} the page by {pages_scrolled} pages'
else:
msg = f'🔍 Scrolled {direction} the page by one page'
long_term_memory = f'Scrolled {direction} the page by one page'

View File

@@ -46,7 +46,9 @@ class CloseTabAction(BaseModel):
class ScrollAction(BaseModel):
down: bool # True to scroll down, False to scroll up
amount: int | None = None # Number of pixels to scroll. If None, defaults to one page height
num_pages: float | None = (
None # Number of pages to scroll (0.5 = half page, 1.0 = one page, etc.). If None, defaults to one page
)
class SendKeysAction(BaseModel):

View File

@@ -18,12 +18,12 @@ if not os.getenv('OPENAI_API_KEY'):
raise ValueError('OPENAI_API_KEY is not set')
"""
Example: Using the 'Scroll' action with custom amounts.
Example: Using the 'Scroll' action with custom page amounts.
This script demonstrates how the agent can navigate to a webpage and scroll by specific amounts.
This script demonstrates how the agent can navigate to a webpage and scroll by specific page amounts.
The scroll action now supports:
- Scrolling by a specific number of pixels using the 'amount' parameter
- Scrolling by one page height if no amount is specified (default behavior)
- Scrolling by a specific number of pages using the 'num_pages' parameter (e.g., 0.5 for half page, 1.0 for one page, 2.0 for two pages)
- Scrolling by one page height if no num_pages is specified (default behavior)
- Scrolling up or down using the 'down' parameter
"""
@@ -33,7 +33,7 @@ browser_profile = BrowserProfile(headless=False)
browser_session = BrowserSession(browser_profile=browser_profile)
agent = Agent(
task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll down by one page - then scroll up by 100 pixels - then scroll down by 100 pixels - then scroll down by 10000 pixels.",
task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll down by one page - then scroll up by 0.5 pages - then scroll down by 0.25 pages - then scroll down by 2 pages.",
# Alternative task to demonstrate text-based scrolling:
# task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll to the string 'The vast majority of computer'",
llm=llm,

View File

@@ -144,8 +144,8 @@ class TestControllerIntegration:
assert result.extracted_content is not None
assert 'Scrolled down' in result.extracted_content
# Test 2: Custom scroll amount up
scroll_up_action = {'scroll': ScrollAction(down=False, amount=250)}
# Test 2: Custom scroll num_pages up (quarter page)
scroll_up_action = {'scroll': ScrollAction(down=False, num_pages=0.25)}
class ScrollUpActionModel(ActionModel):
scroll: ScrollAction | None = None
@@ -157,10 +157,10 @@ class TestControllerIntegration:
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Scrolled up' in result.extracted_content
assert '250 pixels' in result.extracted_content
assert '0.25 pages' in result.extracted_content
# Test 3: Custom scroll amount down
scroll_custom_action = {'scroll': ScrollAction(down=True, amount=500)}
# Test 3: Custom scroll num_pages down (half page)
scroll_custom_action = {'scroll': ScrollAction(down=True, num_pages=0.5)}
class ScrollCustomActionModel(ActionModel):
scroll: ScrollAction | None = None
@@ -172,7 +172,7 @@ class TestControllerIntegration:
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Scrolled down' in result.extracted_content
assert '500 pixels' in result.extracted_content
assert '0.5 pages' in result.extracted_content
async def test_registry_actions(self, controller, browser_session):
"""Test that the registry contains the expected default actions."""