import asyncio
import logging
from typing import Optional
from urllib.parse import quote_plus

from main_content_extractor import MainContentExtractor

from browser_use.agent.views import ActionModel, ActionResult
from browser_use.browser.service import Browser, BrowserConfig
from browser_use.controller.registry.service import Registry
from browser_use.controller.views import (
    ClickElementAction,
    DoneAction,
    ExtractPageContentAction,
    GoToUrlAction,
    InputTextAction,
    OpenTabAction,
    ScrollAction,
    SearchGoogleAction,
    SwitchTabAction,
)

# NOTE: `time_execution_sync` is no longer used below; the import is kept so
# that anything importing it from this module keeps working.
from browser_use.utils import time_execution_async, time_execution_sync

logger = logging.getLogger(__name__)


class Controller:
    """Own a ``Browser`` and an action ``Registry`` and execute actions on them.

    On construction the default browser actions (navigation, element
    interaction, tab management, content extraction, scrolling, completion)
    are registered; further actions can be added with :meth:`action`.
    """

    def __init__(
        self,
        browser_config: Optional[BrowserConfig] = None,
    ):
        """Create the browser and registry and register the default actions.

        @param browser_config: Configuration passed to ``Browser``. When
            omitted, a fresh ``BrowserConfig()`` is created for this
            controller so instances never share one default object.
        """
        if browser_config is None:
            browser_config = BrowserConfig()

        self.browser = Browser(config=browser_config)
        # Pause (passed to ``asyncio.sleep``) between two actions in ``multi_act``.
        self.wait_between_actions = browser_config.wait_between_actions
        self.registry = Registry()
        self._register_default_actions()

    def _register_default_actions(self):
        """Register all default browser actions"""

        # Basic Navigation Actions
        @self.registry.action(
            'Search Google in the current tab',
            param_model=SearchGoogleAction,
            requires_browser=True,
        )
        async def search_google(params: SearchGoogleAction, browser: Browser):
            """Open the Google results page for ``params.query`` in the current page."""
            page = await browser.get_current_page()
            # Percent-encode the query so characters such as '&', '#', '+' or
            # '%' stay part of the search term instead of altering the URL.
            query = quote_plus(str(params.query))
            await page.goto(f'https://www.google.com/search?q={query}')
            await page.wait_for_load_state()

        @self.registry.action(
            'Navigate to URL in the current tab', param_model=GoToUrlAction, requires_browser=True
        )
        async def go_to_url(params: GoToUrlAction, browser: Browser):
            """Navigate the current page to ``params.url`` and wait for it to load."""
            page = await browser.get_current_page()
            await page.goto(params.url)
            await page.wait_for_load_state()

        @self.registry.action('Go back', requires_browser=True)
        async def go_back(browser: Browser):
            """Go one step back in the current page's history and wait for load."""
            page = await browser.get_current_page()
            await page.go_back()
            await page.wait_for_load_state()

        # Element Interaction Actions
        @self.registry.action(
            'Click element', param_model=ClickElementAction, requires_browser=True
        )
        async def click_element(params: ClickElementAction, browser: Browser):
            """Click the element stored under ``params.index`` in the cached selector map.

            Raises ``Exception`` when the index is not in the cached selector
            map. A failure during the click itself is logged and returned as
            ``ActionResult(error=...)`` instead of being raised.
            """
            session = await browser.get_session()
            state = session.cached_state

            if params.index not in state.selector_map:
                raise Exception(
                    f'Element with index {params.index} does not exist - retry or use alternative actions'
                )

            element_node = state.selector_map[params.index]
            initial_pages = len(session.context.pages)

            try:
                await browser._click_element_node(element_node)
                msg = f'🖱️ Clicked element {params.index}: {element_node.xpath}'
                # The click opened an additional page: move focus to the last tab.
                if len(session.context.pages) > initial_pages:
                    await browser.switch_to_tab(-1)
                return ActionResult(extracted_content=msg, include_in_memory=True)
            except Exception as e:
                logger.warning(
                    f'Element no longer available with index {params.index} - most likely the page changed'
                )
                return ActionResult(error=str(e))

        @self.registry.action('Input text', param_model=InputTextAction, requires_browser=True)
        async def input_text(params: InputTextAction, browser: Browser):
            """Type ``params.text`` into the element stored under ``params.index``.

            Raises ``Exception`` when the index is not in the cached selector map.
            """
            session = await browser.get_session()
            state = session.cached_state

            if params.index not in state.selector_map:
                raise Exception(
                    f'Element index {params.index} does not exist - retry or use alternative actions'
                )

            element_node = state.selector_map[params.index]
            await browser._input_text_element_node(element_node, params.text)
            msg = f'⌨️ Input "{params.text}" into {params.index}: {element_node.xpath}'
            return ActionResult(extracted_content=msg, include_in_memory=True)

        # Tab Management Actions
        @self.registry.action('Switch tab', param_model=SwitchTabAction, requires_browser=True)
        async def switch_tab(params: SwitchTabAction, browser: Browser):
            """Switch to the tab identified by ``params.page_id``."""
            await browser.switch_to_tab(params.page_id)
            # Wait for tab to be ready
            page = await browser.get_current_page()
            await page.wait_for_load_state()

        @self.registry.action(
            'Open url in new tab', param_model=OpenTabAction, requires_browser=True
        )
        async def open_tab(params: OpenTabAction, browser: Browser):
            """Open ``params.url`` in a new tab."""
            await browser.create_new_tab(params.url)

        # Content Actions
        @self.registry.action(
            'Extract page content to get the text or markdown ',
            param_model=ExtractPageContentAction,
            requires_browser=True,
        )
        async def extract_content(params: ExtractPageContentAction, browser: Browser):
            """Extract the main content of the current page's HTML.

            ``params.value`` is forwarded as the extractor's ``output_format``.
            """
            page = await browser.get_current_page()
            content = MainContentExtractor.extract(  # type: ignore
                html=await page.content(),
                output_format=params.value,
            )
            return ActionResult(extracted_content=content)

        @self.registry.action('Complete task', param_model=DoneAction, requires_browser=True)
        async def done(params: DoneAction, browser: Browser):
            """Mark the task as finished and return ``params.text`` as the result."""
            # NOTE(review): the session itself is not used here; the call is
            # kept in case get_session() has initialisation side effects.
            await browser.get_session()
            return ActionResult(is_done=True, extracted_content=params.text)

        @self.registry.action(
            'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
            param_model=ScrollAction,
            requires_browser=True,
        )
        async def scroll_down(params: ScrollAction, browser: Browser):
            """Scroll down by ``params.amount`` pixels, or press PageDown if unset."""
            page = await browser.get_current_page()
            if params.amount is not None:
                await page.evaluate(f'window.scrollBy(0, {params.amount});')
                amount = f'{params.amount} pixels'
            else:
                await page.keyboard.press('PageDown')
                # No unit here: "one page pixels" would be a malformed message.
                amount = 'one page'

            return ActionResult(
                extracted_content=f'Scrolled down the page by {amount}',
                include_in_memory=True,
            )

        # scroll up
        @self.registry.action(
            'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
            param_model=ScrollAction,
            requires_browser=True,
        )
        async def scroll_up(params: ScrollAction, browser: Browser):
            """Scroll up by ``params.amount`` pixels, or press PageUp if unset."""
            page = await browser.get_current_page()
            if params.amount is not None:
                # Parenthesised so a negative amount renders as `-(-5)` rather
                # than `--5`, which is not valid JavaScript.
                await page.evaluate(f'window.scrollBy(0, -({params.amount}));')
                amount = f'{params.amount} pixels'
            else:
                await page.keyboard.press('PageUp')
                amount = 'one page'

            return ActionResult(
                extracted_content=f'Scrolled up the page by {amount}',
                include_in_memory=True,
            )

    def action(self, description: str, **kwargs):
        """Decorator for registering custom actions

        @param description: Describe to the LLM what the function does
            (better description == better function calling)
        @param kwargs: Forwarded unchanged to ``Registry.action``.
        """
        return self.registry.action(description, **kwargs)

    @time_execution_async('--multi-act')
    async def multi_act(self, actions: list[ActionModel]) -> list[ActionResult]:
        """Execute multiple actions

        Actions run in order. Execution stops after an action whose result is
        done or carries an error, or after the last action. Between actions
        the page state is re-read; once it contains element hashes that were
        not in the state cached at the start, the batch stops before the next
        action that addresses an element by index.

        @param actions: Actions to execute, in order.
        @return: One ``ActionResult`` per action that was actually executed.
        """
        results = []
        changed = False

        await self.browser.remove_highlights()
        session = await self.browser.get_session()
        cached_selector_map = session.cached_state.selector_map
        cached_att_hashes = {e.hash.attributes_hash for e in cached_selector_map.values()}
        cached_path_hashes = {e.hash.branch_path_hash for e in cached_selector_map.values()}

        for i, action in enumerate(actions):
            if changed and action.get_index() is not None:
                # next action requires index but there are new elements on the page
                break

            results.append(await self.act(action))
            logger.debug(f'Executed action {i + 1} / {len(actions)}')

            if results[-1].is_done or results[-1].error or i == len(actions) - 1:
                break

            await asyncio.sleep(self.wait_between_actions)

            # hash all elements. if it is a subset of cached_state its fine -
            # else break (new elements on page)
            new_state = await self.browser.get_state()
            new_selector_map = new_state.selector_map

            new_att_hashes = {e.hash.attributes_hash for e in new_selector_map.values()}
            if not new_att_hashes.issubset(cached_att_hashes):
                logger.debug(f'Attributes changed - stopping after {i + 1} actions')
                changed = True

            new_path_hashes = {e.hash.branch_path_hash for e in new_selector_map.values()}
            if not new_path_hashes.issubset(cached_path_hashes):
                logger.debug(f'Branch path changed - stopping after {i + 1} actions')
                changed = True

        return results

    # `act` is a coroutine function, so it uses the async timer like
    # `multi_act`. NOTE(review): a sync timer around an `async def` presumably
    # only times coroutine creation - confirm against browser_use.utils.
    @time_execution_async('--act')
    async def act(self, action: ActionModel) -> ActionResult:
        """Execute an action

        Runs the first entry of ``action.model_dump(exclude_unset=True)`` whose
        params are not ``None`` through the registry and normalises the result.

        @param action: Action model to execute.
        @return: The registry's ``ActionResult``; a ``str`` result wrapped as
            ``extracted_content``; or an empty ``ActionResult`` when the
            registry returned ``None`` or no entry had params.
        @raise ValueError: If the registry returns any other type.
        """
        for action_name, params in action.model_dump(exclude_unset=True).items():
            if params is None:
                continue

            result = await self.registry.execute_action(
                action_name, params, browser=self.browser
            )

            if isinstance(result, str):
                return ActionResult(extracted_content=result)
            if isinstance(result, ActionResult):
                return result
            if result is None:
                return ActionResult()
            raise ValueError(f'Invalid action result type: {type(result)} of {result}')

        return ActionResult()