Files
browser-use/browser_use/skill_cli/commands/browser.py

774 lines
23 KiB
Python

"""Browser control commands."""
import asyncio
import base64
import logging
from pathlib import Path
from typing import Any
from browser_use.skill_cli.sessions import SessionInfo
logger = logging.getLogger(__name__)

# Names of the browser actions this module handles (see ``handle``).
COMMANDS = {
    # navigation and page state
    'open', 'back', 'scroll', 'state', 'screenshot', 'tab', 'wait',
    # pointer interaction
    'click', 'dblclick', 'rightclick', 'hover',
    # keyboard and form input
    'type', 'input', 'keys', 'select', 'upload',
    # reading data out of the page
    'eval', 'extract', 'get', 'cookies',
}
async def _execute_js(session: SessionInfo, js: str) -> Any:
    """Evaluate a JavaScript expression in the browser and return its value.

    The expression is sent through CDP ``Runtime.evaluate`` with
    ``returnByValue`` enabled, on the CDP session returned by
    ``get_or_create_cdp_session(target_id=None, focus=False)``.

    Args:
        session: CLI session whose ``browser_session`` is used.
        js: JavaScript source to evaluate.

    Returns:
        The ``value`` field of the evaluation result, or ``None`` when the
        response carries no ``result``/``value`` entry.

    Raises:
        RuntimeError: If no CDP session could be obtained.
    """
    browser = session.browser_session
    cdp = await browser.get_or_create_cdp_session(target_id=None, focus=False)
    if not cdp:
        raise RuntimeError('No active browser session')

    evaluate_params = {'expression': js, 'returnByValue': True}
    response = await cdp.cdp_client.send.Runtime.evaluate(
        params=evaluate_params,
        session_id=cdp.session_id,
    )
    remote_object = response.get('result', {})
    return remote_object.get('value')
async def _get_element_center(session: SessionInfo, node: Any) -> tuple[float, float] | None:
    """Return the centre point of an element, or ``None`` if it cannot be found.

    The element is first scrolled into view (best effort), then its rectangle
    is fetched with ``browser_session.get_element_coordinates`` and the centre
    is computed as ``(x + width / 2, y + height / 2)``.

    Args:
        session: CLI session whose ``browser_session`` is used.
        node: DOM node object exposing ``backend_node_id``.

    Returns:
        ``(center_x, center_y)``, or ``None`` when no rectangle is available or
        any step raises (the failure is logged, not propagated).
    """
    bs = session.browser_session
    try:
        cdp_session = await bs.cdp_client_for_node(node)
        session_id = cdp_session.session_id
        backend_node_id = node.backend_node_id

        # Scrolling is best effort: a failure here must not prevent the
        # coordinate lookup below, but it is logged instead of silently dropped.
        try:
            await cdp_session.cdp_client.send.DOM.scrollIntoViewIfNeeded(
                params={'backendNodeId': backend_node_id},
                session_id=session_id,
            )
            # Short pause after scrolling before reading coordinates.
            await asyncio.sleep(0.05)
        except Exception as scroll_error:
            logger.debug(f'Could not scroll element {backend_node_id} into view: {scroll_error}')

        element_rect = await bs.get_element_coordinates(backend_node_id, cdp_session)
        if not element_rect:
            return None

        center_x = element_rect.x + element_rect.width / 2
        center_y = element_rect.y + element_rect.height / 2
        return center_x, center_y
    except Exception as e:
        logger.error(f'Failed to get element center: {e}')
        return None
def _element_not_found(index: Any) -> dict[str, Any]:
    """Build the error payload returned when an element index cannot be resolved."""
    return {'error': f'Element index {index} not found - page may have changed'}


def _url_host(url: str) -> str:
    """Return the lower-cased hostname of *url*, or ``''`` if none can be parsed.

    Port and userinfo are dropped (``hostname`` rather than ``netloc``), and a
    scheme-less value such as ``example.com/path`` is treated as a host.
    """
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url if '://' in url else f'//{url}')
        return (parsed.hostname or '').lower()
    except ValueError:
        return ''


def _cookie_matches_host(cookie_domain: Any, host: str) -> bool:
    """Tell whether a cookie domain and a host belong to the same domain chain.

    Matches when the two are equal or when one is a dot-separated parent of
    the other (either direction, as the URL filters in this module have always
    accepted both). An empty cookie domain or host never matches.
    """
    domain = str(cookie_domain or '').lstrip('.').lower()
    if not domain or not host:
        return False
    # Compare on label boundaries so 'badexample.com' does not match 'example.com'.
    return host == domain or host.endswith(f'.{domain}') or domain.endswith(f'.{host}')


def _cookie_summary(cookie: Any) -> dict[str, Any]:
    """Copy the reported fields of a CDP cookie mapping into a plain dict."""
    summary: dict[str, Any] = {
        'name': cookie.get('name', ''),
        'value': cookie.get('value', ''),
        'domain': cookie.get('domain', ''),
        'path': cookie.get('path', '/'),
        'secure': cookie.get('secure', False),
        'httpOnly': cookie.get('httpOnly', False),
    }
    # Optional fields are only emitted when the browser reported them.
    for optional_key in ('sameSite', 'expires'):
        if optional_key in cookie:
            summary[optional_key] = cookie.get(optional_key)
    return summary


async def _collect_cookies(bs: Any, url: Any) -> list[dict[str, Any]]:
    """Fetch all cookies via CDP, optionally keeping only those matching *url*."""
    cookie_list = [_cookie_summary(c) for c in await bs._cdp_get_cookies()]
    if url:
        host = _url_host(url)
        cookie_list = [c for c in cookie_list if _cookie_matches_host(c.get('domain'), host)]
    return cookie_list


async def _poll_js(session: SessionInfo, js: str, timeout_seconds: float, poll_interval: float = 0.1) -> bool:
    """Evaluate *js* repeatedly until it is truthy or the timeout elapses.

    The deadline is measured on the event loop's monotonic clock, so time
    spent inside each evaluation counts towards the timeout.
    """
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout_seconds
    while loop.time() < deadline:
        if await _execute_js(session, js):
            return True
        # Never sleep past the deadline.
        await asyncio.sleep(max(0.0, min(poll_interval, deadline - loop.time())))
    return False


def _selector_wait_js(selector: str, state: str) -> str:
    """Build the JavaScript predicate for ``wait selector`` in the given state."""
    import json

    quoted = json.dumps(selector)
    if state == 'detached':
        return f'document.querySelector({quoted}) === null'
    if state == 'visible':
        return f"""
        (function() {{
            const el = document.querySelector({quoted});
            if (!el) return false;
            const style = window.getComputedStyle(el);
            const rect = el.getBoundingClientRect();
            return style.display !== 'none' &&
                style.visibility !== 'hidden' &&
                style.opacity !== '0' &&
                rect.width > 0 &&
                rect.height > 0;
        }})()
        """
    if state == 'hidden':
        return f"""
        (function() {{
            const el = document.querySelector({quoted});
            if (!el) return true;
            const style = window.getComputedStyle(el);
            const rect = el.getBoundingClientRect();
            return style.display === 'none' ||
                style.visibility === 'hidden' ||
                style.opacity === '0' ||
                rect.width === 0 ||
                rect.height === 0;
        }})()
        """
    # 'attached' and any unrecognised state only check for presence.
    return f'document.querySelector({quoted}) !== null'


async def _mouse_click_node(
    session: SessionInfo,
    node: Any,
    *,
    button: str,
    click_count: int,
    description: str,
) -> dict[str, Any] | None:
    """Move the mouse to the centre of *node* and press/release *button* once.

    A double click is expressed as a single press/release pair carrying
    ``clickCount`` 2. Returns an error payload when the element centre cannot
    be determined, otherwise ``None``.
    """
    coords = await _get_element_center(session, node)
    if not coords:
        return {'error': f'Could not get element coordinates for {description}'}
    center_x, center_y = coords

    cdp_session = await session.browser_session.cdp_client_for_node(node)
    session_id = cdp_session.session_id
    mouse = cdp_session.cdp_client.send.Input

    await mouse.dispatchMouseEvent(
        params={'type': 'mouseMoved', 'x': center_x, 'y': center_y},
        session_id=session_id,
    )
    await asyncio.sleep(0.05)

    button_event = {'x': center_x, 'y': center_y, 'button': button, 'clickCount': click_count}
    await mouse.dispatchMouseEvent(
        params={'type': 'mousePressed', **button_event},
        session_id=session_id,
    )
    await asyncio.sleep(0.05)
    await mouse.dispatchMouseEvent(
        params={'type': 'mouseReleased', **button_event},
        session_id=session_id,
    )
    return None


async def _handle_tab(bs: Any, params: dict[str, Any]) -> dict[str, Any]:
    """Handle the ``tab`` sub-commands: list, new, switch, close."""
    tab_command = params.get('tab_command')

    def _page_targets() -> Any:
        return bs.session_manager.get_all_page_targets() if bs.session_manager else []

    if tab_command == 'list':
        lines = ['TAB URL']
        lines.extend(f'{i:<4} {t.url}' for i, t in enumerate(_page_targets()))
        return {'_raw_text': '\n'.join(lines)}

    if tab_command == 'new':
        url = params.get('url', 'about:blank')
        target_id = await bs._cdp_create_new_page(url, background=True)
        bs.agent_focus_target_id = target_id
        return {'created': target_id[:8], 'url': url}

    if tab_command == 'switch':
        tab_index = params['tab']
        page_targets = _page_targets()
        if tab_index < 0 or tab_index >= len(page_targets):
            return {'error': f'Invalid tab index {tab_index}. Available: 0-{len(page_targets) - 1}'}
        bs.agent_focus_target_id = page_targets[tab_index].target_id
        return {'switched': tab_index}

    if tab_command == 'close':
        tab_indices = params.get('tabs', [])
        page_targets = _page_targets()

        async def _close_target(tid: str) -> None:
            cdp_session = await bs.get_or_create_cdp_session(target_id=None, focus=False)
            if cdp_session:
                await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': tid})

        if not tab_indices:
            # Use caller's logical focus, not Chrome's global focus
            target_id = bs.agent_focus_target_id
            if not target_id and bs.session_manager:
                # Guarded in case no focused target is reported.
                focused = bs.session_manager.get_focused_target()
                target_id = focused.target_id if focused else None
            if not target_id:
                return {'error': 'No focused tab to close'}
            await _close_target(target_id)
            # NOTE(review): always reports index 0, whichever tab was focused - confirm intent.
            return {'closed': [0]}

        closed = []
        errors = []
        # Iterate from the highest index down.
        for idx in sorted(tab_indices, reverse=True):
            if idx < 0 or idx >= len(page_targets):
                errors.append(f'Tab {idx} out of range')
                continue
            try:
                await _close_target(page_targets[idx].target_id)
                closed.append(idx)
            except Exception as e:
                errors.append(f'Tab {idx}: {e}')

        result: dict[str, Any] = {'closed': closed}
        if errors:
            result['errors'] = errors
        return result

    return {'error': 'Invalid tab command. Use: list, new, switch, close'}


async def _handle_cookies(session: SessionInfo, params: dict[str, Any]) -> dict[str, Any]:
    """Handle the ``cookies`` sub-commands: get, set, clear, export, import."""
    bs = session.browser_session
    cookies_command = params.get('cookies_command')

    if cookies_command == 'get':
        return {'cookies': await _collect_cookies(bs, params.get('url'))}

    if cookies_command == 'set':
        from cdp_use.cdp.network import Cookie

        cookie_dict: dict[str, Any] = {
            'name': params['name'],
            'value': params['value'],
            'path': params.get('path', '/'),
            'secure': params.get('secure', False),
            'httpOnly': params.get('http_only', False),
        }
        if params.get('same_site'):
            cookie_dict['sameSite'] = params['same_site']
        if params.get('expires'):
            cookie_dict['expires'] = params['expires']
        if params.get('domain'):
            cookie_dict['domain'] = params['domain']
        else:
            # If no domain specified, get current URL's domain
            hostname = await _execute_js(session, 'window.location.hostname')
            if hostname:
                cookie_dict['domain'] = hostname
        try:
            cookie_obj = Cookie(**cookie_dict)
            await bs._cdp_set_cookies([cookie_obj])
            return {'set': params['name'], 'success': True}
        except Exception as e:
            logger.error(f'Failed to set cookie: {e}')
            return {'set': params['name'], 'success': False, 'error': str(e)}

    if cookies_command == 'clear':
        url = params.get('url')
        if not url:
            # Clear all cookies
            await bs._cdp_clear_cookies()
            return {'cleared': True, 'url': url}

        # Clear cookies only for the URL's domain.
        cdp_session = await bs.get_or_create_cdp_session(target_id=None, focus=False)
        if not cdp_session:
            # Do not report success when nothing could be deleted.
            return {'error': 'No active browser session'}
        host = _url_host(url)
        for cookie in await bs._cdp_get_cookies():
            if not _cookie_matches_host(cookie.get('domain'), host):
                continue
            await cdp_session.cdp_client.send.Network.deleteCookies(
                params={
                    'name': cookie.get('name', ''),
                    'domain': cookie.get('domain'),
                    'path': cookie.get('path', '/'),
                },
                session_id=cdp_session.session_id,
            )
        return {'cleared': True, 'url': url}

    if cookies_command == 'export':
        import json

        cookie_list = await _collect_cookies(bs, params.get('url'))
        file_path = Path(params['file'])
        file_path.write_text(json.dumps(cookie_list, indent=2, ensure_ascii=False), encoding='utf-8')
        return {'exported': len(cookie_list), 'file': str(file_path)}

    if cookies_command == 'import':
        import json

        file_path = Path(params['file'])
        if not file_path.exists():
            return {'error': f'File not found: {file_path}'}
        # Read with the same encoding that 'export' writes.
        try:
            cookies = json.loads(file_path.read_text(encoding='utf-8'))
        except (UnicodeDecodeError, json.JSONDecodeError) as e:
            return {'error': f'Invalid cookie file {file_path}: {e}'}
        if not isinstance(cookies, list):
            return {'error': f'Invalid cookie file {file_path}: expected a JSON list of cookies'}

        # Get CDP session for bulk cookie setting
        cdp_session = await bs.get_or_create_cdp_session(target_id=None, focus=False)
        if not cdp_session:
            return {'error': 'No active browser session'}

        # Build cookie list for bulk set
        import_list = []
        for c in cookies:
            cookie_params = {
                'name': c['name'],
                'value': c['value'],
                'domain': c.get('domain'),
                'path': c.get('path', '/'),
                'secure': c.get('secure', False),
                'httpOnly': c.get('httpOnly', False),
            }
            if c.get('sameSite'):
                cookie_params['sameSite'] = c['sameSite']
            if c.get('expires'):
                cookie_params['expires'] = c['expires']
            import_list.append(cookie_params)

        # Set all cookies in one call
        try:
            await cdp_session.cdp_client.send.Network.setCookies(
                params={'cookies': import_list},  # type: ignore[arg-type]
                session_id=cdp_session.session_id,
            )
            return {'imported': len(import_list), 'file': str(file_path)}
        except Exception as e:
            return {'error': f'Failed to import cookies: {e}'}

    return {'error': 'Invalid cookies command. Use: get, set, clear, export, import'}


async def _handle_wait(session: SessionInfo, params: dict[str, Any]) -> dict[str, Any]:
    """Handle the ``wait`` sub-commands: selector, text."""
    import json

    wait_command = params.get('wait_command')

    if wait_command == 'selector':
        # 'timeout' is given in milliseconds.
        timeout_seconds = params.get('timeout', 30000) / 1000.0
        state = params.get('state', 'visible')
        selector = params['selector']
        found = await _poll_js(session, _selector_wait_js(selector, state), timeout_seconds)
        return {'selector': selector, 'found': found}

    if wait_command == 'text':
        timeout_seconds = params.get('timeout', 30000) / 1000.0
        text = params['text']
        js = f"""
        (function() {{
            const text = {json.dumps(text)};
            return document.body.innerText.includes(text);
        }})()
        """
        found = await _poll_js(session, js, timeout_seconds)
        return {'text': text, 'found': found}

    return {'error': 'Invalid wait command. Use: selector, text'}


async def _handle_get(session: SessionInfo, params: dict[str, Any]) -> dict[str, Any]:
    """Handle the ``get`` sub-commands: title, html, text, value, attributes, bbox."""
    import json

    bs = session.browser_session
    get_command = params.get('get_command')

    if get_command == 'title':
        title = await _execute_js(session, 'document.title')
        return {'title': title or ''}

    if get_command == 'html':
        selector = params.get('selector')
        if selector:
            js = f'(function(){{ const el = document.querySelector({json.dumps(selector)}); return el ? el.outerHTML : null; }})()'
        else:
            js = 'document.documentElement.outerHTML'
        html = await _execute_js(session, js)
        return {'html': html or ''}

    if get_command not in ('text', 'value', 'attributes', 'bbox'):
        return {'error': 'Invalid get command. Use: title, html, text, value, attributes, bbox'}

    # The remaining sub-commands all operate on an indexed element.
    index = params['index']
    node = await bs.get_element_by_index(index)
    if node is None:
        return _element_not_found(index)

    if get_command == 'text':
        # Use the node's text from our model
        text = node.get_all_children_text(max_depth=10) if node else ''
        return {'index': index, 'text': text}

    if get_command == 'attributes':
        # Use the attributes from the node model
        attrs = node.attributes or {}
        return {'index': index, 'attributes': dict(attrs)}

    if get_command == 'value':
        try:
            cdp_session = await bs.cdp_client_for_node(node)
            resolve_result = await cdp_session.cdp_client.send.DOM.resolveNode(
                params={'backendNodeId': node.backend_node_id},
                session_id=cdp_session.session_id,
            )
            object_id = resolve_result['object'].get('objectId')  # type: ignore[union-attr]
            if not object_id:
                return {'index': index, 'value': ''}
            value_result = await cdp_session.cdp_client.send.Runtime.callFunctionOn(
                params={
                    'objectId': object_id,
                    'functionDeclaration': 'function() { return this.value; }',
                    'returnByValue': True,
                },
                session_id=cdp_session.session_id,
            )
            value = value_result.get('result', {}).get('value')
            return {'index': index, 'value': value or ''}
        except Exception as e:
            logger.error(f'Failed to get element value: {e}')
            return {'index': index, 'value': ''}

    # get_command == 'bbox'
    try:
        cdp_session = await bs.cdp_client_for_node(node)
        box_result = await cdp_session.cdp_client.send.DOM.getBoxModel(
            params={'backendNodeId': node.backend_node_id},
            session_id=cdp_session.session_id,
        )
        model = box_result['model']  # type: ignore[index]
        content = model.get('content', [])  # type: ignore[union-attr]
        if len(content) < 8:
            return {'index': index, 'bbox': {}}
        # content is [x1, y1, x2, y2, x3, y3, x4, y4] - corners of the quad
        xs = content[0:8:2]
        ys = content[1:8:2]
        x = min(xs)
        y = min(ys)
        return {'index': index, 'bbox': {'x': x, 'y': y, 'width': max(xs) - x, 'height': max(ys) - y}}
    except Exception as e:
        logger.error(f'Failed to get element bbox: {e}')
        return {'index': index, 'bbox': {}}


async def handle(action: str, session: SessionInfo, params: dict[str, Any]) -> Any:
    """Handle browser control command.

    Dispatches *action* (one of ``COMMANDS``) against the session's browser.
    The ``tab``, ``cookies``, ``wait`` and ``get`` actions are multiplexed on
    a sub-command key in *params* (``tab_command``, ``cookies_command``,
    ``wait_command``, ``get_command``).

    Args:
        action: Name of the action to perform.
        session: CLI session providing ``browser_session`` and ``actions``.
        params: Action-specific arguments. Required keys are read with
            ``params[...]`` and raise ``KeyError`` when missing.

    Returns:
        A dict describing the outcome. Recoverable failures are reported as
        ``{'error': ...}``; ``state`` and ``tab list`` return ``{'_raw_text': ...}``.

    Raises:
        ValueError: If *action* is not a known browser action.
    """
    bs = session.browser_session
    actions = session.actions
    if actions is None:
        return {'error': 'ActionHandler not initialized'}

    if action == 'open':
        url = params['url']
        # Bare hosts default to https.
        if not url.startswith(('http://', 'https://', 'file://')):
            url = 'https://' + url
        await actions.navigate(url)
        result: dict[str, Any] = {'url': url}
        if bs.browser_profile.use_cloud and bs.cdp_url:
            from urllib.parse import quote

            result['live_url'] = f'https://live.browser-use.com/?wss={quote(bs.cdp_url, safe="")}'
        return result

    if action == 'click':
        args = params.get('args', [])
        if len(args) == 2:
            x, y = args
            await actions.click_coordinate(x, y)
            return {'clicked_coordinate': {'x': x, 'y': y}}
        if len(args) == 1:
            index = args[0]
            node = await bs.get_element_by_index(index)
            if node is None:
                return _element_not_found(index)
            await actions.click_element(node)
            return {'clicked': index}
        return {'error': 'Usage: click <index> or click <x> <y>'}

    if action == 'type':
        text = params['text']
        cdp_session = await bs.get_or_create_cdp_session(target_id=None, focus=False)
        if not cdp_session:
            return {'error': 'No active browser session'}
        await cdp_session.cdp_client.send.Input.insertText(
            params={'text': text},
            session_id=cdp_session.session_id,
        )
        return {'typed': text}

    if action == 'input':
        index = params['index']
        text = params['text']
        node = await bs.get_element_by_index(index)
        if node is None:
            return _element_not_found(index)
        # Click the element before typing into it.
        await actions.click_element(node)
        await actions.type_text(node, text)
        return {'input': text, 'element': index}

    if action == 'scroll':
        direction = params.get('direction', 'down')
        amount = params.get('amount', 500)
        await actions.scroll(direction, amount)
        return {'scrolled': direction, 'amount': amount}

    if action == 'back':
        await actions.go_back()
        return {'back': True}

    if action == 'screenshot':
        data = await bs.take_screenshot(full_page=params.get('full', False))
        if params.get('path'):
            path = Path(params['path'])
            path.write_bytes(data)
            return {'saved': str(path), 'size': len(data)}
        # Return base64 encoded
        return {'screenshot': base64.b64encode(data).decode(), 'size': len(data)}

    if action == 'state':
        state = await actions.get_state()
        assert state.dom_state is not None
        sections: list[str] = []
        # Prepend viewport dimensions
        if state.page_info:
            pi = state.page_info
            sections.append(f'viewport: {pi.viewport_width}x{pi.viewport_height}\n')
            sections.append(f'page: {pi.page_width}x{pi.page_height}\n')
            sections.append(f'scroll: ({pi.scroll_x}, {pi.scroll_y})\n')
        sections.append(state.dom_state.llm_representation())
        # Append auto-dismissed popup messages, then reset the list.
        if bs._closed_popup_messages:
            sections.append('\nAuto-closed dialogs:\n')
            sections.extend(f' {msg}\n' for msg in bs._closed_popup_messages)
            bs._closed_popup_messages.clear()
        return {'_raw_text': ''.join(sections)}

    if action == 'tab':
        return await _handle_tab(bs, params)

    if action == 'keys':
        keys = params['keys']
        await actions.send_keys(keys)
        return {'sent': keys}

    if action == 'select':
        index = params['index']
        value = params['value']
        node = await bs.get_element_by_index(index)
        if node is None:
            return _element_not_found(index)
        await actions.select_dropdown(node, value)
        return {'selected': value, 'element': index}

    if action == 'upload':
        index = params['index']
        file_path = params['path']
        # Validate the local file before touching the page.
        p = Path(file_path)
        if not p.exists():
            return {'error': f'File not found: {file_path}'}
        if not p.is_file():
            return {'error': f'Not a file: {file_path}'}
        if p.stat().st_size == 0:
            return {'error': f'File is empty (0 bytes): {file_path}'}
        node = await bs.get_element_by_index(index)
        if node is None:
            return _element_not_found(index)
        file_input_node = bs.find_file_input_near_element(node)
        if file_input_node is None:
            # Point the caller at any file inputs in the selector map.
            selector_map = await bs.get_selector_map()
            file_input_indices = [idx for idx, el in selector_map.items() if bs.is_file_input(el)]
            if file_input_indices:
                hint = f' File input(s) found at index: {", ".join(map(str, file_input_indices))}'
            else:
                hint = ' No file input found on the page.'
            return {'error': f'Element {index} is not a file input.{hint}'}
        await actions.upload_file(file_input_node, file_path)
        return {'uploaded': file_path, 'element': index}

    if action == 'eval':
        # Execute JavaScript via CDP
        return {'result': await _execute_js(session, params['js'])}

    if action == 'extract':
        query = params['query']
        # This requires LLM integration
        # For now, return a placeholder
        return {'query': query, 'error': 'extract is not yet implemented'}

    if action == 'hover':
        index = params['index']
        node = await bs.get_element_by_index(index)
        if node is None:
            return _element_not_found(index)
        coords = await _get_element_center(session, node)
        if not coords:
            return {'error': 'Could not get element coordinates for hover'}
        center_x, center_y = coords
        cdp_session = await bs.cdp_client_for_node(node)
        await cdp_session.cdp_client.send.Input.dispatchMouseEvent(
            params={'type': 'mouseMoved', 'x': center_x, 'y': center_y},
            session_id=cdp_session.session_id,
        )
        return {'hovered': index}

    if action == 'dblclick':
        index = params['index']
        node = await bs.get_element_by_index(index)
        if node is None:
            return _element_not_found(index)
        error = await _mouse_click_node(session, node, button='left', click_count=2, description='double-click')
        return error if error else {'double_clicked': index}

    if action == 'rightclick':
        index = params['index']
        node = await bs.get_element_by_index(index)
        if node is None:
            return _element_not_found(index)
        error = await _mouse_click_node(session, node, button='right', click_count=1, description='right-click')
        return error if error else {'right_clicked': index}

    if action == 'cookies':
        return await _handle_cookies(session, params)

    if action == 'wait':
        return await _handle_wait(session, params)

    if action == 'get':
        return await _handle_get(session, params)

    raise ValueError(f'Unknown browser action: {action}')