mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
342 lines
10 KiB
Python
342 lines
10 KiB
Python
"""
|
|
OpenAI Computer Use Assistant (CUA) Integration
|
|
|
|
This example demonstrates how to integrate OpenAI's Computer Use Assistant as a fallback
|
|
action when standard browser actions are insufficient to achieve the desired goal.
|
|
The CUA can perform complex computer interactions that might be difficult to achieve
|
|
through regular browser-use actions.
|
|
"""
|
|
|
|
import asyncio
|
|
import base64
|
|
import os
|
|
import sys
|
|
from io import BytesIO
|
|
|
|
from PIL import Image
|
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
load_dotenv()
|
|
|
|
from openai import AsyncOpenAI
|
|
from pydantic import BaseModel, Field
|
|
|
|
from browser_use import Agent, ChatOpenAI, Tools
|
|
from browser_use.agent.views import ActionResult
|
|
from browser_use.browser import BrowserSession
|
|
|
|
|
|
class OpenAICUAAction(BaseModel):
    """Input schema for the OpenAI CUA fallback action."""

    # Required free-text goal; the fallback action embeds it in the prompt sent to the CUA model.
    description: str = Field(default=..., description='Description of your next goal')
|
|
|
|
|
|
async def handle_model_action(browser_session: BrowserSession, action) -> ActionResult:
|
|
"""
|
|
Given a computer action (e.g., click, double_click, scroll, etc.),
|
|
execute the corresponding operation using CDP.
|
|
"""
|
|
action_type = action.type
|
|
ERROR_MSG: str = 'Could not execute the CUA action.'
|
|
|
|
if not browser_session.agent_focus:
|
|
return ActionResult(error='No active browser session')
|
|
|
|
try:
|
|
match action_type:
|
|
case 'click':
|
|
x, y = action.x, action.y
|
|
button = action.button
|
|
print(f"Action: click at ({x}, {y}) with button '{button}'")
|
|
# Not handling things like middle click, etc.
|
|
if button != 'left' and button != 'right':
|
|
button = 'left'
|
|
|
|
# Use CDP to click
|
|
await browser_session.agent_focus.cdp_client.send.Input.dispatchMouseEvent(
|
|
params={
|
|
'type': 'mousePressed',
|
|
'x': x,
|
|
'y': y,
|
|
'button': button,
|
|
'clickCount': 1,
|
|
},
|
|
session_id=browser_session.agent_focus.session_id,
|
|
)
|
|
await browser_session.agent_focus.cdp_client.send.Input.dispatchMouseEvent(
|
|
params={
|
|
'type': 'mouseReleased',
|
|
'x': x,
|
|
'y': y,
|
|
'button': button,
|
|
},
|
|
session_id=browser_session.agent_focus.session_id,
|
|
)
|
|
msg = f'Clicked at ({x}, {y}) with button {button}'
|
|
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg)
|
|
|
|
case 'scroll':
|
|
x, y = action.x, action.y
|
|
scroll_x, scroll_y = action.scroll_x, action.scroll_y
|
|
print(f'Action: scroll at ({x}, {y}) with offsets (scroll_x={scroll_x}, scroll_y={scroll_y})')
|
|
|
|
# Move mouse to position first
|
|
await browser_session.agent_focus.cdp_client.send.Input.dispatchMouseEvent(
|
|
params={
|
|
'type': 'mouseMoved',
|
|
'x': x,
|
|
'y': y,
|
|
},
|
|
session_id=browser_session.agent_focus.session_id,
|
|
)
|
|
|
|
# Execute scroll using JavaScript
|
|
await browser_session.agent_focus.cdp_client.send.Runtime.evaluate(
|
|
params={
|
|
'expression': f'window.scrollBy({scroll_x}, {scroll_y})',
|
|
},
|
|
session_id=browser_session.agent_focus.session_id,
|
|
)
|
|
msg = f'Scrolled at ({x}, {y}) with offsets (scroll_x={scroll_x}, scroll_y={scroll_y})'
|
|
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg)
|
|
|
|
case 'keypress':
|
|
keys = action.keys
|
|
for k in keys:
|
|
print(f"Action: keypress '{k}'")
|
|
# A simple mapping for common keys; expand as needed.
|
|
key_code = k
|
|
if k.lower() == 'enter':
|
|
key_code = 'Enter'
|
|
elif k.lower() == 'space':
|
|
key_code = 'Space'
|
|
|
|
# Use CDP to send key
|
|
await browser_session.agent_focus.cdp_client.send.Input.dispatchKeyEvent(
|
|
params={
|
|
'type': 'keyDown',
|
|
'key': key_code,
|
|
},
|
|
session_id=browser_session.agent_focus.session_id,
|
|
)
|
|
await browser_session.agent_focus.cdp_client.send.Input.dispatchKeyEvent(
|
|
params={
|
|
'type': 'keyUp',
|
|
'key': key_code,
|
|
},
|
|
session_id=browser_session.agent_focus.session_id,
|
|
)
|
|
msg = f'Pressed keys: {keys}'
|
|
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg)
|
|
|
|
case 'type':
|
|
text = action.text
|
|
print(f'Action: type text: {text}')
|
|
|
|
# Type text character by character
|
|
for char in text:
|
|
await browser_session.agent_focus.cdp_client.send.Input.dispatchKeyEvent(
|
|
params={
|
|
'type': 'char',
|
|
'text': char,
|
|
},
|
|
session_id=browser_session.agent_focus.session_id,
|
|
)
|
|
msg = f'Typed text: {text}'
|
|
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg)
|
|
|
|
case 'wait':
|
|
print('Action: wait')
|
|
await asyncio.sleep(2)
|
|
msg = 'Waited for 2 seconds'
|
|
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg)
|
|
|
|
case 'screenshot':
|
|
# Nothing to do as screenshot is taken at each turn
|
|
print('Action: screenshot')
|
|
return ActionResult(error=ERROR_MSG)
|
|
# Handle other actions here
|
|
|
|
case _:
|
|
print(f'Unrecognized action: {action}')
|
|
return ActionResult(error=ERROR_MSG)
|
|
|
|
except Exception as e:
|
|
print(f'Error handling action {action}: {e}')
|
|
return ActionResult(error=ERROR_MSG)
|
|
|
|
|
|
# Shared Tools instance: the CUA fallback action is registered on its registry
# (see the @tools.registry.action decorator) and main() passes it to the Agent.
tools = Tools()
|
|
|
|
|
|
@tools.registry.action(
    'Use OpenAI Computer Use Assistant (CUA) as a fallback when standard browser actions cannot achieve the desired goal. This action sends a screenshot and description to OpenAI CUA and executes the returned computer use actions.',
    param_model=OpenAICUAAction,
)
async def openai_cua_fallback(params: OpenAICUAAction, browser_session: BrowserSession):
    """
    Ask OpenAI's Computer Use Assistant for one action and execute it.

    Steps: read the browser state summary, rescale its screenshot to the
    viewport size, send the screenshot together with the goal to the
    ``computer-use-preview`` model, take the first ``computer_call`` item from
    the response and run its action through ``handle_model_action``.

    Args:
        params: Carries ``description``, the goal the CUA should achieve.
        browser_session: Session used to obtain the page state and to execute
            the returned action.

    Returns:
        The ``ActionResult`` produced by ``handle_model_action``, or an
        ``ActionResult`` with ``error`` set when any step raises (missing page
        info or screenshot, request failure, no computer call in the response).
    """
    print(f'🎯 CUA Action Starting - Goal: {params.description}')

    try:
        # Get browser state summary
        state = await browser_session.get_browser_state_summary()
        page_info = state.page_info
        if not page_info:
            raise Exception('Page info not found - cannot execute CUA action')

        width, height = page_info.viewport_width, page_info.viewport_height
        print(f'📐 Viewport size: {width}x{height}')

        screenshot_b64 = state.screenshot
        if not screenshot_b64:
            raise Exception('Screenshot not found - cannot execute CUA action')

        print(f'📸 Screenshot captured (base64 length: {len(screenshot_b64)} chars)')

        # Debug: check the dimensions of the screenshot as captured.
        image = Image.open(BytesIO(base64.b64decode(screenshot_b64)))
        print(f'📏 Screenshot actual dimensions: {image.size[0]}x{image.size[1]}')

        # Rescale to the viewport size: the same width/height are declared to the
        # model as the display size below.
        image = image.resize((width, height))
        buffer = BytesIO()
        image.save(buffer, format='PNG')
        # getvalue() returns the whole buffer regardless of the stream position.
        screenshot_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
        print(f'📸 Rescaled screenshot to viewport size: {width}x{height}')

        prompt = (
            '\n'
            'You will be given an action to execute and screenshot of the current screen.\n'
            'Output one computer_call object that will achieve this goal.\n'
            f'Goal: {params.description}\n'
        )

        # A client is created per call; the context manager closes it (and its
        # HTTP resources) once the single request has completed.
        async with AsyncOpenAI(api_key=os.getenv('OPENAI_API_KEY')) as client:
            print('🔄 Sending request to OpenAI CUA...')
            response = await client.responses.create(
                model='computer-use-preview',
                tools=[
                    {
                        'type': 'computer_use_preview',
                        'display_width': width,
                        'display_height': height,
                        'environment': 'browser',
                    }
                ],
                input=[
                    {
                        'role': 'user',
                        'content': [
                            {'type': 'input_text', 'text': prompt},
                            {
                                'type': 'input_image',
                                'detail': 'auto',
                                'image_url': f'data:image/png;base64,{screenshot_b64}',
                            },
                        ],
                    }
                ],
                truncation='auto',
                temperature=0.1,
            )

        print(f'📥 CUA response received: {response}')

        # Only the first computer call is executed; any further ones are ignored.
        computer_call = next((item for item in response.output if item.type == 'computer_call'), None)
        if not computer_call:
            raise Exception('No computer calls found in CUA response')

        action = computer_call.action
        print(f'🎬 Executing CUA action: {action.type} - {action}')

        action_result = await handle_model_action(browser_session, action)
        await asyncio.sleep(0.1)

        # handle_model_action reports failures through `error` instead of raising,
        # so only announce success when no error was returned.
        if action_result.error:
            print(f'❌ CUA action failed: {action_result.error}')
        else:
            print('✅ CUA action completed successfully')
        return action_result

    except Exception as e:
        msg = f'Error executing CUA action: {e}'
        print(f'❌ {msg}')
        return ActionResult(error=msg)
|
|
|
|
|
|
async def main():
    """
    Run the demo: an Agent equipped with the CUA fallback tool works through a
    fixed task, and the browser session is killed afterwards whether or not
    the run succeeded.
    """
    # Language model that drives the agent.
    chat_model = ChatOpenAI(model='o4-mini', temperature=1.0)

    # Browser session shared by the agent and the CUA fallback action.
    session = BrowserSession()

    # Example task that might require the CUA fallback: an interaction that can
    # be difficult with the standard actions.
    task = (
        '\n'
        'Go to https://csreis.github.io/tests/cross-site-iframe.html\n'
        'Click on "Go cross-site, complex page" using index\n'
        'Use the OpenAI CUA fallback to click on "Tree is open..." link.\n'
    )

    # Agent wired with the custom tools, which include the CUA fallback.
    runner = Agent(task=task, llm=chat_model, tools=tools, browser_session=session)

    print('🚀 Starting agent with CUA fallback support...')
    print(f'Task: {task}')
    separator = '-' * 50
    print(separator)

    try:
        outcome = await runner.run()
        print(f'\n✅ Task completed! Result: {outcome}')
    except Exception as exc:
        print(f'\n❌ Error running agent: {exc}')
    finally:
        # Always tear the browser session down, even after a failure.
        await session.kill()
        print('\n🧹 Browser session closed')
|
|
|
|
|
|
if __name__ == '__main__':
    # Intro banner, one entry per printed line; '' produces a blank line.
    banner_lines = (
        '🔧 OpenAI Computer Use Assistant (CUA) Integration Example',
        '=' * 60,
        '',
        "This example shows how to integrate OpenAI's CUA as a fallback action",
        'when standard browser-use actions cannot achieve the desired goal.',
        '',
        'CUA is particularly useful for:',
        '• Complex mouse interactions (drag & drop, precise clicking)',
        '• Keyboard shortcuts and key combinations',
        '• Actions that require pixel-perfect precision',
        "• Custom UI elements that don't respond to standard actions",
        '',
        'Make sure you have OPENAI_API_KEY set in your environment!',
        '',
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Refuse to start without an API key: the CUA request cannot be made without it.
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        for error_line in (
            '❌ Error: OPENAI_API_KEY environment variable not set',
            'Please set your OpenAI API key to use CUA integration',
        ):
            print(error_line)
        sys.exit(1)

    # Run the example
    asyncio.run(main())
|