Files
browser-use/tests/ci/evaluate_tasks.py

114 lines
3.3 KiB
Python

"""
Runs all agent tasks in parallel (up to 10 at a time) and prints a big score summary.
Does not fail on partial failures (always exits 0).
"""
import asyncio
import glob
import os
import sys
import aiofiles
import yaml
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.browser.profile import BrowserProfile
from browser_use.browser.session import BrowserSession
# --- CONFIG ---
MAX_PARALLEL = 10
TASK_DIR = sys.argv[1] if len(sys.argv) > 1 else os.path.join(os.path.dirname(__file__), '../agent_tasks')
TASK_FILES = glob.glob(os.path.join(TASK_DIR, '*.yaml'))
class JudgeResponse(BaseModel):
success: bool
explanation: str
async def run_task(task_file, semaphore):
async with semaphore:
async with aiofiles.open(task_file, 'r') as f:
content = await f.read()
task_data = yaml.safe_load(content)
task = task_data['task']
judge_context = task_data.get('judge_context', ['The agent must solve the task'])
max_steps = task_data.get('max_steps', 15)
agent_llm = ChatOpenAI(model='gpt-4.1-mini')
judge_llm = ChatOpenAI(model='gpt-4.1-mini')
shared_profile = BrowserProfile(
headless=True,
user_data_dir=None, # use dedicated tmp user_data_dir per session
)
session = BrowserSession(browser_profile=shared_profile)
agent = Agent(task=task, llm=agent_llm, browser_session=session)
history: AgentHistoryList = await agent.run(max_steps=max_steps)
agent_output = history.final_result() or ''
criteria = chr(10) + '- '.join(judge_context)
judge_prompt = f"""
You are a evaluator of a browser agent task inside a ci/cd pipeline. Here was the agent's task:
{task}
Here is the agent's output:
{agent_output}
Criteria for success:
- {criteria}
Reply in JSON with keys: success (true/false), explanation (string).
"""
structured_llm = judge_llm.with_structured_output(JudgeResponse)
judge_response = await structured_llm.ainvoke(judge_prompt)
return {'file': os.path.basename(task_file), 'success': judge_response.success, 'explanation': judge_response.explanation}
async def main():
semaphore = asyncio.Semaphore(MAX_PARALLEL)
print(TASK_FILES)
tasks = [run_task(task_file, semaphore) for task_file in TASK_FILES]
results = await asyncio.gather(*tasks)
passed = sum(1 for r in results if r['success'])
total = len(results)
print('\n' + '=' * 60)
print(f'{"RESULTS":^60}\n')
# Prepare table data
headers = ['Task', 'Success', 'Reason']
rows = []
for r in results:
status = '' if r['success'] else ''
rows.append([r['file'], status, r['explanation']])
# Calculate column widths
col_widths = [max(len(str(row[i])) for row in ([headers] + rows)) for i in range(3)]
# Print header
header_row = ' | '.join(headers[i].ljust(col_widths[i]) for i in range(3))
print(header_row)
print('-+-'.join('-' * w for w in col_widths))
# Print rows
for row in rows:
print(' | '.join(str(row[i]).ljust(col_widths[i]) for i in range(3)))
print('\n' + '=' * 60)
print(f'\n{"SCORE":^60}')
print(f'\n{"=" * 60}\n')
print(f'\n{"*" * 10} {passed}/{total} PASSED {"*" * 10}\n')
print('=' * 60 + '\n')
return passed, total
if __name__ == '__main__':
passed, total = asyncio.run(main())
print(f'PASSED={passed}')
print(f'TOTAL={total}')