mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
708 lines
27 KiB
Python
708 lines
27 KiB
Python
# ==============================================================================================================
|
|
# Documentation for this evaluation file.
|
|
# The import
|
|
|
|
|
|
# Here is the command to run the evaluation:
|
|
# python eval/service.py --parallel_runs 5 --parallel_evaluations 5 --max-steps 25 --start 0 --end 100
|
|
# options:
|
|
# --parallel_runs: Number of parallel tasks to run
|
|
# --max-steps: Maximum steps per task
|
|
# --start: Start index
|
|
# --end: End index (exclusive)
|
|
# --headless: Run in headless mode
|
|
|
|
# Here is the command to run the evaluation only:
|
|
# python eval/service.py --evaluate-only
|
|
# options:
|
|
# --parallel_evaluations: Number of parallel evaluations to run
|
|
|
|
# To run a new evaluation, you need to first clear the saved_trajectories folder.
|
|
# rm -rf saved_trajectories
|
|
# Otherwise, the evaluation will continue on from the last saved trajectory.
|
|
# ==============================================================================================================
|
|
|
|
|
|
# ==============================================================================================================
|
|
# This is the LLM as a judge evaluation system from the OSU-NLP Group paper
|
|
# Any adaptations made should be explicitly stated here:
|
|
# Adaptations:
|
|
# We are using our langchain wrapper for the OpenAI API
|
|
# This means we changed model.generate to model.invoke. The behavior of the model should be identical.
|
|
# Added an Online_Mind2Web_eval_with_retry wrapper with retry logic in case of API rate limiting or other issues.
|
|
|
|
|
|
# @article{xue2025illusionprogressassessingcurrent,
|
|
# title={An Illusion of Progress? Assessing the Current State of Web Agents},
|
|
# author={Tianci Xue and Weijian Qi and Tianneng Shi and Chan Hee Song and Boyu Gou and Dawn Song and Huan Sun and Yu Su},
|
|
# year={2025},
|
|
# eprint={2504.01382},
|
|
# archivePrefix={arXiv},
|
|
# primaryClass={cs.AI},
|
|
# url={https://arxiv.org/abs/2504.01382},
|
|
# }
|
|
|
|
# @inproceedings{deng2023mind2web,
|
|
# author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
|
|
# booktitle = {Advances in Neural Information Processing Systems},
|
|
# editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
|
|
# pages = {28091--28114},
|
|
# publisher = {Curran Associates, Inc.},
|
|
# title = {Mind2Web: Towards a Generalist Agent for the Web},
|
|
# url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5950bf290a1570ea401bf98882128160-Paper-Datasets_and_Benchmarks.pdf},
|
|
# volume = {36},
|
|
# year = {2023}
|
|
# }
|
|
# ==============================================================================================================
|
|
import asyncio
|
|
import base64
|
|
import io
|
|
import logging
|
|
import re
|
|
|
|
from PIL import Image
|
|
|
|
# Cap on how many screenshots (and their reasoning snippets) are kept for the final judge prompt.
MAX_IMAGE = 5

# Configure logging at import time and expose the module-level logger used throughout this file.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def encode_image(image):
	"""Convert a PIL image to a base64-encoded JPEG string.

	Args:
		image: A PIL image (any object exposing ``mode``, ``convert`` and ``save``).

	Returns:
		The JPEG encoding of the image as a base64 ``str``.
	"""
	# Modes the JPEG writer can store directly. Everything else (RGBA, LA, P, ...)
	# carries alpha or palette data and must be flattened to RGB first, otherwise
	# ``save`` raises. The original only handled RGBA.
	jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
	if image.mode not in jpeg_modes:
		image = image.convert('RGB')
	buffered = io.BytesIO()
	image.save(buffered, format='JPEG')
	return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
|
|
|
|
|
async def identify_key_points(task, model):
	"""Ask the model for the key points explicitly stated in a task description.

	Args:
		task: The task description text.
		model: Object exposing a blocking ``invoke(messages)`` method; it is run in a worker thread.

	Returns:
		The ``content`` attribute of the model's response.
	"""
	system_msg = """You are an expert tasked with analyzing a given task to identify the key points explicitly stated in the task description.

**Objective**: Carefully analyze the task description and extract the critical elements explicitly mentioned in the task for achieving its goal.

**Instructions**:
1. Read the task description carefully.
2. Identify and extract **key points** directly stated in the task description.
- A **key point** is a critical element, condition, or step explicitly mentioned in the task description.
- Do not infer or add any unstated elements.
- Words such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" must go through the sort function(e.g., the key point should be "Filter by highest").

**Respond with**:
- **Key Points**: A numbered list of the explicit key points for completing this task, one per line, without explanations or additional details."""
	user_text = f'Task: {task}'

	system_turn = {'role': 'system', 'content': system_msg}
	user_turn = {'role': 'user', 'content': [{'type': 'text', 'text': user_text}]}

	# ``invoke`` is synchronous, so hand it to a thread to keep the event loop free.
	reply = await asyncio.to_thread(model.invoke, [system_turn, user_turn])
	return reply.content
|
|
|
|
|
|
async def judge_image(task, image_path, key_points, model):
	"""Ask the model to rate how relevant one screenshot is to completing the task.

	Args:
		task: The task description text.
		image_path: Path of the screenshot to open and send.
		key_points: Key points text inserted into the user prompt.
		model: Object exposing a blocking ``invoke(messages)`` method; it is run in a worker thread.

	Returns:
		The ``content`` attribute of the model's response (expected to hold reasoning and a 1-5 score).
	"""
	system_msg = """You are an expert evaluator tasked with determining whether an image contains information about the necessary steps to complete a task.

**Objective**: Analyze the provided image and decide if it shows essential steps or evidence required for completing the task. Use your reasoning to explain your decision before assigning a score.

**Instructions**:
1. Provide a detailed description of the image, including its contents, visible elements, text (if any), and any notable features.

2. Carefully examine the image and evaluate whether it contains necessary steps or evidence crucial to task completion:
- Identify key points that could be relevant to task completion, such as actions, progress indicators, tool usage, applied filters, or step-by-step instructions.
- Does the image show actions, progress indicators, or critical information directly related to completing the task?
- Is this information indispensable for understanding or ensuring task success?
- If the image contains partial but relevant information, consider its usefulness rather than dismissing it outright.

3. Provide your response in the following format:
- **Reasoning**: Explain your thought process and observations. Mention specific elements in the image that indicate necessary steps, evidence, or lack thereof.
- **Score**: Assign a score based on the reasoning, using the following scale:
- **1**: The image does not contain any necessary steps or relevant information.
- **2**: The image contains minimal or ambiguous information, unlikely to be essential.
- **3**: The image includes some relevant steps or hints but lacks clarity or completeness.
- **4**: The image contains important steps or evidence that are highly relevant but not fully comprehensive.
- **5**: The image clearly displays necessary steps or evidence crucial for completing the task.

Respond with:
1. **Reasoning**: [Your explanation]
2. **Score**: [1-5]"""

	# The screenshot is re-encoded as JPEG and embedded as a data URL.
	screenshot_b64 = encode_image(Image.open(image_path))
	image_part = {
		'type': 'image_url',
		'image_url': {'url': f'data:image/jpeg;base64,{screenshot_b64}', 'detail': 'high'},
	}

	user_text = (
		f'**Task**: {task}\n\n'
		f'**Key Points for Task Completion**: {key_points}\n\n'
		'The snapshot of the web page is shown in the image.'
	)
	text_part = {'type': 'text', 'text': user_text}

	conversation = [
		{'role': 'system', 'content': system_msg},
		{'role': 'user', 'content': [text_part, image_part]},
	]
	reply = await asyncio.to_thread(model.invoke, conversation)
	return reply.content
|
|
|
|
|
|
async def Online_Mind2Web_eval(task, last_actions, images_path, model, score_threshold):
	"""Build the final judge prompt for one agent trajectory.

	The model is first asked for the task's key points, then every screenshot is
	scored against them concurrently. Screenshots whose score is at least
	``score_threshold`` (at most ``MAX_IMAGE`` of them) are attached to the
	returned messages together with the reasoning extracted for them. The final
	verdict is NOT requested here; the caller invokes the model with ``messages``.

	Args:
		task: The task description text.
		last_actions: Iterable of action descriptions, numbered into the prompt.
		images_path: List of screenshot paths to score.
		model: Object exposing a blocking ``invoke(messages)`` method.
		score_threshold: Minimum per-image score for a screenshot to be attached.

	Returns:
		Tuple ``(messages, text, system_msg, record, key_points)`` where ``record``
		holds one ``{'Response', 'Score'}`` dict per screenshot (score 0 when the
		response could not be parsed).
	"""
	system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task, the agent's action history, key points for task completion, some potentially important web pages in the agent's trajectory and their reasons, your goal is to determine whether the agent has completed the task and achieved all requirements.

Your response must strictly follow the following evaluation criteria!
*Important Evaluation Criteria*:
1: The filtered results must be displayed correctly. If filters were not properly applied (i.e., missing selection, missing confirmation, or no visible effect in results), the task is not considered successful.
2: You must carefully check whether these snapshots and action history meet these key points. Ensure that specific filter conditions, such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" are correctly applied using the filter function(e.g., sort function).
3: Certain key points or requirements should be applied by the filter. Otherwise, a search with all requirements as input will be deemed a failure since it cannot guarantee that all results meet the requirements!
4: If the task requires filtering by a specific range of money, years, or the number of beds and bathrooms, the applied filter must exactly match the given requirement. Any deviation results in failure. To ensure the task is successful, the applied filter must precisely match the specified range without being too broad or too narrow.
Examples of Failure Cases:
- If the requirement is less than $50, but the applied filter is less than $25, it is a failure.
- If the requirement is $1500-$2500, but the applied filter is $2000-$2500, it is a failure.
- If the requirement is $25-$200, but the applied filter is $0-$200, it is a failure.
- If the required years are 2004-2012, but the filter applied is 2001-2012, it is a failure.
- If the required years are before 2015, but the applied filter is 2000-2014, it is a failure.
- If the task requires exactly 2 beds, but the filter applied is 2+ beds, it is a failure.
5: Some tasks require a submission action or a display of results to be considered successful.
6: If the retrieved information is invalid or empty(e.g., No match was found), but the agent has correctly performed the required action, it should still be considered successful.
7: If the current page already displays all available items, then applying a filter is not necessary. As long as the agent selects items that meet the requirements (e.g., the cheapest or lowest price), the task is still considered successful.

*IMPORTANT*
Format your response into two lines as shown below:

Thoughts: <your thoughts and reasoning process based on double-checking each key points and the evaluation criteria>
Status: "success" or "failure"
"""

	key_points = await identify_key_points(task, model)
	key_points = key_points.replace('\n\n', '\n')

	# Keep only what follows the "Key Points" header; prefer the bold markdown
	# form and fall back to the plain one (or the whole text if neither is present).
	bold_split = key_points.split('**Key Points**:')
	key_points = bold_split[1] if len(bold_split) > 1 else key_points.split('Key Points:')[-1]
	key_points = '\n'.join(line.lstrip() for line in key_points.splitlines())

	# Score every screenshot concurrently; gather keeps results in input order.
	image_responses = await asyncio.gather(
		*[judge_image(task, image_path, key_points, model) for image_path in images_path]
	)

	whole_content_img = []
	whole_thoughts = []
	record = []
	pattern = r'[1-5]'
	for response, image_path in zip(image_responses, images_path):
		# Reset per screenshot so a failed parse can neither hit an unbound name
		# nor reuse the previous screenshot's reasoning.
		score = 0
		thought = ''
		try:
			score_text = response.split('Score')[1]
			thought = response.split('**Reasoning**:')[-1].strip().split('\n\n')[0].replace('\n', ' ')
			score = int(re.findall(pattern, score_text)[0])
		except Exception as e:
			# Deliberately best-effort: an unparseable response is recorded with score 0.
			logger.error(f'Error processing response: {e}')
			score = 0
		record.append({'Response': response, 'Score': score})

		if score >= score_threshold:
			jpg_base64_str = encode_image(Image.open(image_path))
			# encode_image always produces JPEG bytes, so the data URL must say so
			# (this also matches what judge_image sends for the same payload).
			whole_content_img.append(
				{'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{jpg_base64_str}', 'detail': 'high'}}
			)
			if thought != '':
				whole_thoughts.append(thought)

	whole_content_img = whole_content_img[:MAX_IMAGE]
	whole_thoughts = whole_thoughts[:MAX_IMAGE]

	numbered_actions = '\n'.join(f'{i}. {action}' for i, action in enumerate(last_actions, start=1))
	text = f'User Task: {task}\n\nKey Points: {key_points}\n\nAction History:\n{numbered_actions}'
	if whole_content_img:
		# The snapshot section is only included when at least one screenshot is attached.
		numbered_thoughts = '\n'.join(f'{i}. {thought}' for i, thought in enumerate(whole_thoughts, start=1))
		text += (
			"\n\nThe potentially important snapshots of the webpage in the agent's trajectory and their reasons:\n"
			f'{numbered_thoughts}'
		)

	messages = [
		{'role': 'system', 'content': system_msg},
		{'role': 'user', 'content': [{'type': 'text', 'text': text}] + whole_content_img},
	]
	return messages, text, system_msg, record, key_points
|
|
|
|
|
|
async def Online_Mind2Web_eval_with_retry(task, last_actions, images_path, model, score_threshold, max_retries=3):
	"""
	Call Online_Mind2Web_eval, retrying with exponential backoff on any exception.

	Args:
		task: The task description
		last_actions: list of actions taken
		images_path: list of image paths
		model: The model to use for evaluation
		score_threshold: Score threshold for image filtering
		max_retries: Maximum number of attempts

	Returns:
		Tuple of (messages, text, system_msg, record, key_points). ``None`` is
		returned only when ``max_retries`` is not positive, i.e. no attempt is made.

	Raises:
		Exception: whatever the final attempt raised, after it has been logged.
	"""
	final_attempt = max_retries - 1
	for attempt_index in range(max_retries):
		try:
			return await Online_Mind2Web_eval(task, last_actions, images_path, model, score_threshold)
		except Exception as exc:
			if attempt_index != final_attempt:
				logger.warning(f'Attempt {attempt_index + 1} failed. Retrying... Error: {str(exc)}')
				# Wait 1s, 2s, 4s, ... before the next attempt.
				await asyncio.sleep(2**attempt_index)
				continue
			logger.error(f'Failed to evaluate after {max_retries} attempts. Error: {str(exc)}')
			raise
|
|
|
|
|
|
# ==============================================================================================================
|
|
|
|
|
|
# ==============================================================================================================
|
|
# A service for evaluating the performance of the agent
|
|
# ==============================================================================================================
|
|
import argparse
|
|
import json
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Dict, Optional
|
|
|
|
from dotenv import load_dotenv
|
|
from langchain_openai import ChatOpenAI
|
|
|
|
from browser_use import Agent, Browser, BrowserConfig
|
|
|
|
|
|
class Task:
	"""Plain record for one benchmark task, built from the keys of a task JSON entry."""

	def __init__(self, task_id, confirmed_task, website, reference_length, level):
		self.task_id, self.confirmed_task = task_id, confirmed_task
		self.website, self.reference_length = website, reference_length
		self.level = level

	def __str__(self):
		field_names = ('task_id', 'confirmed_task', 'website', 'reference_length', 'level')
		rendered = ', '.join(f'{name}={getattr(self, name)}' for name in field_names)
		return f'Task({rendered})'

	def __repr__(self):
		# repr intentionally mirrors the human-readable form.
		return self.__str__()
|
|
|
|
|
|
class TaskTracker:
	"""Record per-step screenshots and action results for one task run.

	Everything is written under ``saved_trajectories/<task_id>/``: screenshots go
	into the ``trajectory`` sub-folder and the consolidated ``result.json`` is
	rewritten after every step.
	"""

	def __init__(self, task_id: str, task_text: str):
		self.task_id = task_id
		self.task_text = task_text
		self.result_folder = Path(f'saved_trajectories/{task_id}')
		self.trajectory_folder = self.result_folder / 'trajectory'
		self.step_results = []
		self.step_counter = 0
		self.screenshots = []
		self.setup_folders()

	def setup_folders(self):
		"""Create the necessary folder structure"""
		self.result_folder.mkdir(parents=True, exist_ok=True)
		self.trajectory_folder.mkdir(parents=True, exist_ok=True)

	async def on_step_start(self, agent):
		"""Record information at the start of a step"""
		self.current_step = {'step_number': self.step_counter, 'start_time': datetime.now().isoformat(), 'actions': []}

	async def on_step_end(self, agent):
		"""Record information at the end of a step"""
		# Take screenshot (returned base64-encoded) and store it as step_<n>.png
		browser_context = agent.browser_context
		screenshot_b64 = await browser_context.take_screenshot()
		screenshot_path = self.trajectory_folder / f'step_{self.step_counter}.png'

		with open(screenshot_path, 'wb') as f:
			f.write(base64.b64decode(screenshot_b64))

		self.screenshots.append(str(screenshot_path))

		# Record action and result
		if agent.state.last_result:
			for result in agent.state.last_result:
				self.current_step['actions'].append(
					{
						'content': result.extracted_content,
						'error': result.error,
						'is_done': result.is_done,
						'success': result.success,
					}
				)

		# Record end time
		self.current_step['end_time'] = datetime.now().isoformat()
		self.current_step['screenshot_path'] = str(screenshot_path)

		# Add to step results
		self.step_results.append(self.current_step)
		self.step_counter += 1

		# Save progress after each step so an interrupted run still leaves a result.json
		self.save_results()

	def _last_action(self):
		"""Return the last action of the last recorded step, or None if there is none."""
		if not self.step_results:
			return None
		actions = self.step_results[-1]['actions']
		return actions[-1] if actions else None

	def save_results(self):
		"""Write the consolidated results to ``result.json`` and return them.

		Steps without any recorded action contribute ``None`` to ``action_history``
		(so the list stays aligned with ``steps``). When no final action exists the
		run is reported as not completed instead of raising ``IndexError``.
		"""
		last_action = self._last_action()

		formatted_result = {
			'task_id': self.task_id,
			'task': self.task_text,
			'steps': self.step_results,
			'action_history': [
				step['actions'][-1]['content'] if step['actions'] else None for step in self.step_results
			],
			'screenshot_paths': self.screenshots,
			'final_result_response': last_action['content'] if last_action and last_action['is_done'] else None,
			'self_report_completed': last_action['is_done'] if last_action else False,
			'self_report_success': last_action['success'] if last_action else None,
		}

		# Save to file
		with open(self.result_folder / 'result.json', 'w') as f:
			json.dump(formatted_result, f, indent=2)

		return formatted_result
|
|
|
|
|
|
async def run_agent_with_tracing(task: Task, browser: Browser | None = None, max_steps: int = 25):
	"""Run one task with an Agent while a TaskTracker records every step.

	Args:
		task: The task to run; its ``confirmed_task`` text is given to the agent.
		browser: Browser to use. When omitted, a new one is created here and
			closed again before returning; a browser passed in is left open for
			the caller to close.
		max_steps: Maximum number of agent steps.

	Returns:
		Whatever ``agent.run`` returns.
	"""
	# Track ownership explicitly. The original re-bound ``browser`` and then tested
	# ``if not browser`` during cleanup, which was always false, so a browser
	# created here was never closed.
	owned_browser = None
	try:
		# Create task tracker
		tracker = TaskTracker(task.task_id, task.confirmed_task)

		if not browser:
			owned_browser = Browser()
			browser = owned_browser

		llm = ChatOpenAI(
			model='gpt-4o',
			temperature=0.0,
		)

		agent = Agent(task=task.confirmed_task, llm=llm, browser=browser)

		# Pass our hook functions
		result = await agent.run(max_steps=max_steps, on_step_start=tracker.on_step_start, on_step_end=tracker.on_step_end)

		# Save final results
		tracker.save_results()

		return result
	finally:
		# Ensure proper cleanup
		await asyncio.sleep(0.1)  # Give a moment for any pending tasks to complete
		if owned_browser is not None:
			await owned_browser.close()
|
|
|
|
|
|
def judge_task_result(model, task_folder: Path, score_threshold: float = 3) -> Dict:
	"""
	Judge a single saved task result with the Online_Mind2Web LLM judge.

	A judgement already stored in ``result.json`` is returned as-is; a freshly
	computed one is written back into that file.

	Args:
		model: Judge model exposing a blocking ``invoke(messages)`` method
		task_folder: Path to the task result folder
		score_threshold: Minimum per-screenshot score passed to the evaluation

	Returns:
		Dictionary with ``task_id``, ``judgement``, ``success``, ``error`` and ``score``.
		Any exception is reported through the ``error`` field rather than raised.
	"""
	result_file = task_folder / 'result.json'

	def failure(message):
		# Shared shape for every unsuccessful outcome.
		return {'task_id': task_folder.name, 'judgement': None, 'success': False, 'error': message, 'score': 0.0}

	if not result_file.exists():
		return failure('No result.json found')

	try:
		with open(result_file) as f:
			result = json.load(f)

		# If a Online_Mind2Web_evaluation is already saved, we can skip the eval
		cached = result.get('Online_Mind2Web_evaluation')
		if cached:
			return cached

		# Use the retry wrapper for evaluation
		eval_result = asyncio.run(
			Online_Mind2Web_eval_with_retry(
				result.get('task'),
				result.get('action_history', []),
				result.get('screenshot_paths', []),
				model,
				score_threshold,
			)
		)
		if eval_result is None:
			raise Exception('Evaluation failed after all retries')

		messages, _text, _system_msg, _record, _key_points = eval_result

		# Final steps to get judgement
		judgement = model.invoke(messages).content

		# This is the official criteria: "success" must appear after "status:"
		passed = 'success' in judgement.lower().split('status:')[1]
		evaluation = {
			'task_id': task_folder.name,
			'judgement': judgement,
			'success': passed,
			'error': None,
			'score': 1.0 if passed else 0.0,
		}

		# Save the Online_Mind2Web_evaluation into the result.json file
		result['Online_Mind2Web_evaluation'] = evaluation
		with open(result_file, 'w') as f:
			json.dump(result, f, indent=2)

		return evaluation
	except Exception as err:
		return failure(f'{type(err).__name__}: {err}')
|
|
|
|
|
|
async def evaluate_all_saved_results(args) -> Dict:
	"""
	Judge every task folder found under ``saved_trajectories`` and summarise the outcome.

	Args:
		args: Object whose ``parallel_evaluations`` attribute limits concurrent judgements.

	Returns:
		``{'error': ...}`` when the folder does not exist, otherwise a summary dictionary
		with a timestamp, task counts, success rate, average score and the per-task results.
	"""
	trajectories_dir = Path('saved_trajectories')
	if not trajectories_dir.is_dir():
		return {'error': 'No saved trajectories found'}

	# Define the model used as a judge
	model = ChatOpenAI(model='gpt-4o', temperature=0.0)

	# At most ``parallel_evaluations`` judgements run at the same time
	evaluation_slots = asyncio.Semaphore(args.parallel_evaluations)

	async def evaluate_task(task_folder: Path) -> Dict:
		async with evaluation_slots:
			# judge_task_result is blocking, so it runs in a worker thread
			outcome = await asyncio.to_thread(judge_task_result, model, task_folder)
			logger.info(f'Completed evaluation for task {task_folder.name}. Result: {outcome["success"]}')
			return outcome

	pending = [evaluate_task(entry) for entry in trajectories_dir.iterdir() if entry.is_dir()]
	judgements = await asyncio.gather(*pending)

	# Calculate summary statistics
	total_tasks = len(judgements)
	successful_tasks = sum(1 for item in judgements if item['success'])
	failed_tasks = total_tasks - successful_tasks
	if total_tasks > 0:
		average_score = sum(item['score'] for item in judgements) / total_tasks
		success_rate = successful_tasks / total_tasks
	else:
		average_score = 0
		success_rate = 0

	return {
		'timestamp': datetime.now().isoformat(),
		'total_tasks': total_tasks,
		'successful_tasks': successful_tasks,
		'failed_tasks': failed_tasks,
		'success_rate': success_rate,
		'average_score': average_score,
		'detailed_results': judgements,
	}
|
|
|
|
|
|
async def run_multiple_tasks(
	tasks: list[Task],
	max_parallel_runs: int = 3,
	max_parallel_evaluations: int = 5,
	max_steps_per_task: int = 25,
	start_index: int = 0,
	end_index: Optional[int] = None,
	headless: bool = False,
) -> Dict:
	"""
	Run multiple tasks in parallel and evaluate results.

	Tasks whose ``saved_trajectories/<task_id>/result.json`` already exists and
	can be read are skipped. Once every task has finished, all saved
	trajectories are evaluated.

	Args:
		tasks: All available tasks.
		max_parallel_runs: Maximum number of tasks run concurrently.
		max_parallel_evaluations: Maximum number of concurrent evaluations.
		max_steps_per_task: Maximum agent steps per task.
		start_index: Index of the first task to run.
		end_index: Index after the last task to run (exclusive); ``None`` means "to the end".
		headless: Whether browsers are started headless.

	Returns:
		``{'evaluation_summary': ...}`` with the result of evaluate_all_saved_results.
	"""
	from types import SimpleNamespace

	semaphore_runs = asyncio.Semaphore(max_parallel_runs)
	# Slicing with ``None`` already means "to the end". The previous truthiness
	# test treated ``end_index=0`` as unbounded instead of as an empty selection.
	tasks_to_run = tasks[start_index:end_index]

	async def run_task_with_semaphore(task: Task) -> dict:
		"""Run a single task with semaphore and error handling"""
		async with semaphore_runs:
			# Check if task has already been completed
			task_folder = Path(f'saved_trajectories/{task.task_id}')
			result_file = task_folder / 'result.json'

			if result_file.exists():
				logger.info(f'Task {task.task_id} already completed, skipping...')
				try:
					with open(result_file) as f:
						existing_result = json.load(f)
					return {
						'task_id': task.task_id,
						'success': True,
						'result': {
							'task_id': task.task_id,
							'task': task.confirmed_task,
							'is_done': existing_result.get('self_report_completed', False),
							'is_successful': existing_result.get('self_report_success', False),
							'final_result': existing_result.get('final_result_response', None),
							'errors': [],
						},
					}
				except Exception as e:
					logger.error(f'Error reading existing result for task {task.task_id}: {str(e)}')
					# If we can't read the existing result, we'll run the task again

			# Stays None until a browser exists, so cleanup cannot raise on an
			# unbound name (and thereby discard the error result) when browser
			# construction itself fails.
			browser = None
			try:
				logger.info(f'Starting task {task.task_id}')
				# Create browser with headless configuration
				browser = Browser(config=BrowserConfig(headless=headless))
				result = await run_agent_with_tracing(task=task, browser=browser, max_steps=max_steps_per_task)
				logger.info(f'Completed task {task.task_id}')

				# Extract relevant information from the agent history
				result_summary = None
				if result:
					result_summary = {
						'task_id': task.task_id,
						'task': task.confirmed_task,
						'is_done': result.is_done(),
						'is_successful': result.is_successful(),
						'final_result': result.final_result(),
						'errors': result.errors(),
					}
				return {'task_id': task.task_id, 'success': True, 'result': result_summary}
			except Exception as e:
				logger.error(f'Error in task {task.task_id}: {str(e)}')
				return {'task_id': task.task_id, 'success': False, 'error': str(e)}
			finally:
				if browser is not None:
					await browser.close()

	# Run all tasks in parallel; per-task outcomes are persisted on disk by the
	# tracker, so the gathered return values are not needed here.
	await asyncio.gather(*(run_task_with_semaphore(task) for task in tasks_to_run))

	# After all tasks are complete, evaluate the results
	logger.info('All tasks completed. Starting evaluation...')

	# evaluate_all_saved_results only reads ``args.parallel_evaluations``
	args = SimpleNamespace(parallel_evaluations=max_parallel_evaluations)
	evaluation_summary = await evaluate_all_saved_results(args)

	return {'evaluation_summary': evaluation_summary}
|
|
|
|
|
|
if __name__ == '__main__':
	# Command-line entry point: either run tasks and evaluate them, or (with
	# --evaluate-only) evaluate what is already stored in saved_trajectories.
	parser = argparse.ArgumentParser(description='Run and evaluate browser automation tasks')
	parser.add_argument('--parallel_runs', type=int, default=3, help='Number of parallel tasks to run')
	parser.add_argument('--parallel_evaluations', type=int, default=5, help='Number of parallel evaluations to run')
	parser.add_argument('--max-steps', type=int, default=25, help='Maximum steps per task')
	parser.add_argument('--start', type=int, default=0, help='Start index')
	parser.add_argument('--end', type=int, default=None, help='End index (exclusive)')
	parser.add_argument('--headless', action='store_true', help='Run in headless mode')
	parser.add_argument('--evaluate-only', action='store_true', help='Only evaluate existing results without running new tasks')
	args = parser.parse_args()

	# Set up logging
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

	# Load the .env file for both modes; previously only the run path did so,
	# leaving --evaluate-only without whatever settings that file provides.
	load_dotenv()

	if args.evaluate_only:
		# Just evaluate existing results
		logger.info('Evaluating existing results...')
		summary = asyncio.run(evaluate_all_saved_results(args))

		# evaluate_all_saved_results reports a missing folder as {'error': ...};
		# stop here instead of failing later on a missing key or missing directory.
		if 'error' in summary:
			logger.error(f'Evaluation could not be completed: {summary["error"]}')
			raise SystemExit(1)

		# Save evaluation results
		timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
		eval_file = f'saved_trajectories/evaluation_summary_{timestamp}.json'
		with open(eval_file, 'w') as f:
			json.dump(summary, f, indent=2)

		logger.info(f'Evaluation complete. Success rate: {summary["success_rate"]:.2%}')
		logger.info(f'Average score: {summary["average_score"]:.2f}')
		logger.info(f'Full results saved to {eval_file}')

	else:
		logger.info('Running tasks...')
		# Run tasks and evaluate
		with open('eval/mind2web_tasks.json', 'r') as f:
			tasks = [Task(**task) for task in json.load(f)]

		results = asyncio.run(
			run_multiple_tasks(
				tasks=tasks,
				max_parallel_runs=args.parallel_runs,
				max_parallel_evaluations=args.parallel_evaluations,
				max_steps_per_task=args.max_steps,
				start_index=args.start,
				end_index=args.end,
				headless=args.headless,
			)
		)

		summary = results['evaluation_summary']
		# Same guard as above: nothing was saved (e.g. an empty task selection).
		if 'error' in summary:
			logger.error(f'Evaluation could not be completed: {summary["error"]}')
			raise SystemExit(1)

		logger.info('Task completed. Saving results...')
		# Save results
		timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
		results_file = f'saved_trajectories/eval_results_{timestamp}.json'

		# Only the evaluation summary is written; per-task results are not included
		serializable_results = {'evaluation_summary': summary}

		with open(results_file, 'w') as f:
			json.dump(serializable_results, f, indent=2)

		# Print summary
		logger.info(f'Completed {summary["total_tasks"]} tasks.')
		logger.info(f'Success rate: {summary["success_rate"]:.2%}')
		logger.info(f'Average score: {summary["average_score"]:.2f}')
		logger.info(f'Results saved to {results_file}')
|