# browser-use/eval/service.py
# ==============================================================================================================
# Documentation for this evaluation file.
# Command to run the agent on the benchmark tasks and then evaluate the results:
# python eval/service.py --parallel_runs 5 --parallel_evaluations 5 --max-steps 25 --start 0 --end 100
# options:
# --parallel_runs: Number of parallel tasks to run
# --max-steps: Maximum steps per task
# --start: Start index
# --end: End index (exclusive)
# --headless: Run in headless mode
# Command to run only the evaluation on already-saved trajectories:
# python eval/service.py --evaluate-only
# options:
# --parallel_evaluations: Number of parallel evaluations to run
# To start a fresh run, first clear the saved_trajectories folder:
# rm -rf saved_trajectories
# Otherwise, the run will resume from the previously saved trajectories.
# ==============================================================================================================
# ==============================================================================================================
# This is the LLM-as-a-judge evaluation system from the OSU-NLP Group paper cited below.
# Any adaptations made should be explicitly stated here:
# Adaptations:
# We are using our langchain wrapper for the OpenAI API.
# This means we changed model.generate to model.invoke. The behavior of the model should be identical.
# Added an Online_Mind2Web_eval_with_retry wrapper with retry logic in case of API rate limiting or other issues.
# @article{xue2025illusionprogressassessingcurrent,
# title={An Illusion of Progress? Assessing the Current State of Web Agents},
# author={Tianci Xue and Weijian Qi and Tianneng Shi and Chan Hee Song and Boyu Gou and Dawn Song and Huan Sun and Yu Su},
# year={2025},
# eprint={2504.01382},
# archivePrefix={arXiv},
# primaryClass={cs.AI},
# url={https://arxiv.org/abs/2504.01382},
# }
# @inproceedings{deng2023mind2web,
# author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
# booktitle = {Advances in Neural Information Processing Systems},
# editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
# pages = {28091--28114},
# publisher = {Curran Associates, Inc.},
# title = {Mind2Web: Towards a Generalist Agent for the Web},
# url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5950bf290a1570ea401bf98882128160-Paper-Datasets_and_Benchmarks.pdf},
# volume = {36},
# year = {2023}
# }
# ==============================================================================================================
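# ==============================================================================================================
# Illustrative sketch (not executed): judging a single saved trajectory programmatically instead of via the
# CLI commands documented above. judge_task_result is defined further down in this file; the folder name and
# the choice of gpt-4o as the judge are assumptions for this example, and OPENAI_API_KEY must be set.
#
#   from pathlib import Path
#   from langchain_openai import ChatOpenAI
#   judge = ChatOpenAI(model='gpt-4o', temperature=0.0)
#   verdict = judge_task_result(judge, Path('saved_trajectories/some_task_id'), score_threshold=3)
#   print(verdict['success'], verdict['score'])
# ==============================================================================================================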
import asyncio
import base64
import io
import logging
import re
from PIL import Image
MAX_IMAGE = 5
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def encode_image(image):
"""Convert a PIL image to base64 string."""
if image.mode == 'RGBA':
image = image.convert('RGB')
buffered = io.BytesIO()
image.save(buffered, format='JPEG')
return base64.b64encode(buffered.getvalue()).decode('utf-8')
async def identify_key_points(task, model):
system_msg = """You are an expert tasked with analyzing a given task to identify the key points explicitly stated in the task description.
**Objective**: Carefully analyze the task description and extract the critical elements explicitly mentioned in the task for achieving its goal.
**Instructions**:
1. Read the task description carefully.
2. Identify and extract **key points** directly stated in the task description.
- A **key point** is a critical element, condition, or step explicitly mentioned in the task description.
- Do not infer or add any unstated elements.
- Words such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" must go through the sort function(e.g., the key point should be "Filter by highest").
**Respond with**:
- **Key Points**: A numbered list of the explicit key points for completing this task, one per line, without explanations or additional details."""
prompt = """Task: {task}"""
text = prompt.format(task=task)
messages = [
{'role': 'system', 'content': system_msg},
{
'role': 'user',
'content': [{'type': 'text', 'text': text}],
},
]
response = await asyncio.to_thread(model.invoke, messages)
return response.content
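# Illustrative (made-up) example of the response format identify_key_points is expected to return, which is
# why Online_Mind2Web_eval below splits the text on '**Key Points**:' before cleaning it up line by line:
#   **Key Points**:
#   1. Search for one-way flights from Zurich to London.
#   2. Filter by lowest price.
#   3. Select the departure date of next Monday.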
async def judge_image(task, image_path, key_points, model):
system_msg = """You are an expert evaluator tasked with determining whether an image contains information about the necessary steps to complete a task.
**Objective**: Analyze the provided image and decide if it shows essential steps or evidence required for completing the task. Use your reasoning to explain your decision before assigning a score.
**Instructions**:
1. Provide a detailed description of the image, including its contents, visible elements, text (if any), and any notable features.
2. Carefully examine the image and evaluate whether it contains necessary steps or evidence crucial to task completion:
- Identify key points that could be relevant to task completion, such as actions, progress indicators, tool usage, applied filters, or step-by-step instructions.
- Does the image show actions, progress indicators, or critical information directly related to completing the task?
- Is this information indispensable for understanding or ensuring task success?
- If the image contains partial but relevant information, consider its usefulness rather than dismissing it outright.
3. Provide your response in the following format:
- **Reasoning**: Explain your thought process and observations. Mention specific elements in the image that indicate necessary steps, evidence, or lack thereof.
- **Score**: Assign a score based on the reasoning, using the following scale:
- **1**: The image does not contain any necessary steps or relevant information.
- **2**: The image contains minimal or ambiguous information, unlikely to be essential.
- **3**: The image includes some relevant steps or hints but lacks clarity or completeness.
- **4**: The image contains important steps or evidence that are highly relevant but not fully comprehensive.
- **5**: The image clearly displays necessary steps or evidence crucial for completing the task.
Respond with:
1. **Reasoning**: [Your explanation]
2. **Score**: [1-5]"""
jpg_base64_str = encode_image(Image.open(image_path))
prompt = """**Task**: {task}
**Key Points for Task Completion**: {key_points}
The snapshot of the web page is shown in the image."""
text = prompt.format(task=task, key_points=key_points)
messages = [
{'role': 'system', 'content': system_msg},
{
'role': 'user',
'content': [
{'type': 'text', 'text': text},
{
'type': 'image_url',
'image_url': {'url': f'data:image/jpeg;base64,{jpg_base64_str}', 'detail': 'high'},
},
],
},
]
response = await asyncio.to_thread(model.invoke, messages)
return response.content
async def Online_Mind2Web_eval(task, last_actions, images_path, model, score_threshold):
system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task, the agent's action history, key points for task completion, some potentially important web pages in the agent's trajectory and their reasons, your goal is to determine whether the agent has completed the task and achieved all requirements.
Your response must strictly follow the following evaluation criteria!
*Important Evaluation Criteria*:
1: The filtered results must be displayed correctly. If filters were not properly applied (i.e., missing selection, missing confirmation, or no visible effect in results), the task is not considered successful.
2: You must carefully check whether these snapshots and action history meet these key points. Ensure that specific filter conditions, such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" are correctly applied using the filter function(e.g., sort function).
3: Certain key points or requirements should be applied by the filter. Otherwise, a search with all requirements as input will be deemed a failure since it cannot guarantee that all results meet the requirements!
4: If the task requires filtering by a specific range of money, years, or the number of beds and bathrooms, the applied filter must exactly match the given requirement. Any deviation results in failure. To ensure the task is successful, the applied filter must precisely match the specified range without being too broad or too narrow.
Examples of Failure Cases:
- If the requirement is less than $50, but the applied filter is less than $25, it is a failure.
- If the requirement is $1500-$2500, but the applied filter is $2000-$2500, it is a failure.
- If the requirement is $25-$200, but the applied filter is $0-$200, it is a failure.
- If the required years are 2004-2012, but the filter applied is 2001-2012, it is a failure.
- If the required years are before 2015, but the applied filter is 2000-2014, it is a failure.
- If the task requires exactly 2 beds, but the filter applied is 2+ beds, it is a failure.
5: Some tasks require a submission action or a display of results to be considered successful.
6: If the retrieved information is invalid or empty(e.g., No match was found), but the agent has correctly performed the required action, it should still be considered successful.
7: If the current page already displays all available items, then applying a filter is not necessary. As long as the agent selects items that meet the requirements (e.g., the cheapest or lowest price), the task is still considered successful.
*IMPORTANT*
Format your response into two lines as shown below:
Thoughts: <your thoughts and reasoning process based on double-checking each key points and the evaluation criteria>
Status: "success" or "failure"
"""
prompt = """User Task: {task}
Key Points: {key_points}
Action History:
{last_actions}
The potentially important snapshots of the webpage in the agent's trajectory and their reasons:
{thoughts}"""
key_points = await identify_key_points(task, model)
key_points = key_points.replace('\n\n', '\n')
try:
key_points = key_points.split('**Key Points**:')[1]
key_points = '\n'.join(line.lstrip() for line in key_points.splitlines())
except IndexError:
key_points = key_points.split('Key Points:')[-1]
key_points = '\n'.join(line.lstrip() for line in key_points.splitlines())
tasks = [judge_image(task, image_path, key_points, model) for image_path in images_path]
image_responses = await asyncio.gather(*tasks)
whole_content_img = []
whole_thoughts = []
record = []
pattern = r'[1-5]'
for response, image_path in zip(image_responses, images_path):
try:
score_text = response.split('Score')[1]
thought = response.split('**Reasoning**:')[-1].strip().lstrip('\n').split('\n\n')[0].replace('\n', ' ')
score = re.findall(pattern, score_text)[0]
record.append({'Response': response, 'Score': int(score)})
except Exception as e:
logger.error(f'Error processing response: {e}')
score = 0
record.append({'Response': response, 'Score': 0})
if int(score) >= score_threshold:
jpg_base64_str = encode_image(Image.open(image_path))
whole_content_img.append(
{'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{jpg_base64_str}', 'detail': 'high'}}
)
if thought != '':
whole_thoughts.append(thought)
whole_content_img = whole_content_img[:MAX_IMAGE]
whole_thoughts = whole_thoughts[:MAX_IMAGE]
if len(whole_content_img) == 0:
prompt = """User Task: {task}
Key Points: {key_points}
Action History:
{last_actions}"""
text = prompt.format(
task=task,
last_actions='\n'.join(f'{i + 1}. {action}' for i, action in enumerate(last_actions)),
key_points=key_points,
thoughts='\n'.join(f'{i + 1}. {thought}' for i, thought in enumerate(whole_thoughts)),
)
messages = [
{'role': 'system', 'content': system_msg},
{'role': 'user', 'content': [{'type': 'text', 'text': text}] + whole_content_img},
]
return messages, text, system_msg, record, key_points
async def Online_Mind2Web_eval_with_retry(task, last_actions, images_path, model, score_threshold, max_retries=3):
"""
Wrapper for Online_Mind2Web_eval with retry logic.
Args:
task: The task description
last_actions: list of actions taken
images_path: list of image paths
model: The model to use for evaluation
score_threshold: Score threshold for image filtering
max_retries: Maximum number of retry attempts
Returns:
Tuple of (messages, text, system_msg, record, key_points) or None if all retries fail
"""
for attempt in range(max_retries):
try:
return await Online_Mind2Web_eval(task, last_actions, images_path, model, score_threshold)
except Exception as e:
if attempt == max_retries - 1: # Last attempt
logger.error(f'Failed to evaluate after {max_retries} attempts. Error: {str(e)}')
raise
logger.warning(f'Attempt {attempt + 1} failed. Retrying... Error: {str(e)}')
await asyncio.sleep(2**attempt) # Exponential backoff
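# Illustrative sketch (not executed) of how a caller is expected to use the two functions above: they only
# build the judge prompt; the caller still has to invoke the judge model on the returned messages and parse
# the 'Status:' line, as judge_task_result does further down. The input names here are assumptions for the
# example, and the snippet would run inside an async function.
#
#   messages, text, system_msg, record, key_points = await Online_Mind2Web_eval_with_retry(
#       task_description, action_history, screenshot_paths, judge_model, score_threshold=3
#   )
#   judgement = judge_model.invoke(messages).content
#   success = 'success' in judgement.lower().split('status:')[1]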
# ==============================================================================================================
# ==============================================================================================================
# A service for evaluating the performance of the agent
# ==============================================================================================================
import argparse
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from browser_use import Agent, Browser, BrowserConfig
class Task:
def __init__(self, task_id, confirmed_task, website, reference_length, level):
self.task_id = task_id
self.confirmed_task = confirmed_task
self.website = website
self.reference_length = reference_length
self.level = level
def __str__(self):
return f'Task(task_id={self.task_id}, confirmed_task={self.confirmed_task}, website={self.website}, reference_length={self.reference_length}, level={self.level})'
def __repr__(self):
return self.__str__()
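# Illustrative (made-up) entry of eval/mind2web_tasks.json, which the __main__ block below loads with
# Task(**task). Only the field names come from the Task constructor above; the values are examples.
#   {
#     "task_id": "example_task_000",
#     "confirmed_task": "Find the cheapest one-way flight from Zurich to London next Monday.",
#     "website": "https://www.example.com",
#     "reference_length": 7,
#     "level": "medium"
#   }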
class TaskTracker:
def __init__(self, task_id: str, task_text: str):
self.task_id = task_id
self.task_text = task_text
self.result_folder = Path(f'saved_trajectories/{task_id}')
self.trajectory_folder = self.result_folder / 'trajectory'
self.step_results = []
self.step_counter = 0
self.screenshots = []
self.setup_folders()
def setup_folders(self):
"""Create the necessary folder structure"""
self.result_folder.mkdir(parents=True, exist_ok=True)
self.trajectory_folder.mkdir(parents=True, exist_ok=True)
async def on_step_start(self, agent):
"""Record information at the start of a step"""
self.current_step = {'step_number': self.step_counter, 'start_time': datetime.now().isoformat(), 'actions': []}
async def on_step_end(self, agent):
"""Record information at the end of a step"""
# Take screenshot
browser_context = agent.browser_context
screenshot_b64 = await browser_context.take_screenshot()
screenshot_path = self.trajectory_folder / f'step_{self.step_counter}.png'
# Save screenshot to file
with open(screenshot_path, 'wb') as f:
f.write(base64.b64decode(screenshot_b64))
# Save screenshot path
self.screenshots.append(str(screenshot_path))
# Record action and result
if agent.state.last_result:
for result in agent.state.last_result:
self.current_step['actions'].append(
{
'content': result.extracted_content,
'error': result.error,
'is_done': result.is_done,
'success': result.success,
}
)
# Record end time
self.current_step['end_time'] = datetime.now().isoformat()
self.current_step['screenshot_path'] = str(screenshot_path)
# Add to step results
self.step_results.append(self.current_step)
self.step_counter += 1
# Save intermediate results
self.save_results() # Save progress after each step
def save_results(self):
"""Save the consolidated results"""
# Create the final result object
formatted_result = {
'task_id': self.task_id,
'task': self.task_text,
'steps': self.step_results,
'action_history': [step['actions'][-1]['content'] for step in self.step_results],
'screenshot_paths': self.screenshots,
'final_result_response': (
last_action['content'] if (last_action := self.step_results[-1]['actions'][-1])['is_done'] else None
),
'self_report_completed': self.step_results[-1]['actions'][-1]['is_done'],
'self_report_success': self.step_results[-1]['actions'][-1]['success'],
}
# Save to file
with open(self.result_folder / 'result.json', 'w') as f:
json.dump(formatted_result, f, indent=2)
return formatted_result
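# Illustrative (abridged, made-up values) shape of the result.json written above for one task; the
# evaluation step later adds an 'Online_Mind2Web_evaluation' key to this same file.
#   {
#     "task_id": "example_task_000",
#     "task": "Find the cheapest one-way flight ...",
#     "steps": [{"step_number": 0, "start_time": "...", "actions": [...], "end_time": "...", "screenshot_path": "..."}],
#     "action_history": ["Clicked the search button", "..."],
#     "screenshot_paths": ["saved_trajectories/example_task_000/trajectory/step_0.png"],
#     "final_result_response": "...",
#     "self_report_completed": true,
#     "self_report_success": true,
#     "Online_Mind2Web_evaluation": {"task_id": "...", "judgement": "...", "success": true, "error": null, "score": 1.0}
#   }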
async def run_agent_with_tracing(task: Task, browser: Browser | None = None, max_steps: int = 25):
	# Remember whether we created the browser here, so cleanup only runs in that case
	owns_browser = browser is None
	agent = None
	try:
		# Create task tracker
		tracker = TaskTracker(task.task_id, task.confirmed_task)
		browser = browser or Browser()
		llm = ChatOpenAI(
			model='gpt-4o',
			temperature=0.0,
		)
		agent = Agent(task=task.confirmed_task, llm=llm, browser=browser)
		# Pass our hook functions
		result = await agent.run(max_steps=max_steps, on_step_start=tracker.on_step_start, on_step_end=tracker.on_step_end)
		# Save final results
		tracker.save_results()
		return result
	finally:
		# Ensure proper cleanup
		await asyncio.sleep(0.1)  # Give a moment for any pending tasks to complete
		if owns_browser and agent is not None:
			await agent.close()  # This will close the browser only if we created it here
def judge_task_result(model, task_folder: Path, score_threshold: float = 3) -> Dict:
"""
	Judge a single saved task result using the Online_Mind2Web LLM-as-a-judge evaluation.
	Args:
		model: The langchain chat model used as the judge
		task_folder: Path to the task result folder
		score_threshold: Minimum image score for a screenshot to be included in the judge prompt
	Returns:
		Dictionary containing judgement results
"""
result_file = task_folder / 'result.json'
if not result_file.exists():
return {'task_id': task_folder.name, 'judgement': None, 'success': False, 'error': 'No result.json found', 'score': 0.0}
try:
with open(result_file) as f:
result = json.load(f)
		# If an Online_Mind2Web_evaluation is already saved, we can skip the eval
if result.get('Online_Mind2Web_evaluation'):
return result.get('Online_Mind2Web_evaluation')
# Get the screenshot paths, task description, and action history
screenshot_paths = result.get('screenshot_paths', [])
task_description = result.get('task')
action_history = result.get('action_history', [])
# Use the retry wrapper for evaluation
try:
eval_result = asyncio.run(
Online_Mind2Web_eval_with_retry(task_description, action_history, screenshot_paths, model, score_threshold)
)
if eval_result is None:
raise Exception('Evaluation failed after all retries')
messages, text, system_msg, record, key_points = eval_result
# Final steps to get judgement
judgement = model.invoke(messages).content
if 'success' in judgement.lower().split('status:')[1]: # This is the official criteria for success
evaluation = {'task_id': task_folder.name, 'judgement': judgement, 'success': True, 'error': None, 'score': 1.0}
else: # This is the official criteria for failure
evaluation = {'task_id': task_folder.name, 'judgement': judgement, 'success': False, 'error': None, 'score': 0.0}
# Save the Online_Mind2Web_evaluation into the result.json file
result['Online_Mind2Web_evaluation'] = evaluation
with open(result_file, 'w') as f:
json.dump(result, f, indent=2)
return evaluation
except Exception as err:
return {
'task_id': task_folder.name,
'judgement': None,
'success': False,
'error': f'{type(err).__name__}: {err}',
'score': 0.0,
}
except Exception as err:
return {
'task_id': task_folder.name,
'judgement': None,
'success': False,
'error': f'{type(err).__name__}: {err}',
'score': 0.0,
}
async def evaluate_all_saved_results(args) -> Dict:
"""
Evaluate all completed tasks in the saved_trajectories folder.
Returns:
Dictionary containing evaluation summary
"""
trajectories_dir = Path('saved_trajectories')
if not trajectories_dir.is_dir():
return {'error': 'No saved trajectories found'}
# Define the model used as a judge
model = ChatOpenAI(
model='gpt-4o',
temperature=0.0,
)
# Create a semaphore to limit concurrent evaluations
semaphore = asyncio.Semaphore(args.parallel_evaluations)
async def evaluate_task(task_folder: Path) -> Dict:
async with semaphore:
judgement = await asyncio.to_thread(judge_task_result, model, task_folder)
logger.info(f'Completed evaluation for task {task_folder.name}. Result: {judgement["success"]}')
return judgement
# Get all task folders
task_folders = [f for f in trajectories_dir.iterdir() if f.is_dir()]
# Run evaluations in parallel
judgements = await asyncio.gather(*[evaluate_task(folder) for folder in task_folders])
# Calculate summary statistics
total_tasks = len(judgements)
successful_tasks = sum(1 for j in judgements if j['success'])
failed_tasks = sum(1 for j in judgements if not j['success'])
average_score = sum(j['score'] for j in judgements) / total_tasks if total_tasks > 0 else 0
summary = {
'timestamp': datetime.now().isoformat(),
'total_tasks': total_tasks,
'successful_tasks': successful_tasks,
'failed_tasks': failed_tasks,
'success_rate': successful_tasks / total_tasks if total_tasks > 0 else 0,
'average_score': average_score,
'detailed_results': judgements,
}
return summary
async def run_multiple_tasks(
tasks: list[Task],
max_parallel_runs: int = 3,
max_parallel_evaluations: int = 5,
max_steps_per_task: int = 25,
start_index: int = 0,
end_index: Optional[int] = None,
headless: bool = False,
) -> Dict:
"""
Run multiple tasks in parallel and evaluate results.
"""
semaphore_runs = asyncio.Semaphore(max_parallel_runs)
tasks_to_run = tasks[start_index:end_index] if end_index else tasks[start_index:]
async def run_task_with_semaphore(task: Task) -> dict:
"""Run a single task with semaphore and error handling"""
async with semaphore_runs:
# Check if task has already been completed
task_folder = Path(f'saved_trajectories/{task.task_id}')
result_file = task_folder / 'result.json'
if result_file.exists():
logger.info(f'Task {task.task_id} already completed, skipping...')
try:
with open(result_file) as f:
existing_result = json.load(f)
return {
'task_id': task.task_id,
'success': True,
'result': {
'task_id': task.task_id,
'task': task.confirmed_task,
'is_done': existing_result.get('self_report_completed', False),
'is_successful': existing_result.get('self_report_success', False),
'final_result': existing_result.get('final_result_response', None),
'errors': [],
},
}
except Exception as e:
logger.error(f'Error reading existing result for task {task.task_id}: {str(e)}')
# If we can't read the existing result, we'll run the task again
try:
logger.info(f'Starting task {task.task_id}')
# Create browser with headless configuration
browserConfig = BrowserConfig(headless=headless)
browser = Browser(config=browserConfig)
result = await run_agent_with_tracing(task=task, browser=browser, max_steps=max_steps_per_task)
logger.info(f'Completed task {task.task_id}')
# Extract relevant information from the agent history
task_result = {
'task_id': task.task_id,
'success': True,
'result': {
'task_id': task.task_id,
'task': task.confirmed_task,
# "history": result.model_dump() if result else None,
'is_done': result.is_done() if result else False,
'is_successful': result.is_successful() if result else None,
'final_result': result.final_result() if result else None,
'errors': result.errors() if result else [],
}
if result
else None,
}
return task_result
except Exception as e:
logger.error(f'Error in task {task.task_id}: {str(e)}')
return {'task_id': task.task_id, 'success': False, 'error': str(e)}
finally:
await browser.close()
# Run all tasks in parallel
task_results = await asyncio.gather(*(run_task_with_semaphore(task) for task in tasks_to_run))
# After all tasks are complete, evaluate the results
logger.info('All tasks completed. Starting evaluation...')
# Create a namespace object to pass parallel_evaluations to evaluate_all_saved_results
class ArgsNamespace:
def __init__(self, parallel_evaluations):
self.parallel_evaluations = parallel_evaluations
args = ArgsNamespace(parallel_evaluations=max_parallel_evaluations)
evaluation_summary = await evaluate_all_saved_results(args)
return {
# "task_results": task_results,
'evaluation_summary': evaluation_summary
}
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Run and evaluate browser automation tasks')
parser.add_argument('--parallel_runs', type=int, default=3, help='Number of parallel tasks to run')
parser.add_argument('--parallel_evaluations', type=int, default=5, help='Number of parallel evaluations to run')
parser.add_argument('--max-steps', type=int, default=25, help='Maximum steps per task')
parser.add_argument('--start', type=int, default=0, help='Start index')
parser.add_argument('--end', type=int, default=None, help='End index (exclusive)')
parser.add_argument('--headless', action='store_true', help='Run in headless mode')
parser.add_argument('--evaluate-only', action='store_true', help='Only evaluate existing results without running new tasks')
args = parser.parse_args()
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
if args.evaluate_only:
# Just evaluate existing results
logger.info('Evaluating existing results...')
summary = asyncio.run(evaluate_all_saved_results(args))
# Save evaluation results
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
eval_file = f'saved_trajectories/evaluation_summary_{timestamp}.json'
with open(eval_file, 'w') as f:
json.dump(summary, f, indent=2)
logger.info(f'Evaluation complete. Success rate: {summary["success_rate"]:.2%}')
logger.info(f'Average score: {summary["average_score"]:.2f}')
logger.info(f'Full results saved to {eval_file}')
else:
logger.info('Running tasks...')
# Run tasks and evaluate
load_dotenv()
with open('eval/mind2web_tasks.json', 'r') as f:
tasks = [Task(**task) for task in json.load(f)]
results = asyncio.run(
run_multiple_tasks(
tasks=tasks,
max_parallel_runs=args.parallel_runs,
max_parallel_evaluations=args.parallel_evaluations,
max_steps_per_task=args.max_steps,
start_index=args.start,
end_index=args.end,
headless=args.headless,
)
)
logger.info('Task completed. Saving results...')
# Save results
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_file = f'saved_trajectories/eval_results_{timestamp}.json'
# Convert results to JSON-serializable format
serializable_results = {
# "task_results": results["task_results"],
'evaluation_summary': results['evaluation_summary']
}
with open(results_file, 'w') as f:
json.dump(serializable_results, f, indent=2)
# Print summary
summary = results['evaluation_summary']
logger.info(f'Completed {summary["total_tasks"]} tasks.')
logger.info(f'Success rate: {summary["success_rate"]:.2%}')
logger.info(f'Average score: {summary["average_score"]:.2f}')
logger.info(f'Results saved to {results_file}')