# ==============================================================================================================
# Documentation for this evaluation file.
#
# The import
#
# Here is the command to run the evaluation:
# python eval/service.py --parallel_runs 5 --parallel_evaluations 5 --max-steps 25 --start 0 --end 100 --model gpt-4o
#
# Options:
#   --parallel_runs: Number of parallel tasks to run
#   --max-steps: Maximum steps per task
#   --start: Start index
#   --end: End index (exclusive)
#   --headless: Run in headless mode
#
# Here is the command to run the evaluation only:
# python eval/service.py --evaluate-only
#
# Options:
#   --parallel_evaluations: Number of parallel evaluations to run
# ==============================================================================================================


# ==============================================================================================================
# This is the LLM-as-a-judge evaluation system from the OSU-NLP Group paper cited below.
# Any adaptations made should be explicitly stated here.
#
# Adaptations:
# - We are using our LangChain wrapper for the OpenAI API, so model.generate became model.invoke.
#   The behavior of the model should be identical (a short usage sketch follows the citations below).
# - Added an Online_Mind2Web_eval_with_retry wrapper with retry logic in case of API rate limiting or other issues.
#
# @article{xue2025illusionprogressassessingcurrent,
#   title={An Illusion of Progress? Assessing the Current State of Web Agents},
#   author={Tianci Xue and Weijian Qi and Tianneng Shi and Chan Hee Song and Boyu Gou and Dawn Song and Huan Sun and Yu Su},
#   year={2025},
#   eprint={2504.01382},
#   archivePrefix={arXiv},
#   primaryClass={cs.AI},
#   url={https://arxiv.org/abs/2504.01382},
# }
#
# @inproceedings{deng2023mind2web,
#   author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
#   booktitle = {Advances in Neural Information Processing Systems},
#   editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
#   pages = {28091--28114},
#   publisher = {Curran Associates, Inc.},
#   title = {Mind2Web: Towards a Generalist Agent for the Web},
#   url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5950bf290a1570ea401bf98882128160-Paper-Datasets_and_Benchmarks.pdf},
#   volume = {36},
#   year = {2023}
# }
# ==============================================================================================================
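
# Example (sketch of the adaptation noted above): where the original OSU-NLP code called
# `model.generate(...)`, the LangChain chat models used here are called as:
#
#   response = model.invoke(messages)   # synchronous call returning an AIMessage
#   text = response.content             # the judge's raw text output
#
# Inside the async helpers below the same call is pushed onto a worker thread with
# `asyncio.to_thread(model.invoke, messages)` so it does not block the event loop.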


import asyncio
import base64
import io
import logging
import re
import shutil

import anyio
from PIL import Image

MAX_IMAGE = 5

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def encode_image(image):
    """Convert a PIL image to base64 string."""
    if image.mode == 'RGBA':
        image = image.convert('RGB')
    buffered = io.BytesIO()
    image.save(buffered, format='JPEG')
    return base64.b64encode(buffered.getvalue()).decode('utf-8')
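
# Usage sketch: `encode_image(Image.open(path))` yields a base64 JPEG string, which the judge
# prompts below embed as f'data:image/jpeg;base64,{...}' in an image_url content part.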
|
|
|
|
|
|
async def identify_key_points(task, model):
|
|
system_msg = """You are an expert tasked with analyzing a given task to identify the key points explicitly stated in the task description.
|
|
|
|
**Objective**: Carefully analyze the task description and extract the critical elements explicitly mentioned in the task for achieving its goal.
|
|
|
|
**Instructions**:
|
|
1. Read the task description carefully.
|
|
2. Identify and extract **key points** directly stated in the task description.
|
|
- A **key point** is a critical element, condition, or step explicitly mentioned in the task description.
|
|
- Do not infer or add any unstated elements.
|
|
- Words such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" must go through the sort function(e.g., the key point should be "Filter by highest").
|
|
|
|
**Respond with**:
|
|
- **Key Points**: A numbered list of the explicit key points for completing this task, one per line, without explanations or additional details."""
|
|
prompt = """Task: {task}"""
|
|
text = prompt.format(task=task)
|
|
messages = [
|
|
{'role': 'system', 'content': system_msg},
|
|
{
|
|
'role': 'user',
|
|
'content': [{'type': 'text', 'text': text}],
|
|
},
|
|
]
|
|
response = await asyncio.to_thread(model.invoke, messages)
|
|
return response.content
|
|
|
|
|
|
async def judge_image(task, image_path, key_points, model):
|
|
system_msg = """You are an expert evaluator tasked with determining whether an image contains information about the necessary steps to complete a task.
|
|
|
|
**Objective**: Analyze the provided image and decide if it shows essential steps or evidence required for completing the task. Use your reasoning to explain your decision before assigning a score.
|
|
|
|
**Instructions**:
|
|
1. Provide a detailed description of the image, including its contents, visible elements, text (if any), and any notable features.
|
|
|
|
2. Carefully examine the image and evaluate whether it contains necessary steps or evidence crucial to task completion:
|
|
- Identify key points that could be relevant to task completion, such as actions, progress indicators, tool usage, applied filters, or step-by-step instructions.
|
|
- Does the image show actions, progress indicators, or critical information directly related to completing the task?
|
|
- Is this information indispensable for understanding or ensuring task success?
|
|
- If the image contains partial but relevant information, consider its usefulness rather than dismissing it outright.
|
|
|
|
3. Provide your response in the following format:
|
|
- **Reasoning**: Explain your thought process and observations. Mention specific elements in the image that indicate necessary steps, evidence, or lack thereof.
|
|
- **Score**: Assign a score based on the reasoning, using the following scale:
|
|
- **1**: The image does not contain any necessary steps or relevant information.
|
|
- **2**: The image contains minimal or ambiguous information, unlikely to be essential.
|
|
- **3**: The image includes some relevant steps or hints but lacks clarity or completeness.
|
|
- **4**: The image contains important steps or evidence that are highly relevant but not fully comprehensive.
|
|
- **5**: The image clearly displays necessary steps or evidence crucial for completing the task.
|
|
|
|
Respond with:
|
|
1. **Reasoning**: [Your explanation]
|
|
2. **Score**: [1-5]"""
|
|
|
|
jpg_base64_str = encode_image(Image.open(image_path))
|
|
|
|
prompt = """**Task**: {task}
|
|
|
|
**Key Points for Task Completion**: {key_points}
|
|
|
|
The snapshot of the web page is shown in the image."""
|
|
text = prompt.format(task=task, key_points=key_points)
|
|
|
|
messages = [
|
|
{'role': 'system', 'content': system_msg},
|
|
{
|
|
'role': 'user',
|
|
'content': [
|
|
{'type': 'text', 'text': text},
|
|
{
|
|
'type': 'image_url',
|
|
'image_url': {'url': f'data:image/jpeg;base64,{jpg_base64_str}', 'detail': 'high'},
|
|
},
|
|
],
|
|
},
|
|
]
|
|
response = await asyncio.to_thread(model.invoke, messages)
|
|
return response.content
|
|
|
|
|
|
async def Online_Mind2Web_eval(task, last_actions, images_path, model, score_threshold):
|
|
system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task, the agent's action history, key points for task completion, some potentially important web pages in the agent's trajectory and their reasons, your goal is to determine whether the agent has completed the task and achieved all requirements.
|
|
|
|
Your response must strictly follow the following evaluation criteria!
|
|
*Important Evaluation Criteria*:
|
|
1: The filtered results must be displayed correctly. If filters were not properly applied (i.e., missing selection, missing confirmation, or no visible effect in results), the task is not considered successful.
|
|
2: You must carefully check whether these snapshots and action history meet these key points. Ensure that specific filter conditions, such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" are correctly applied using the filter function(e.g., sort function).
|
|
3: Certain key points or requirements should be applied by the filter. Otherwise, a search with all requirements as input will be deemed a failure since it cannot guarantee that all results meet the requirements!
|
|
4: If the task requires filtering by a specific range of money, years, or the number of beds and bathrooms, the applied filter must exactly match the given requirement. Any deviation results in failure. To ensure the task is successful, the applied filter must precisely match the specified range without being too broad or too narrow.
|
|
Examples of Failure Cases:
|
|
- If the requirement is less than $50, but the applied filter is less than $25, it is a failure.
|
|
- If the requirement is $1500-$2500, but the applied filter is $2000-$2500, it is a failure.
|
|
- If the requirement is $25-$200, but the applied filter is $0-$200, it is a failure.
|
|
- If the required years are 2004-2012, but the filter applied is 2001-2012, it is a failure.
|
|
- If the required years are before 2015, but the applied filter is 2000-2014, it is a failure.
|
|
- If the task requires exactly 2 beds, but the filter applied is 2+ beds, it is a failure.
|
|
5: Some tasks require a submission action or a display of results to be considered successful.
|
|
6: If the retrieved information is invalid or empty(e.g., No match was found), but the agent has correctly performed the required action, it should still be considered successful.
|
|
7: If the current page already displays all available items, then applying a filter is not necessary. As long as the agent selects items that meet the requirements (e.g., the cheapest or lowest price), the task is still considered successful.
|
|
|
|
*IMPORTANT*
|
|
Format your response into two lines as shown below:
|
|
|
|
Thoughts: <your thoughts and reasoning process based on double-checking each key points and the evaluation criteria>
|
|
Status: "success" or "failure"
|
|
"""
|
|
prompt = """User Task: {task}
|
|
|
|
Key Points: {key_points}
|
|
|
|
Action History:
|
|
{last_actions}
|
|
|
|
The potentially important snapshots of the webpage in the agent's trajectory and their reasons:
|
|
{thoughts}"""
|
|
|
|
key_points = await identify_key_points(task, model)
|
|
key_points = key_points.replace('\n\n', '\n')
|
|
|
|
try:
|
|
key_points = key_points.split('**Key Points**:')[1]
|
|
key_points = '\n'.join(line.lstrip() for line in key_points.splitlines())
|
|
except IndexError:
|
|
key_points = key_points.split('Key Points:')[-1]
|
|
key_points = '\n'.join(line.lstrip() for line in key_points.splitlines())
|
|
|
|
tasks = [judge_image(task, image_path, key_points, model) for image_path in images_path]
|
|
image_responses = await asyncio.gather(*tasks)
|
|
|
|
whole_content_img = []
|
|
whole_thoughts = []
|
|
record = []
|
|
pattern = r'[1-5]'
|
|
for response, image_path in zip(image_responses, images_path):
|
|
try:
|
|
score_text = response.split('Score')[1]
|
|
thought = response.split('**Reasoning**:')[-1].strip().lstrip('\n').split('\n\n')[0].replace('\n', ' ')
|
|
score = re.findall(pattern, score_text)[0]
|
|
record.append({'Response': response, 'Score': int(score)})
|
|
        except Exception as e:
            logger.error(f'Error processing response: {type(e).__name__}: {e}')
            score = 0
            thought = ''  # keep `thought` defined so the threshold check below cannot raise NameError
            record.append({'Response': response, 'Score': 0})
|
|
|
|
if int(score) >= score_threshold:
|
|
jpg_base64_str = encode_image(Image.open(image_path))
|
|
whole_content_img.append(
|
|
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{jpg_base64_str}', 'detail': 'high'}}
|
|
)
|
|
if thought != '':
|
|
whole_thoughts.append(thought)
|
|
|
|
whole_content_img = whole_content_img[:MAX_IMAGE]
|
|
whole_thoughts = whole_thoughts[:MAX_IMAGE]
|
|
if len(whole_content_img) == 0:
|
|
prompt = """User Task: {task}
|
|
|
|
Key Points: {key_points}
|
|
|
|
Action History:
|
|
{last_actions}"""
|
|
text = prompt.format(
|
|
task=task,
|
|
last_actions='\n'.join(f'{i + 1}. {action}' for i, action in enumerate(last_actions)),
|
|
key_points=key_points,
|
|
thoughts='\n'.join(f'{i + 1}. {thought}' for i, thought in enumerate(whole_thoughts)),
|
|
)
|
|
|
|
messages = [
|
|
{'role': 'system', 'content': system_msg},
|
|
{'role': 'user', 'content': [{'type': 'text', 'text': text}] + whole_content_img},
|
|
]
|
|
return messages, text, system_msg, record, key_points
|
|
|
|
|
|
async def Online_Mind2Web_eval_with_retry(task, last_actions, images_path, model, score_threshold, max_retries=3):
|
|
"""
|
|
Wrapper for Online_Mind2Web_eval with retry logic.
|
|
|
|
Args:
|
|
task: The task description
|
|
last_actions: list of actions taken
|
|
images_path: list of image paths
|
|
model: The model to use for evaluation
|
|
score_threshold: Score threshold for image filtering
|
|
max_retries: Maximum number of retry attempts
|
|
|
|
Returns:
|
|
Tuple of (messages, text, system_msg, record, key_points) or None if all retries fail
|
|
"""
|
|
for attempt in range(max_retries):
|
|
try:
|
|
return await Online_Mind2Web_eval(task, last_actions, images_path, model, score_threshold)
|
|
except Exception as e:
|
|
if attempt == max_retries - 1: # Last attempt
|
|
logger.error(f'Failed to evaluate after {max_retries} attempts. Error: {type(e).__name__}: {str(e)}')
|
|
raise
|
|
logger.warning(f'Attempt {attempt + 1} failed. Retrying... Error: {type(e).__name__}: {str(e)}')
|
|
await asyncio.sleep(2**attempt) # Exponential backoff
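
# Standalone usage sketch (assumes a vision-capable LangChain chat model and screenshots on disk;
# the task text and paths here are illustrative only):
#
#   model = ChatOpenAI(model='gpt-4o', temperature=0.0)
#   messages, text, system_msg, record, key_points = await Online_Mind2Web_eval_with_retry(
#       task='Find the cheapest one-way flight from A to B',
#       last_actions=['Searched flights', 'Sorted results by price'],
#       images_path=['saved_trajectories/<task_id>/trajectory/step_0.png'],
#       model=model,
#       score_threshold=3,
#   )
#   judgement = model.invoke(messages).content   # final "Thoughts / Status" verdict
#
# This is essentially the sequence `judge_task_result` below performs for each saved trajectory.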
|
|
|
|
|
|
# ==============================================================================================================
|
|
|
|
|
|
# ==============================================================================================================
|
|
# A service for evaluating the performance of the agent
|
|
# ==============================================================================================================
|
|
import argparse
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from dotenv import load_dotenv
|
|
from langchain_anthropic import ChatAnthropic
|
|
from langchain_core.language_models.chat_models import BaseChatModel
|
|
from langchain_google_genai import ChatGoogleGenerativeAI
|
|
from langchain_openai import ChatOpenAI
|
|
from pydantic.types import SecretStr
|
|
|
|
from browser_use import Agent, Browser, BrowserConfig
|
|
|
|
SUPPORTED_MODELS = {
|
|
# Anthropic
|
|
'claude-3.5-sonnet': {
|
|
'provider': 'anthropic',
|
|
'model_name': 'claude-3-5-sonnet-20240620',
|
|
'api_key_env': 'ANTHROPIC_API_KEY',
|
|
},
|
|
'claude-3.5-sonnet-exp': {
|
|
'provider': 'anthropic',
|
|
'model_name': 'claude-3-5-sonnet-20241022',
|
|
'api_key_env': 'ANTHROPIC_API_KEY',
|
|
},
|
|
'claude-3.7-sonnet-exp': {
|
|
'provider': 'anthropic',
|
|
'model_name': 'claude-3-7-sonnet-20250219',
|
|
'api_key_env': 'ANTHROPIC_API_KEY',
|
|
},
|
|
# Deepseek (via OpenAI Compatible API)
|
|
'deepseek-reasoner': {
|
|
'provider': 'openai_compatible',
|
|
'model_name': 'deepseek-reasoner',
|
|
'base_url': 'https://api.deepseek.com/v1',
|
|
'api_key_env': 'DEEPSEEK_API_KEY',
|
|
},
|
|
'deepseek-chat': {
|
|
'provider': 'openai_compatible',
|
|
'model_name': 'deepseek-chat',
|
|
'base_url': 'https://api.deepseek.com/v1',
|
|
'api_key_env': 'DEEPSEEK_API_KEY',
|
|
},
|
|
# Google
|
|
'gemini-1.5-flash': {'provider': 'google', 'model_name': 'gemini-1.5-flash-latest', 'api_key_env': 'GEMINI_API_KEY'},
|
|
'gemini-2.0-flash-exp': {'provider': 'google', 'model_name': 'gemini-2.0-flash-exp', 'api_key_env': 'GEMINI_API_KEY'},
|
|
'gemini-2.5-pro': {'provider': 'google', 'model_name': 'gemini-2.5-pro-preview-03-25', 'api_key_env': 'GEMINI_API_KEY'},
|
|
# OpenAI
|
|
'gpt-4.1': {'provider': 'openai', 'model_name': 'gpt-4.1-2025-04-14', 'api_key_env': 'OPENAI_API_KEY'},
|
|
'gpt-4o': {'provider': 'openai', 'model_name': 'gpt-4o', 'api_key_env': 'OPENAI_API_KEY'},
|
|
'gpt-4o-mini': {'provider': 'openai', 'model_name': 'gpt-4o-mini', 'api_key_env': 'OPENAI_API_KEY'},
|
|
# X.ai (via OpenAI Compatible API)
|
|
'grok-2': {
|
|
'provider': 'openai_compatible',
|
|
'model_name': 'grok-2-1212',
|
|
'base_url': 'https://api.x.ai/v1',
|
|
'api_key_env': 'XAI_API_KEY',
|
|
},
|
|
'grok-3': {
|
|
'provider': 'openai_compatible',
|
|
'model_name': 'grok-3-beta',
|
|
'base_url': 'https://api.x.ai/v1',
|
|
'api_key_env': 'XAI_API_KEY',
|
|
},
|
|
}
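
# Adding another OpenAI-compatible endpoint only requires a new entry in the same shape,
# for example (hypothetical model name, URL, and env var):
#
#   'my-local-model': {
#       'provider': 'openai_compatible',
#       'model_name': 'my-local-model',
#       'base_url': 'http://localhost:8000/v1',
#       'api_key_env': 'LOCAL_API_KEY',
#   },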
|
|
|
|
|
|
def get_llm(model_name: str):
|
|
"""Instantiates the correct LangChain ChatModel based on the model name."""
|
|
if model_name not in SUPPORTED_MODELS:
|
|
raise ValueError(f'Unsupported model: {model_name}. Supported models are: {list(SUPPORTED_MODELS.keys())}')
|
|
|
|
config = SUPPORTED_MODELS[model_name]
|
|
provider = config['provider']
|
|
api_key_env = config.get('api_key_env')
|
|
api_key = os.getenv(api_key_env) if api_key_env else None
|
|
|
|
if not api_key and api_key_env:
|
|
logger.warning(
|
|
f'API key environment variable {api_key_env} not found or empty for model {model_name}. Trying without API key if possible.'
|
|
)
|
|
api_key = None
|
|
|
|
api_key_secret = SecretStr(api_key) if api_key else None
|
|
|
|
if provider == 'openai':
|
|
kwargs = {
|
|
'model': config['model_name'],
|
|
'temperature': 0.0,
|
|
}
|
|
if api_key_secret:
|
|
kwargs['api_key'] = api_key_secret
|
|
return ChatOpenAI(**kwargs)
|
|
elif provider == 'anthropic':
|
|
# Note: Anthropic client often uses env var ANTHROPIC_API_KEY directly if api_key=None
|
|
kwargs = {
|
|
'model_name': config['model_name'],
|
|
'temperature': 0.0,
|
|
'timeout': 100,
|
|
'stop': None,
|
|
}
|
|
if api_key_secret:
|
|
kwargs['api_key'] = api_key_secret
|
|
return ChatAnthropic(**kwargs)
|
|
elif provider == 'google':
|
|
# Note: Google client often uses env var GEMINI_API_KEY directly if api_key=None
|
|
kwargs = {
|
|
'model': config['model_name'],
|
|
'temperature': 0.0,
|
|
}
|
|
if api_key_secret:
|
|
kwargs['api_key'] = api_key_secret
|
|
return ChatGoogleGenerativeAI(**kwargs)
|
|
elif provider == 'openai_compatible':
|
|
# Note: OpenAI client often uses env var OPENAI_API_KEY directly if api_key=None and no base_url specified
|
|
# Providing base_url requires explicitly passing the key for that endpoint.
|
|
kwargs = {
|
|
'model': config['model_name'],
|
|
'base_url': config['base_url'],
|
|
'temperature': 0.0,
|
|
}
|
|
if api_key_secret:
|
|
kwargs['api_key'] = api_key_secret
|
|
# Ensure api_key is provided if base_url is set and key exists
|
|
elif config.get('base_url'):
|
|
# If base_url is present but key is missing, we might still error depending on the endpoint's auth requirements.
|
|
# Log a warning here, the constructor will likely raise an error if the key is truly required.
|
|
logger.warning(
|
|
f'API key for {model_name} at {config["base_url"]} is missing, but base_url is specified. Authentication may fail.'
|
|
)
|
|
return ChatOpenAI(**kwargs)
|
|
else:
|
|
raise ValueError(f'Unknown provider: {provider}')
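
# Usage sketch: `get_llm('gpt-4o')` returns a ChatOpenAI instance (temperature 0.0) using
# OPENAI_API_KEY from the environment; an unsupported model name raises ValueError.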
|
|
|
|
|
|
class Task:
|
|
def __init__(self, task_id, confirmed_task, website, reference_length, level):
|
|
self.task_id = task_id
|
|
self.confirmed_task = confirmed_task
|
|
self.website = website
|
|
self.reference_length = reference_length
|
|
self.level = level
|
|
|
|
def __str__(self):
|
|
return f'Task(task_id={self.task_id}, confirmed_task={self.confirmed_task}, website={self.website}, reference_length={self.reference_length}, level={self.level})'
|
|
|
|
def __repr__(self):
|
|
return self.__str__()
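
# Tasks are built from the fetched JSON via `Task(**task_data)`, so each fetched entry is
# expected to carry exactly these fields, roughly like (illustrative values):
#
#   {
#       "task_id": "abc123",
#       "confirmed_task": "Find the cheapest one-way flight from A to B",
#       "website": "https://example.com",
#       "reference_length": 7,
#       "level": "medium"
#   }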
|
|
|
|
|
|
class TaskTracker:
|
|
def __init__(self, task_id: str, task_text: str, run_id: str):
|
|
self.task_id = task_id
|
|
self.task_text = task_text
|
|
self.run_id = run_id
|
|
self.result_folder = Path(f'saved_trajectories/{task_id}')
|
|
self.trajectory_folder = self.result_folder / 'trajectory'
|
|
self.step_results = []
|
|
self.step_counter = 0
|
|
self.screenshots = []
|
|
self.setup_folders()
|
|
|
|
def setup_folders(self):
|
|
"""Create the necessary folder structure"""
|
|
self.result_folder.mkdir(parents=True, exist_ok=True)
|
|
self.trajectory_folder.mkdir(parents=True, exist_ok=True)
|
|
|
|
async def on_step_start(self, agent):
|
|
"""Record information at the start of a step"""
|
|
self.current_step = {'step_number': self.step_counter, 'start_time': datetime.now().isoformat(), 'actions': []}
|
|
|
|
async def on_step_end(self, agent):
|
|
"""Record information at the end of a step"""
|
|
# Take screenshot
|
|
browser_context = agent.browser_context
|
|
screenshot_b64 = await browser_context.take_screenshot()
|
|
screenshot_path = self.trajectory_folder / f'step_{self.step_counter}.png'
|
|
|
|
# Save screenshot to file
|
|
async with await anyio.open_file(screenshot_path, 'wb') as f:
|
|
await f.write(base64.b64decode(screenshot_b64))
|
|
|
|
# Save screenshot path
|
|
self.screenshots.append(str(screenshot_path))
|
|
|
|
# Record action and result
|
|
if agent.state.last_result:
|
|
for result in agent.state.last_result:
|
|
self.current_step['actions'].append(
|
|
{
|
|
'content': result.extracted_content,
|
|
'error': result.error,
|
|
'is_done': result.is_done,
|
|
'success': result.success,
|
|
}
|
|
)
|
|
|
|
# Record end time
|
|
self.current_step['end_time'] = datetime.now().isoformat()
|
|
self.current_step['screenshot_path'] = str(screenshot_path)
|
|
|
|
# Add to step results
|
|
self.step_results.append(self.current_step)
|
|
self.step_counter += 1
|
|
|
|
# Save intermediate results
|
|
self.save_results() # Save progress after each step
|
|
|
|
def save_results(self):
|
|
"""Save the consolidated results"""
|
|
# Create the final result object
|
|
|
|
# Ensure action history contains only strings, replacing None with "None"
|
|
action_history = []
|
|
for step in self.step_results:
|
|
if step['actions']:
|
|
content = step['actions'][-1]['content']
|
|
action_history.append(content if content is not None else 'None')
|
|
else:
|
|
action_history.append('None') # Handle steps with no actions
|
|
|
|
formatted_result = {
|
|
'task_id': self.task_id,
|
|
'run_id': self.run_id,
|
|
'task': self.task_text,
|
|
'steps': self.step_results,
|
|
'action_history': action_history, # Use the cleaned list
|
|
'screenshot_paths': self.screenshots,
|
|
            'final_result_response': (
                last_action['content']
                if (self.step_results and self.step_results[-1]['actions'] and (last_action := self.step_results[-1]['actions'][-1])['is_done'])
                else None
            ),
|
|
'self_report_completed': self.step_results[-1]['actions'][-1]['is_done']
|
|
if self.step_results and self.step_results[-1]['actions']
|
|
else False,
|
|
'self_report_success': self.step_results[-1]['actions'][-1]['success']
|
|
if self.step_results and self.step_results[-1]['actions']
|
|
else None,
|
|
}
|
|
|
|
# Save to file
|
|
with open(self.result_folder / 'result.json', 'w') as f:
|
|
json.dump(formatted_result, f, indent=2)
|
|
|
|
return formatted_result
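
# The saved result.json therefore has roughly this shape (illustrative values; `steps` holds the
# per-step records assembled in on_step_end):
#
#   {
#       "task_id": "...",
#       "run_id": "...",
#       "task": "...",
#       "steps": [...],
#       "action_history": ["...", "None"],
#       "screenshot_paths": ["saved_trajectories/<task_id>/trajectory/step_0.png"],
#       "final_result_response": "... or null",
#       "self_report_completed": "true or false",
#       "self_report_success": "true, false, or null"
#   }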
|
|
|
|
|
|
async def run_agent_with_tracing(
|
|
task: Task, llm: BaseChatModel, run_id: str, browser: Browser | None = None, max_steps: int = 25, use_vision: bool = True
|
|
):
|
|
try:
|
|
# Create task tracker
|
|
tracker = TaskTracker(task.task_id, task.confirmed_task, run_id)
|
|
|
|
        owns_browser = browser is None  # remember whether we created the browser ourselves
        browser = browser or Browser()
|
|
|
|
agent = Agent(
|
|
task=task.confirmed_task,
|
|
llm=llm,
|
|
browser=browser,
|
|
use_vision=use_vision,
|
|
source='eval_platform', # Override source detection
|
|
)
|
|
|
|
# Pass our hook functions
|
|
result = await agent.run(max_steps=max_steps, on_step_start=tracker.on_step_start, on_step_end=tracker.on_step_end)
|
|
|
|
# Save final results
|
|
final_results = tracker.save_results()
|
|
|
|
return result
|
|
    finally:
        # Ensure proper cleanup
        await asyncio.sleep(0.1)  # Give a moment for any pending tasks to complete
        if owns_browser:
            await agent.close()  # This closes the browser only when we created it above
|
|
|
|
|
|
async def judge_task_result(model, task_folder: Path, score_threshold: float = 3) -> dict:
|
|
"""
|
|
Judge a single task result based on the success value of the final action.
|
|
|
|
Args:
|
|
task_folder: Path to the task result folder
|
|
|
|
Returns:
|
|
Dictionary containing judgment results
|
|
"""
|
|
result_file = task_folder / 'result.json'
|
|
if not result_file.exists():
|
|
return {'task_id': task_folder.name, 'judgement': None, 'success': False, 'error': 'No result.json found', 'score': 0.0}
|
|
|
|
try:
|
|
async with await anyio.open_file(result_file) as f:
|
|
result = json.loads(await f.read())
|
|
|
|
        # If an Online_Mind2Web_evaluation is already saved, we can skip the eval
|
|
if result.get('Online_Mind2Web_evaluation'):
|
|
return result.get('Online_Mind2Web_evaluation')
|
|
|
|
# Get the screenshot paths, task description, and action history
|
|
screenshot_paths = result.get('screenshot_paths', [])
|
|
task_description = result.get('task')
|
|
action_history = result.get('action_history', [])
|
|
|
|
# Use the retry wrapper for evaluation
|
|
try:
|
|
# Await the async function directly instead of using asyncio.run()
|
|
eval_result = await Online_Mind2Web_eval_with_retry(
|
|
task_description, action_history, screenshot_paths, model, score_threshold
|
|
)
|
|
|
|
if eval_result is None:
|
|
raise Exception('Evaluation failed after all retries')
|
|
|
|
messages, text, system_msg, record, key_points = eval_result
|
|
|
|
# Final steps to get judgement - run invoke in a thread
|
|
judgement_msg = await asyncio.to_thread(model.invoke, messages)
|
|
judgement = judgement_msg.content
|
|
|
|
if 'success' in judgement.lower().split('status:')[1]: # This is the official criteria for success
|
|
evaluation = {'task_id': task_folder.name, 'judgement': judgement, 'success': True, 'error': None, 'score': 1.0}
|
|
else: # This is the official criteria for failure
|
|
evaluation = {'task_id': task_folder.name, 'judgement': judgement, 'success': False, 'error': None, 'score': 0.0}
|
|
|
|
# Save the Online_Mind2Web_evaluation into the result.json file
|
|
result['Online_Mind2Web_evaluation'] = evaluation
|
|
            async with await anyio.open_file(result_file, 'w') as f:
|
|
await f.write(json.dumps(result, indent=2))
|
|
|
|
return evaluation
|
|
|
|
except Exception as err:
|
|
return {
|
|
'task_id': task_folder.name,
|
|
'judgement': None,
|
|
'success': False,
|
|
'error': f'{type(err).__name__}: {err}',
|
|
'score': 0.0,
|
|
}
|
|
|
|
except Exception as err:
|
|
return {
|
|
'task_id': task_folder.name,
|
|
'judgement': None,
|
|
'success': False,
|
|
'error': f'{type(err).__name__}: {err}',
|
|
'score': 0.0,
|
|
}
|
|
|
|
|
|
def calculate_local_summary(results_dir: str | None = None) -> dict:
|
|
"""
|
|
Calculates a summary of task results by reading the saved result.json files.
|
|
Does not make any network requests.
|
|
|
|
Args:
|
|
results_dir: Directory where task results are stored (default: 'saved_trajectories')
|
|
|
|
Returns:
|
|
Dictionary containing total_tasks, successful_tasks, success_rate, and average_score
|
|
"""
|
|
if results_dir is None:
|
|
results_dir = 'saved_trajectories'
|
|
|
|
path = Path(results_dir)
|
|
if not path.is_dir():
|
|
logger.warning(f'Results directory {results_dir} does not exist')
|
|
return {
|
|
'timestamp': datetime.now().isoformat(),
|
|
'total_tasks': 0,
|
|
'successful_tasks': 0,
|
|
'failed_tasks': 0,
|
|
'success_rate': 0,
|
|
'average_score': 0,
|
|
}
|
|
|
|
# Collect all task folders
|
|
task_folders = [f for f in path.iterdir() if f.is_dir()]
|
|
total_tasks = len(task_folders)
|
|
successful_tasks = 0
|
|
total_score = 0.0
|
|
results_with_score = 0
|
|
|
|
for folder in task_folders:
|
|
result_file = folder / 'result.json'
|
|
if result_file.exists():
|
|
try:
|
|
with open(result_file) as f:
|
|
result_data = json.load(f)
|
|
|
|
# Look for evaluation data
|
|
evaluation = result_data.get('Online_Mind2Web_evaluation', {})
|
|
if evaluation:
|
|
if evaluation.get('success', False):
|
|
successful_tasks += 1
|
|
|
|
score = evaluation.get('score', 0.0)
|
|
if score > 0:
|
|
total_score += score
|
|
results_with_score += 1
|
|
except Exception as e:
|
|
logger.error(f'Error reading result file {result_file}: {type(e).__name__}: {e}')
|
|
|
|
# Calculate statistics
|
|
failed_tasks = total_tasks - successful_tasks
|
|
success_rate = successful_tasks / total_tasks if total_tasks > 0 else 0
|
|
average_score = total_score / results_with_score if results_with_score > 0 else 0
|
|
|
|
return {
|
|
'timestamp': datetime.now().isoformat(),
|
|
'total_tasks': total_tasks,
|
|
'successful_tasks': successful_tasks,
|
|
'failed_tasks': failed_tasks,
|
|
'success_rate': success_rate,
|
|
'average_score': average_score,
|
|
}
|
|
|
|
|
|
async def run_task_with_semaphore(
|
|
task: Task,
|
|
run_id: str,
|
|
convex_url: str,
|
|
secret_key: str,
|
|
eval_model: BaseChatModel,
|
|
llm: BaseChatModel,
|
|
max_steps_per_task: int,
|
|
headless: bool,
|
|
use_vision: bool,
|
|
semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument
|
|
) -> dict:
|
|
"""Run a single task with semaphore, sequential execution, and robust error handling"""
|
|
# Acquire semaphore before starting any task-specific logic
|
|
async with semaphore_runs:
|
|
# --- Initialize State & Payload ---
|
|
task_folder = Path(f'saved_trajectories/{task.task_id}')
|
|
result_file = task_folder / 'result.json'
|
|
|
|
# Flags to track progress and errors
|
|
execution_needed = True
|
|
execution_succeeded = False
|
|
evaluation_needed = True
|
|
evaluation_succeeded = True # Default to True, set to False if eval is needed but fails
|
|
local_processing_error = None
|
|
|
|
# Initialize the payload with basic info and default failure/unevaluated states
|
|
server_payload = {
|
|
'runId': run_id,
|
|
'taskId': task.task_id,
|
|
'task': task.confirmed_task,
|
|
'actionHistory': [],
|
|
'finalResultResponse': 'None', # Default if execution doesn't happen or fails early
|
|
'selfReportCompleted': False,
|
|
'selfReportSuccess': None,
|
|
'onlineMind2WebEvaluationJudgement': 'Not Attempted',
|
|
'onlineMind2WebEvaluationError': None,
|
|
'onlineMind2WebEvaluationSuccess': False,
|
|
'onlineMind2WebEvaluationScore': 0.0,
|
|
}
|
|
|
|
# Initialize the return value for local processing status
|
|
local_task_status = {'task_id': task.task_id, 'success': False, 'error': None}
|
|
|
|
# --- Main Sequential Logic with Error Handling ---
|
|
try:
|
|
# 1. Check for Existing Result
|
|
if result_file.exists():
|
|
logger.info(f'Task {task.task_id}: Found existing result file.')
|
|
try:
|
|
                    async with await anyio.open_file(result_file) as f:
|
|
existing_result = json.loads(await f.read())
|
|
|
|
# Populate payload from existing file
|
|
server_payload['actionHistory'] = existing_result.get('action_history', [])
|
|
server_payload['finalResultResponse'] = existing_result.get('final_result_response', 'None')
|
|
server_payload['selfReportCompleted'] = existing_result.get('self_report_completed', False)
|
|
server_payload['selfReportSuccess'] = existing_result.get('self_report_success', None)
|
|
|
|
# Check if evaluation data is also present
|
|
if existing_eval := existing_result.get('Online_Mind2Web_evaluation'):
|
|
logger.info(f'Task {task.task_id}: Found existing evaluation data.')
|
|
# Ensure judgement is stored as string "None" if it was null/None in cache
|
|
cached_judgement = existing_eval.get('judgement')
|
|
server_payload['onlineMind2WebEvaluationJudgement'] = (
|
|
cached_judgement if cached_judgement is not None else 'None'
|
|
)
|
|
server_payload['onlineMind2WebEvaluationError'] = existing_eval.get('error')
|
|
server_payload['onlineMind2WebEvaluationSuccess'] = existing_eval.get('success', False)
|
|
server_payload['onlineMind2WebEvaluationScore'] = existing_eval.get('score', 0.0)
|
|
evaluation_needed = False # Don't re-evaluate if already present
|
|
evaluation_succeeded = True # Assume cached evaluation was successful
|
|
else:
|
|
# Evaluation not found, needs to run
|
|
evaluation_needed = True
|
|
evaluation_succeeded = False # Mark as needing evaluation initially
|
|
|
|
execution_needed = False # Don't execute if result exists
|
|
execution_succeeded = True # Mark as "success" in terms of having data
|
|
logger.info(f'Task {task.task_id}: Successfully loaded existing result. Skipping execution.')
|
|
|
|
except Exception as e:
|
|
logger.warning(
|
|
f'Task {task.task_id}: Error reading existing result file {result_file}: {type(e).__name__}: {str(e)}. Proceeding with execution.'
|
|
)
|
|
# Keep execution_needed = True, payload defaults remain
|
|
execution_needed = True
|
|
execution_succeeded = False
|
|
evaluation_needed = True # Might need eval after execution
|
|
evaluation_succeeded = False # Reset eval status
|
|
|
|
# 2. Execute Task (if needed)
|
|
if execution_needed:
|
|
logger.info(f'Task {task.task_id}: Starting execution.')
|
|
browser = None # Ensure browser is defined for finally block
|
|
try:
|
|
browserConfig = BrowserConfig(headless=headless)
|
|
browser = Browser(config=browserConfig)
|
|
# Pass the llm to run_agent_with_tracing
|
|
result = await run_agent_with_tracing(
|
|
task=task,
|
|
llm=llm,
|
|
browser=browser,
|
|
max_steps=max_steps_per_task,
|
|
use_vision=use_vision,
|
|
run_id=run_id, # run_agent_with_tracing handles saving result.json
|
|
)
|
|
logger.info(f'Task {task.task_id}: Execution completed.')
|
|
execution_succeeded = True
|
|
evaluation_needed = True # Need to evaluate the new result
|
|
evaluation_succeeded = False # Reset eval status
|
|
|
|
# Load the result file that should have just been created
|
|
if result_file.exists():
|
|
async with await anyio.open_file(result_file) as f:
|
|
run_result_data = json.loads(await f.read())
|
|
server_payload['actionHistory'] = run_result_data.get('action_history', [])
|
|
server_payload['finalResultResponse'] = run_result_data.get('final_result_response', 'None')
|
|
server_payload['selfReportCompleted'] = run_result_data.get('self_report_completed', False)
|
|
server_payload['selfReportSuccess'] = run_result_data.get('self_report_success', None)
|
|
else:
|
|
# This is unexpected if run_agent_with_tracing succeeded
|
|
logger.error(
|
|
f'Task {task.task_id}: Result file {result_file} missing after presumed successful execution.'
|
|
)
|
|
raise FileNotFoundError(f'Result file not found after execution for task {task.task_id}')
|
|
|
|
except Exception as e:
|
|
logger.error(
|
|
f'Task {task.task_id}: Error during execution with Type: {type(e).__name__} and Message: {str(e)}',
|
|
exc_info=True,
|
|
) # Add stack trace
|
|
execution_succeeded = False
|
|
evaluation_needed = False # Cannot evaluate if execution failed
|
|
evaluation_succeeded = False # Evaluation definitely didn't succeed
|
|
# Update payload to reflect execution failure
|
|
server_payload['finalResultResponse'] = f'Execution Error: {type(e).__name__}: {str(e)}'
|
|
server_payload['onlineMind2WebEvaluationJudgement'] = 'Execution Failed'
|
|
server_payload['onlineMind2WebEvaluationError'] = f'Execution Error: {type(e).__name__}'
|
|
finally:
|
|
if browser:
|
|
try:
|
|
await browser.close()
|
|
except Exception as browser_close_e:
|
|
logger.warning(
|
|
f'Task {task.task_id}: Error closing browser: {type(browser_close_e).__name__}: {browser_close_e}'
|
|
)
|
|
|
|
# 3. Evaluate Task (if needed and possible)
|
|
if evaluation_needed and execution_succeeded:
|
|
logger.info(f'Task {task.task_id}: Starting evaluation.')
|
|
try:
|
|
# judge_task_result will attempt evaluation and save it back into result.json if successful
|
|
evaluation = await judge_task_result(eval_model, task_folder, score_threshold=3)
|
|
|
|
# Update payload directly from the evaluation function's return value
|
|
if evaluation:
|
|
# Ensure judgement is stored as string "None" if the evaluation returned None
|
|
judgement_value = evaluation.get('judgement')
|
|
server_payload['onlineMind2WebEvaluationJudgement'] = (
|
|
judgement_value if judgement_value is not None else 'None'
|
|
)
|
|
server_payload['onlineMind2WebEvaluationError'] = evaluation.get('error')
|
|
server_payload['onlineMind2WebEvaluationSuccess'] = evaluation.get('success', False)
|
|
server_payload['onlineMind2WebEvaluationScore'] = evaluation.get('score', 0.0)
|
|
# Mark evaluation as succeeded only if the evaluation itself didn't report an error
|
|
if evaluation.get('error'):
|
|
logger.warning(
|
|
f'Task {task.task_id}: Evaluation completed but reported an error: {evaluation.get("error")}'
|
|
)
|
|
evaluation_succeeded = False
|
|
else:
|
|
evaluation_succeeded = True # Mark evaluation as successfully completed
|
|
logger.info(f'Task {task.task_id}: Evaluation successfully completed.')
|
|
|
|
else:
|
|
# Should not happen based on judge_task_result structure, but handle defensively
|
|
logger.error(f'Task {task.task_id}: Evaluation function returned None.')
|
|
evaluation_succeeded = False # Mark as failed if None returned
|
|
server_payload['onlineMind2WebEvaluationJudgement'] = 'Evaluation Returned None'
|
|
server_payload['onlineMind2WebEvaluationError'] = 'Evaluation function returned None'
|
|
|
|
except Exception as e:
|
|
logger.error(
|
|
f'Task {task.task_id}: Error during evaluation process: {type(e).__name__}: {str(e)}', exc_info=True
|
|
) # Add stack trace
|
|
evaluation_succeeded = False
|
|
# Update payload to reflect evaluation failure
|
|
server_payload['onlineMind2WebEvaluationJudgement'] = 'Evaluation Process Error'
|
|
server_payload['onlineMind2WebEvaluationError'] = f'Evaluation Error: {type(e).__name__}: {str(e)}'
|
|
# Keep Success/Score as False/0.0 from defaults
|
|
|
|
except Exception as outer_e:
|
|
# Catch any unexpected errors in the flow above (e.g., reading existing file, setup issues)
|
|
logger.critical(f'Task {task.task_id}: CRITICAL UNHANDLED ERROR during processing: {str(outer_e)}', exc_info=True)
|
|
local_processing_error = f'Critical flow error: {str(outer_e)}'
|
|
# Ensure payload reflects a critical failure state
|
|
server_payload['finalResultResponse'] = f'Critical Error: {str(outer_e)}'
|
|
server_payload['onlineMind2WebEvaluationJudgement'] = 'Critical System Error'
|
|
server_payload['onlineMind2WebEvaluationError'] = local_processing_error
|
|
server_payload['onlineMind2WebEvaluationSuccess'] = False
|
|
server_payload['onlineMind2WebEvaluationScore'] = 0.0
|
|
execution_succeeded = False # Mark stages as failed due to outer error
|
|
evaluation_succeeded = False
|
|
|
|
# --- Final Step: Save to Server (Always Attempt) ---
|
|
logger.info(f'Task {task.task_id}: Attempting to save final result to server...')
|
|
try:
|
|
save_success = save_task_result_to_server(convex_url, secret_key, server_payload)
|
|
if save_success:
|
|
logger.info(f'Task {task.task_id}: Successfully saved result to server.')
|
|
else:
|
|
logger.warning(f'Task {task.task_id}: Failed to save result to server (API issue or invalid payload).')
|
|
# Optionally accumulate this failure into local_processing_error
|
|
if local_processing_error:
|
|
local_processing_error += '; Server save failed'
|
|
else:
|
|
local_processing_error = 'Server save failed'
|
|
|
|
except Exception as e:
|
|
logger.error(f'Task {task.task_id}: Exception during attempt to save result to server: {type(e).__name__}: {str(e)}')
|
|
# Optionally accumulate this failure
|
|
if local_processing_error:
|
|
local_processing_error += f'; Server save exception: {str(e)}'
|
|
else:
|
|
local_processing_error = f'Server save exception: {str(e)}'
|
|
|
|
# --- Return Local Processing Status ---
|
|
# Overall success requires successful execution (or loading existing) AND successful evaluation (if needed).
|
|
local_task_status['success'] = execution_succeeded and evaluation_succeeded
|
|
local_task_status['error'] = local_processing_error # Report any accumulated local errors
|
|
|
|
return local_task_status
|
|
|
|
|
|
async def run_multiple_tasks(
|
|
tasks: list[Task],
|
|
llm: BaseChatModel,
|
|
run_id: str,
|
|
convex_url: str,
|
|
secret_key: str,
|
|
eval_model: BaseChatModel,
|
|
max_parallel_runs: int = 3,
|
|
max_parallel_evaluations: int = 5,
|
|
max_steps_per_task: int = 25,
|
|
start_index: int = 0,
|
|
end_index: int | None = None,
|
|
headless: bool = False,
|
|
use_vision: bool = True,
|
|
fresh_start: bool = True,
|
|
) -> dict:
|
|
"""
|
|
Run multiple tasks in parallel and evaluate results.
|
|
"""
|
|
semaphore_runs = asyncio.Semaphore(max_parallel_runs)
|
|
tasks_to_run = tasks[start_index:end_index] if end_index else tasks[start_index:]
|
|
|
|
# Run all tasks in parallel with additional parameters
|
|
task_results = await asyncio.gather(
|
|
*(
|
|
run_task_with_semaphore(
|
|
task=task,
|
|
run_id=run_id,
|
|
convex_url=convex_url,
|
|
secret_key=secret_key,
|
|
eval_model=eval_model,
|
|
llm=llm, # Pass the agent LLM
|
|
max_steps_per_task=max_steps_per_task,
|
|
headless=headless,
|
|
use_vision=use_vision,
|
|
semaphore_runs=semaphore_runs, # Pass the semaphore
|
|
)
|
|
for task in tasks_to_run
|
|
)
|
|
)
|
|
|
|
# After all tasks are complete, calculate a local summary
|
|
logger.info('All tasks completed. Calculating result summary...')
|
|
summary = calculate_local_summary()
|
|
|
|
# Log the summary statistics
|
|
logger.info(f'Completed {summary["total_tasks"]} tasks')
|
|
logger.info(f'Success rate: {summary["success_rate"]:.2%}')
|
|
logger.info(f'Average score: {summary["average_score"]:.2f}')
|
|
|
|
return {'task_results': task_results, 'summary': summary}
|
|
|
|
|
|
# Helper function to fetch tasks from the server
|
|
def fetch_tasks_from_server(convex_url: str, secret_key: str, test_case_name: str):
|
|
"""Fetches the specified test case file from the Convex HTTP endpoint."""
|
|
|
|
if not convex_url:
|
|
logger.error('Error: EVALUATION_TOOL_URL environment variable not set.')
|
|
return None
|
|
|
|
if not secret_key:
|
|
logger.error('Error: EVALUATION_TOOL_SECRET_KEY environment variable not set.')
|
|
return None
|
|
|
|
endpoint_url = f'{convex_url}/api/getTestCase'
|
|
headers = {
|
|
'Authorization': f'Bearer {secret_key}',
|
|
'Content-Type': 'application/json',
|
|
}
|
|
payload = {'name': test_case_name}
|
|
|
|
logger.info(f"Fetching test case '{test_case_name}' from {endpoint_url}...")
|
|
|
|
try:
|
|
response = requests.post(endpoint_url, headers=headers, json=payload)
|
|
|
|
logger.info(f'Fetch Status Code: {response.status_code}')
|
|
|
|
if response.status_code == 200:
|
|
try:
|
|
data = response.json()
|
|
logger.info(f"Successfully fetched test case data for '{test_case_name}'.")
|
|
# Assuming the data is the list of tasks
|
|
if isinstance(data, list):
|
|
return data
|
|
else:
|
|
logger.error(f'Error: Fetched data is not a list. Type: {type(data)}')
|
|
logger.error(f'Raw response: {response.text}')
|
|
return None
|
|
|
|
except json.JSONDecodeError:
|
|
logger.error('Error: Failed to decode JSON response.')
|
|
logger.error(f'Raw response text: {response.text}')
|
|
return None
|
|
else:
|
|
logger.error(f"Error: Failed to fetch test case '{test_case_name}'. Status: {response.status_code}")
|
|
logger.error(f'Response: {response.text}')
|
|
return None
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
logger.error(f'Error during request to fetch test case: {type(e).__name__}: {e}')
|
|
return None
|
|
|
|
|
|
# Helper function to get git information
|
|
def get_git_info():
|
|
"""Retrieves git branch, commit hash, and commit timestamp using subprocess."""
|
|
try:
|
|
branch = subprocess.run(
|
|
['git', 'rev-parse', '--abbrev-ref', 'HEAD'], capture_output=True, text=True, check=True
|
|
).stdout.strip()
|
|
commit_hash = subprocess.run(['git', 'rev-parse', 'HEAD'], capture_output=True, text=True, check=True).stdout.strip()
|
|
# Get commit timestamp as Unix epoch integer
|
|
commit_timestamp_str = subprocess.run(
|
|
['git', 'log', '-1', '--format=%ct'], capture_output=True, text=True, check=True
|
|
).stdout.strip()
|
|
commit_timestamp = int(commit_timestamp_str)
|
|
return {'branch': branch, 'hash': commit_hash, 'timestamp': commit_timestamp}
|
|
except (subprocess.CalledProcessError, FileNotFoundError, ValueError) as e:
|
|
logger.warning(f'Could not retrieve git info: {type(e).__name__}: {e}. Using defaults.')
|
|
return {
|
|
'branch': 'unknown',
|
|
'hash': 'unknown',
|
|
'timestamp': int(time.time()), # Fallback to current time
|
|
}
|
|
|
|
|
|
# Helper function to start a new run on the server
|
|
def start_new_run(convex_url: str, secret_key: str, run_details: dict):
|
|
"""Sends a request to start a new evaluation run and returns the run ID."""
|
|
if not convex_url or not secret_key:
|
|
logger.error('Error: Convex URL or Secret Key not provided for starting run.')
|
|
return None
|
|
|
|
endpoint_url = f'{convex_url}/api/startRun'
|
|
headers = {
|
|
'Authorization': f'Bearer {secret_key}',
|
|
'Content-Type': 'application/json',
|
|
}
|
|
|
|
logger.info(f'Sending request to start run at {endpoint_url}...')
|
|
# Avoid logging secret key in run_details if it were ever passed
|
|
loggable_details = {k: v for k, v in run_details.items() if k != 'secret_key'}
|
|
logger.info(f'Run details: {json.dumps(loggable_details, indent=2)}')
|
|
|
|
try:
|
|
response = requests.post(endpoint_url, headers=headers, json=run_details)
|
|
logger.info(f'Start Run Status Code: {response.status_code}')
|
|
|
|
if response.status_code == 200:
|
|
try:
|
|
data = response.json()
|
|
run_id = data.get('runId')
|
|
if run_id:
|
|
logger.info(f'Successfully started run. Run ID: {run_id}')
|
|
return run_id
|
|
else:
|
|
logger.error("Error: 'runId' not found in successful startRun response.")
|
|
logger.error(f'Raw response: {response.text}')
|
|
return None
|
|
except json.JSONDecodeError:
|
|
logger.error('Error: Failed to decode startRun JSON response.')
|
|
logger.error(f'Raw response text: {response.text}')
|
|
return None
|
|
else:
|
|
logger.error('Error: Failed to start run.')
|
|
logger.error(f'Response: {response.text}')
|
|
return None
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
logger.error(f'Error during startRun request: {type(e).__name__}: {e}')
|
|
return None
|
|
|
|
|
|
# Helper function to save a task result to the server
|
|
def save_task_result_to_server(convex_url: str, secret_key: str, result_details: dict):
|
|
"""Sends a request to save a single task result to the Convex backend."""
|
|
|
|
if not convex_url:
|
|
logger.error('Error: EVALUATION_TOOL_URL environment variable not set for saving task result.')
|
|
return False
|
|
|
|
if not secret_key:
|
|
logger.error('Error: EVALUATION_TOOL_SECRET_KEY environment variable not set for saving task result.')
|
|
return False
|
|
|
|
# Ensure runId is present in the details being sent
|
|
if 'runId' not in result_details or not result_details['runId']:
|
|
logger.error("Error: 'runId' is missing or empty in result_details for saveTaskResult.")
|
|
return False
|
|
|
|
endpoint_url = f'{convex_url}/api/saveTaskResult'
|
|
headers = {
|
|
'Authorization': f'Bearer {secret_key}',
|
|
'Content-Type': 'application/json',
|
|
}
|
|
|
|
logger.info(f'Sending request to save task result at {endpoint_url}...')
|
|
logger.debug(f'Result details payload: {json.dumps(result_details, indent=2)}') # Log details at debug level
|
|
|
|
try:
|
|
response = requests.post(endpoint_url, headers=headers, json=result_details)
|
|
|
|
logger.info(f'Save Task Result Status Code: {response.status_code}')
|
|
|
|
if response.status_code == 200:
|
|
try:
|
|
data = response.json()
|
|
logger.info(f'Successfully saved task result: {data.get("message")}')
|
|
logger.info(f'Result ID: {data.get("resultId")}')
|
|
return True
|
|
except json.JSONDecodeError:
|
|
logger.error('Error: Failed to decode saveTaskResult JSON response.')
|
|
logger.error(f'Raw response text: {response.text}')
|
|
return False
|
|
else:
|
|
logger.error('Error: Failed to save task result.')
|
|
logger.error(f'Response: {response.text}')
|
|
return False
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
logger.error(f'Error during saveTaskResult request: {type(e).__name__}: {e}')
|
|
return False
|
|
|
|
|
|
if __name__ == '__main__':
|
|
parser = argparse.ArgumentParser(description='Run and evaluate browser automation tasks')
|
|
parser.add_argument('--parallel_runs', type=int, default=3, help='Number of parallel tasks to run')
|
|
parser.add_argument('--parallel_evaluations', type=int, default=5, help='Number of parallel evaluations to run')
|
|
parser.add_argument('--max-steps', type=int, default=25, help='Maximum steps per task')
|
|
parser.add_argument('--start', type=int, default=0, help='Start index')
|
|
parser.add_argument('--end', type=int, default=None, help='End index (exclusive)')
|
|
parser.add_argument('--headless', action='store_true', help='Run in headless mode')
|
|
parser.add_argument('--evaluate-only', action='store_true', help='Only evaluate existing results without running new tasks')
|
|
parser.add_argument(
|
|
'--model', type=str, default='gpt-4o', choices=list(SUPPORTED_MODELS.keys()), help='Model to use for the agent'
|
|
)
|
|
parser.add_argument('--no-vision', action='store_true', help='Disable vision capabilities in the agent')
|
|
parser.add_argument(
|
|
'--fresh-start',
|
|
type=lambda x: (str(x).lower() == 'true'),
|
|
default=True,
|
|
help='Clear saved_trajectories before starting. Set to False to keep existing trajectories (default: True)',
|
|
)
|
|
parser.add_argument('--user-message', type=str, default='', help='User message to include in the run')
|
|
args = parser.parse_args()
|
|
|
|
# Set up logging - Make sure logger is configured before use in fetch function
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
logger = logging.getLogger(__name__) # Define logger for the module
|
|
|
|
if args.evaluate_only:
|
|
# Just evaluate existing results
|
|
logger.info('Evaluating existing results...')
|
|
summary = calculate_local_summary()
|
|
|
|
# Save evaluation results
|
|
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
|
eval_file = f'saved_trajectories/evaluation_summary_{timestamp}.json'
|
|
with open(eval_file, 'w') as f:
|
|
json.dump(summary, f, indent=2)
|
|
|
|
logger.info(f'Evaluation complete. Success rate: {summary["success_rate"]:.2%}')
|
|
logger.info(f'Average score: {summary["average_score"]:.2f}')
|
|
logger.info(f'Full results saved to {eval_file}')
|
|
|
|
else:
|
|
logger.info('Running tasks...')
|
|
# Run tasks and evaluate
|
|
load_dotenv()
|
|
|
|
# --- Clear trajectories if fresh_start is True ---
|
|
results_dir_path = Path('saved_trajectories')
|
|
if args.fresh_start:
|
|
logger.info(f'--fresh-start is True. Clearing {results_dir_path}...')
|
|
if results_dir_path.exists():
|
|
try:
|
|
shutil.rmtree(results_dir_path)
|
|
logger.info(f'Successfully removed {results_dir_path}.')
|
|
except OSError as e:
|
|
logger.error(f'Error removing directory {results_dir_path}: {type(e).__name__}: {e}')
|
|
# Decide if you want to exit or continue
|
|
# exit(1) # Uncomment to exit on error
|
|
else:
|
|
logger.info(f'{results_dir_path} does not exist, no need to clear.')
|
|
|
|
# Recreate the directory
|
|
try:
|
|
results_dir_path.mkdir(parents=True, exist_ok=True)
|
|
logger.info(f'Recreated directory {results_dir_path}.')
|
|
except OSError as e:
|
|
logger.error(f'Error creating directory {results_dir_path}: {type(e).__name__}: {e}')
|
|
# exit(1) # Uncomment to exit on error
|
|
else:
|
|
logger.info('--fresh-start is False. Existing trajectories in saved_trajectories will be kept.')
|
|
# -------------------------------------------------
|
|
|
|
# --- Fetch Tasks from Server ---
|
|
CONVEX_URL = os.getenv('EVALUATION_TOOL_URL')
|
|
SECRET_KEY = os.getenv('EVALUATION_TOOL_SECRET_KEY')
|
|
TEST_CASE_NAME = 'OnlineMind2Web' # Name of the test case to fetch
|
|
|
|
if not CONVEX_URL or not SECRET_KEY:
|
|
logger.error('Error: EVALUATION_TOOL_URL or EVALUATION_TOOL_SECRET_KEY environment variables not set.')
|
|
exit(1) # Exit if config is missing
|
|
|
|
logger.info(f"Attempting to fetch task list '{TEST_CASE_NAME}' from server...")
|
|
fetched_task_data = fetch_tasks_from_server(CONVEX_URL, SECRET_KEY, TEST_CASE_NAME)
|
|
|
|
if fetched_task_data is None:
|
|
logger.error('Failed to fetch tasks from the server. Exiting.')
|
|
exit(1) # Exit if fetch fails
|
|
|
|
try:
|
|
tasks = [Task(**task_data) for task_data in fetched_task_data]
|
|
logger.info(f'Successfully loaded {len(tasks)} tasks from the server.')
|
|
except TypeError as e:
|
|
logger.error(
|
|
f'Error creating Task objects from fetched data. Ensure the data structure matches Task requirements (task_id, confirmed_task, etc.). Error: {type(e).__name__}: {e}'
|
|
)
|
|
logger.error(f'First item in fetched data: {fetched_task_data[0] if fetched_task_data else "None"}')
|
|
exit(1)
|
|
# -----------------------------
|
|
|
|
# --- Start Run on Server ---
|
|
logger.info('Attempting to start a new run on the server...')
|
|
git_info = get_git_info()
|
|
|
|
# Collect additional data from args to store with the run
|
|
additional_run_data = {
|
|
'max_steps': args.max_steps,
|
|
'parallel_runs': args.parallel_runs,
|
|
'parallel_evaluations': args.parallel_evaluations,
|
|
'start_index': args.start,
|
|
'end_index': args.end,
|
|
'headless': args.headless,
|
|
'use_vision': not args.no_vision,
|
|
'task_source': TEST_CASE_NAME,
|
|
}
|
|
|
|
run_data = {
|
|
'model': args.model,
|
|
'gitBranch': git_info['branch'],
|
|
'gitCommitHash': git_info['hash'],
|
|
'gitCommitTimestamp': git_info['timestamp'],
|
|
'userMessage': args.user_message,
|
|
            'totalTasks': (args.end if args.end is not None else len(tasks)) - args.start,
|
|
'additionalData': additional_run_data,
|
|
}
|
|
|
|
run_id = start_new_run(CONVEX_URL, SECRET_KEY, run_data)
|
|
|
|
if not run_id:
|
|
logger.error('Failed to start a new run on the server. Exiting.')
|
|
exit(1)
|
|
|
|
logger.info(f'Successfully obtained run ID: {run_id}. Proceeding with tasks...')
|
|
# -------------------------
|
|
|
|
# Get the selected LLM
|
|
llm = get_llm(args.model)
|
|
|
|
results = asyncio.run(
|
|
run_multiple_tasks(
|
|
tasks=tasks,
|
|
llm=llm, # Pass the instantiated llm
|
|
run_id=run_id,
|
|
convex_url=CONVEX_URL,
|
|
secret_key=SECRET_KEY,
|
|
eval_model=llm,
|
|
max_parallel_runs=args.parallel_runs,
|
|
max_parallel_evaluations=args.parallel_evaluations,
|
|
max_steps_per_task=args.max_steps,
|
|
start_index=args.start,
|
|
end_index=args.end,
|
|
headless=args.headless,
|
|
use_vision=not args.no_vision,
|
|
fresh_start=args.fresh_start,
|
|
)
|
|
)
|
|
|
|
logger.info('Task completed. Saving results...')
|
|
# Save results
|
|
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
|
results_file = f'saved_trajectories/eval_results_{timestamp}.json'
|
|
|
|
# Convert results to JSON-serializable format
|
|
serializable_results = {'summary': results['summary']}
|
|
|
|
with open(results_file, 'w') as f:
|
|
json.dump(serializable_results, f, indent=2)
|
|
|
|
# Print summary
|
|
summary = results['summary']
|
|
logger.info(f'Completed {summary["total_tasks"]} tasks.')
|
|
logger.info(f'Success rate: {summary["success_rate"]:.2%}')
|
|
logger.info(f'Average score: {summary["average_score"]:.2f}')
|
|
logger.info(f'Results saved to {results_file}')
|