refactor run task function and exception handling to be more elegant, and add fix for gpt-o4-mini

This commit is contained in:
Alezander9
2025-05-22 16:42:28 -07:00
parent b8c453ee47
commit c2d1ec50dc
2 changed files with 1928 additions and 320 deletions

1546
eval/service copy.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -446,6 +446,9 @@ def get_llm(model_name: str):
return ChatGoogleGenerativeAI(**kwargs)
case 'openai_compatible':
kwargs = {'model': config['model_name'], 'base_url': config['base_url'], 'temperature': 0.0}
# Must set temperature=1.0 if model is gpt-o4-mini
if model_name == 'gpt-o4-mini':
kwargs['temperature'] = 1.0
if api_key_secret:
kwargs['api_key'] = api_key_secret
elif config.get('base_url'):
@@ -727,39 +730,42 @@ def calculate_local_summary(results_dir: str | None = None) -> dict:
}
async def run_task_with_semaphore(
task: Task,
run_id: str,
convex_url: str,
secret_key: str,
eval_model: BaseChatModel,
llm: BaseChatModel,
max_steps_per_task: int,
headless: bool,
use_vision: bool,
semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument
fresh_start: bool = True,
) -> dict:
"""Run a single task with semaphore, sequential execution, and robust error handling"""
# Acquire semaphore before starting any task-specific logic
async with semaphore_runs:
# --- Initialize State & Payload ---
task_folder = Path(f'saved_trajectories/{task.task_id}')
result_file = task_folder / 'result.json' # Now points to the file created by reformat_agent_history
from dataclasses import dataclass
from enum import Enum
from typing import Any
# Flags to track progress and errors
execution_needed = True
execution_succeeded = False
evaluation_needed = True
evaluation_succeeded = True # Default to True, set to False if eval is needed but fails
local_processing_error = None
# Initialize the payload with basic info and default failure/unevaluated states
# Using server-expected keys now
server_payload = {
class Stage(Enum):
LOAD_EXISTING = 'load_existing'
SETUP_BROWSER = 'setup_browser'
RUN_AGENT = 'run_agent'
FORMAT_HISTORY = 'format_history'
EVALUATE = 'evaluate'
SAVE_SERVER = 'save_server'
@dataclass
class StageError(Exception):
stage: Stage
error_type: str # "timeout", "cancelled", "exception"
message: str
class TaskResult:
"""Simplified task state tracker with auto-updating server payload"""
def __init__(self, task_id: str, run_id: str, task_description: str, task: Task, max_steps: int):
self.task_id = task_id
self.completed_stages = set()
self.stage_data = {} # Store actual results from each stage
self.failed_stages = {} # Store errors from failed stages
self.local_error = None
# Initialize server payload with defaults
self.server_payload = {
'runId': run_id,
'taskId': task.task_id,
'task': task.confirmed_task,
'taskId': task_id,
'task': task_description,
'taskWebsite': task.website,
'taskReferenceLength': task.reference_length,
'taskLevel': task.level,
@@ -773,312 +779,358 @@ async def run_task_with_semaphore(
'onlineMind2WebEvaluationError': None,
'onlineMind2WebEvaluationSuccess': False,
'onlineMind2WebEvaluationScore': 0.0,
'completeHistory': [], # Initialize new field
'maxSteps': max_steps_per_task,
'completeHistory': [],
'maxSteps': max_steps,
'tokensUsed': 0,
'taskDuration': None,
'steps': 0,
}
# Initialize the return value for local processing status
local_task_status = {'task_id': task.task_id, 'success': False, 'error': None}
def stage_completed(self, stage: Stage, data: Any = None):
"""Mark stage as completed and update server payload"""
self.completed_stages.add(stage)
if data is not None:
self.stage_data[stage] = data
self._auto_update_payload()
# --- Main Sequential Logic with Error Handling ---
try:
# 1. Check for Existing Result
if result_file.exists():
logger.info(f'Task {task.task_id}: Found existing result file.')
try:
async with await anyio.open_file(result_file) as f:
existing_result = json.loads(await f.read())
def stage_failed(self, stage: Stage, error: StageError):
"""Mark stage as failed and update server payload"""
self.failed_stages[stage] = error
self._auto_update_payload()
# Populate payload from existing file (including new fields)
server_payload['actionHistory'] = existing_result.get('action_history', [])
server_payload['finalResultResponse'] = existing_result.get('final_result_response', 'None')
server_payload['selfReportCompleted'] = existing_result.get('self_report_completed', False)
server_payload['selfReportSuccess'] = existing_result.get('self_report_success', None)
server_payload['completeHistory'] = existing_result.get('complete_history', [])
server_payload['taskDuration'] = existing_result.get('task_duration')
server_payload['steps'] = existing_result.get('steps', 0)
server_payload['tokensUsed'] = existing_result.get('tokensUsed', 0) # Ensure tokensUsed is loaded
def has_execution_data(self) -> bool:
"""Check if we have execution data from either loading existing or completing execution"""
return Stage.LOAD_EXISTING in self.completed_stages or Stage.FORMAT_HISTORY in self.completed_stages
# Check if evaluation data is also present
if existing_eval := existing_result.get('Online_Mind2Web_evaluation'):
logger.info(f'Task {task.task_id}: Found existing evaluation data.')
# Ensure judgement is stored as string "None" if it was null/None in cache
cached_judgement = existing_eval.get('judgement')
server_payload['onlineMind2WebEvaluationJudgement'] = (
cached_judgement if cached_judgement is not None else 'None'
)
server_payload['onlineMind2WebEvaluationError'] = existing_eval.get('error')
server_payload['onlineMind2WebEvaluationSuccess'] = existing_eval.get('success', False)
server_payload['onlineMind2WebEvaluationScore'] = existing_eval.get('score', 0.0)
evaluation_needed = False # Don't re-evaluate if already present
evaluation_succeeded = True # Assume cached evaluation was successful
else:
evaluation_needed = True
evaluation_succeeded = False
def execution_succeeded(self) -> bool:
"""Check if execution pipeline succeeded"""
return (
Stage.LOAD_EXISTING in self.completed_stages or Stage.FORMAT_HISTORY in self.completed_stages
) and not self._has_execution_failures()
execution_needed = False
execution_succeeded = True
logger.info(f'Task {task.task_id}: Successfully loaded existing result. Skipping execution.')
def _has_execution_failures(self) -> bool:
"""Check if any execution-related stages failed"""
execution_stages = {Stage.SETUP_BROWSER, Stage.RUN_AGENT, Stage.FORMAT_HISTORY}
return any(stage in self.failed_stages for stage in execution_stages)
except Exception as e:
logger.warning(
f'Task {task.task_id}: Error reading existing result file {result_file}: {type(e).__name__}: {str(e)}. Proceeding with execution.'
)
execution_needed = True
execution_succeeded = False
evaluation_needed = True
evaluation_succeeded = False
def _auto_update_payload(self):
"""Automatically update server_payload based on current state"""
# Update execution data if available
if Stage.LOAD_EXISTING in self.completed_stages:
existing_data = self.stage_data[Stage.LOAD_EXISTING]
self.server_payload.update(
{
'actionHistory': existing_data.get('action_history', []),
'finalResultResponse': existing_data.get('final_result_response', 'None'),
'selfReportCompleted': existing_data.get('self_report_completed', False),
'selfReportSuccess': existing_data.get('self_report_success', None),
'completeHistory': existing_data.get('complete_history', []),
'taskDuration': existing_data.get('task_duration'),
'steps': existing_data.get('steps', 0),
'tokensUsed': existing_data.get('tokensUsed', 0),
}
)
elif Stage.FORMAT_HISTORY in self.completed_stages:
formatted_data = self.stage_data[Stage.FORMAT_HISTORY]
self.server_payload.update(
{
'actionHistory': formatted_data.get('action_history', []),
'finalResultResponse': formatted_data.get('final_result_response', 'None'),
'selfReportCompleted': formatted_data.get('self_report_completed', False),
'selfReportSuccess': formatted_data.get('self_report_success', None),
'completeHistory': formatted_data.get('complete_history', []),
'taskDuration': formatted_data.get('task_duration'),
'steps': formatted_data.get('steps', 0),
'tokensUsed': formatted_data.get('tokensUsed', 0),
}
)
# 2. Execute Task (if needed)
if execution_needed:
logger.info(f'Task {task.task_id}: Starting execution.')
agent_for_history = None # For safe access in except/finally
browser_session_for_cleanup = None # For safe access in finally
operation_timed_out = None # To specify which operation timed out
# Update evaluation data if available
if Stage.EVALUATE in self.completed_stages:
eval_data = self.stage_data[Stage.EVALUATE]
judgement = eval_data.get('judgement')
self.server_payload.update(
{
'onlineMind2WebEvaluationJudgement': judgement if judgement is not None else 'None',
'onlineMind2WebEvaluationError': eval_data.get('error'),
'onlineMind2WebEvaluationSuccess': eval_data.get('success', False),
'onlineMind2WebEvaluationScore': eval_data.get('score', 0.0),
}
)
try:
# Create a unique user_data_dir for each task
# Get parent like C:\\\\Users\\\\alexa\\\\.config\\\\browseruse\\\\profiles
base_user_data_dir = Path(BrowserProfile().user_data_dir).parent
unique_user_data_dir = base_user_data_dir / f'task_{task.task_id}'
unique_user_data_dir.mkdir(parents=True, exist_ok=True) # Ensure it exists
# Update failure states
self._update_failure_states()
browser_session = BrowserSession(
browser_profile=BrowserProfile(
user_data_dir=str(unique_user_data_dir), # Pass the unique path
headless=headless,
chromium_sandbox=False, # This is needed for the browser to run on GitHub Actions
),
)
browser_session_for_cleanup = browser_session
try:
logger.info(f'Task {task.task_id}: Starting browser session (timeout 120s)...')
await asyncio.wait_for(browser_session.start(), timeout=120)
logger.info(f'Task {task.task_id}: Browser session started.')
except TimeoutError as e:
operation_timed_out = 'browser_session.start()'
raise e # Re-raise to be caught by the outer TimeoutError handler
initial_actions = [{'go_to_url': {'url': task.website}}]
agent = Agent(
task=task.confirmed_task,
llm=llm,
browser_session=browser_session,
initial_actions=initial_actions,
use_vision=use_vision,
source='eval_platform',
)
agent_for_history = agent
try:
logger.info(f'Task {task.task_id}: Starting agent run (timeout 600s)...')
await asyncio.wait_for(
agent.run(max_steps=max_steps_per_task),
timeout=600,
)
logger.info(f'Task {task.task_id}: Agent run completed.')
except TimeoutError as e:
operation_timed_out = 'agent.run()'
raise e # Re-raise to be caught by the outer TimeoutError handler
# Reformat agent history to create result.json
run_result_data = await reformat_agent_history(
agent_history=agent_for_history.state.history,
task_id=task.task_id,
run_id=run_id,
task=task.confirmed_task,
)
if not run_result_data:
# This shouldn't happen if reformat succeeded, but handle defensively
logger.error(f'Task {task.task_id}: reformat_agent_history did not return results.')
raise ValueError('Result formatting failed')
execution_succeeded = True
evaluation_needed = True
evaluation_succeeded = False # Will be set to True if evaluation runs and succeeds
# Populate payload from the newly created results
server_payload['actionHistory'] = run_result_data.get('action_history', [])
server_payload['finalResultResponse'] = run_result_data.get('final_result_response', 'None')
server_payload['selfReportCompleted'] = run_result_data.get('self_report_completed', False)
server_payload['selfReportSuccess'] = run_result_data.get('self_report_success', None)
server_payload['completeHistory'] = run_result_data.get('complete_history', [])
server_payload['taskDuration'] = run_result_data.get('task_duration')
server_payload['steps'] = run_result_data.get('steps', 0)
server_payload['tokensUsed'] = run_result_data.get('tokensUsed', 0)
except TimeoutError as e:
timeout_location_msg = f'Operation "{operation_timed_out}"' if operation_timed_out else 'An operation'
logger.error(
f'Task {task.task_id}: Timeout during execution. {timeout_location_msg} timed out. Error: {str(e)}',
exc_info=True,
)
execution_succeeded = False
evaluation_needed = False
evaluation_succeeded = False
server_payload['browserCrash'] = True
server_payload['browserCrashReason'] = (
f'Execution Timeout: {timeout_location_msg} timed out. Error: {type(e).__name__}: {str(e)}'
)
logger.info('added browser crash reason due to timeout: ' + server_payload['browserCrashReason'])
if agent_for_history and agent_for_history.state.history:
try:
logger.info(f'Task {task.task_id}: Attempting to reformat partial history after timeout.')
run_result_data = await reformat_agent_history(
agent_history=agent_for_history.state.history,
task_id=task.task_id,
run_id=run_id,
task=task.confirmed_task,
)
if run_result_data:
server_payload['actionHistory'] = run_result_data.get('action_history', [])
server_payload['finalResultResponse'] = run_result_data.get('final_result_response', 'None')
server_payload['selfReportCompleted'] = run_result_data.get('self_report_completed', False)
server_payload['selfReportSuccess'] = run_result_data.get('self_report_success', None)
server_payload['completeHistory'] = run_result_data.get('complete_history', [])
server_payload['taskDuration'] = run_result_data.get('task_duration')
server_payload['steps'] = run_result_data.get('steps', 0)
server_payload['tokensUsed'] = run_result_data.get('tokensUsed', 0)
except Exception as hist_e:
logger.error(
f'Task {task.task_id}: Error reformatting agent history after timeout: {type(hist_e).__name__}: {str(hist_e)}'
)
server_payload['onlineMind2WebEvaluationJudgement'] = 'Execution Timed Out'
server_payload['onlineMind2WebEvaluationSuccess'] = False
server_payload['onlineMind2WebEvaluationScore'] = 0.0
except Exception as e:
logger.error(
f'Task {task.task_id}: Browser Error during execution/reformatting with Type: {type(e).__name__} and Message: {str(e)}',
exc_info=True,
)
execution_succeeded = False
evaluation_needed = False
evaluation_succeeded = False
# Update payload to reflect execution failure
server_payload['browserCrash'] = True
server_payload['browserCrashReason'] = f'Execution Error: {type(e).__name__}: {str(e)}'
logger.info('added browser crash reason: ' + server_payload['browserCrashReason'])
# Try very carefully to add partial results if available
if agent_for_history and agent_for_history.state.history:
try:
logger.info(f'Task {task.task_id}: Attempting to reformat partial history after general error.')
run_result_data = await reformat_agent_history(
agent_history=agent_for_history.state.history,
task_id=task.task_id,
run_id=run_id,
task=task.confirmed_task,
)
if run_result_data:
server_payload['actionHistory'] = run_result_data.get('action_history', [])
server_payload['finalResultResponse'] = run_result_data.get('final_result_response', 'None')
server_payload['selfReportCompleted'] = run_result_data.get('self_report_completed', False)
server_payload['selfReportSuccess'] = run_result_data.get('self_report_success', None)
server_payload['completeHistory'] = run_result_data.get('complete_history', [])
server_payload['taskDuration'] = run_result_data.get('task_duration')
server_payload['steps'] = run_result_data.get('steps', 0)
server_payload['tokensUsed'] = run_result_data.get('tokensUsed', 0)
except Exception as hist_e:
logger.error(
f'Task {task.task_id}: Error reformatting agent history after general error: {type(hist_e).__name__}: {str(hist_e)}'
)
# Automatically set Online_Mind2Web_evaluation to failed
server_payload['onlineMind2WebEvaluationJudgement'] = 'Browser Execution Failed'
server_payload['onlineMind2WebEvaluationSuccess'] = False
server_payload['onlineMind2WebEvaluationScore'] = 0.0
finally:
if browser_session_for_cleanup:
try:
logger.info(f'Task {task.task_id}: Closing browser session in finally block.')
await browser_session_for_cleanup.close()
except Exception as browser_close_e:
logger.warning(
f'Task {task.task_id}: Error closing browser: {type(browser_close_e).__name__}: {browser_close_e}'
)
# 3. Evaluate Task (if needed and possible)
if evaluation_needed and execution_succeeded:
logger.info(f'Task {task.task_id}: Starting evaluation.')
try:
# judge_task_result will attempt evaluation and save it back into result.json if successful
evaluation = await judge_task_result(eval_model, task_folder, score_threshold=3)
# Update payload directly from the evaluation function's return value
if evaluation:
judgement_value = evaluation.get('judgement')
server_payload['onlineMind2WebEvaluationJudgement'] = (
judgement_value if judgement_value is not None else 'None'
)
server_payload['onlineMind2WebEvaluationError'] = evaluation.get('error')
server_payload['onlineMind2WebEvaluationSuccess'] = evaluation.get('success', False)
server_payload['onlineMind2WebEvaluationScore'] = evaluation.get('score', 0.0)
if evaluation.get('error'):
logger.warning(
f'Task {task.task_id}: Evaluation completed but reported an error: {evaluation.get("error")}'
)
evaluation_succeeded = False
else:
evaluation_succeeded = True
logger.info(f'Task {task.task_id}: Evaluation successfully completed.')
else:
logger.error(f'Task {task.task_id}: Evaluation function returned None.')
evaluation_succeeded = False
server_payload['onlineMind2WebEvaluationJudgement'] = 'Evaluation Returned None'
server_payload['onlineMind2WebEvaluationError'] = 'Evaluation function returned None'
except Exception as e:
logger.error(
f'Task {task.task_id}: Error during evaluation process: {type(e).__name__}: {str(e)}', exc_info=True
)
evaluation_succeeded = False
server_payload['onlineMind2WebEvaluationJudgement'] = 'Evaluation Process Error'
server_payload['onlineMind2WebEvaluationError'] = f'Evaluation Error: {type(e).__name__}: {str(e)}'
except Exception as outer_e:
logger.critical(f'Task {task.task_id}: CRITICAL UNHANDLED ERROR during processing: {str(outer_e)}', exc_info=True)
local_processing_error = f'Critical flow error: {str(outer_e)}'
server_payload['finalResultResponse'] = f'Critical Error: {str(outer_e)}'
server_payload['onlineMind2WebEvaluationJudgement'] = 'Critical System Error'
server_payload['onlineMind2WebEvaluationError'] = local_processing_error
server_payload['onlineMind2WebEvaluationSuccess'] = False
server_payload['onlineMind2WebEvaluationScore'] = 0.0
execution_succeeded = False
evaluation_succeeded = False
# --- Final Step: Save to Server (Always Attempt) ---
logger.info(f'Task {task.task_id}: Attempting to save final result to server...')
try:
# Pass the fully populated server_payload
save_success = save_task_result_to_server(convex_url, secret_key, server_payload)
if save_success:
logger.info(f'Task {task.task_id}: Successfully saved result to server.')
else:
logger.warning(f'Task {task.task_id}: Failed to save result to server (API issue or invalid payload).')
if local_processing_error:
local_processing_error += '; Server save failed'
def _update_failure_states(self):
"""Update server payload based on failed stages"""
# Check for browser/execution failures
for stage, error in self.failed_stages.items():
if stage in {Stage.SETUP_BROWSER, Stage.RUN_AGENT}:
self.server_payload['browserCrash'] = True
if error.error_type == 'timeout':
self.server_payload['browserCrashReason'] = f'{stage.value} timed out: {error.message}'
elif error.error_type == 'cancelled':
self.server_payload['browserCrashReason'] = f'{stage.value} was cancelled: {error.message}'
else:
local_processing_error = 'Server save failed'
self.server_payload['browserCrashReason'] = f'{stage.value} failed: {error.message}'
# Update evaluation failures
elif stage == Stage.EVALUATE:
if error.error_type == 'timeout':
self.server_payload['onlineMind2WebEvaluationJudgement'] = 'Evaluation Timed Out'
self.server_payload['onlineMind2WebEvaluationError'] = 'Evaluation process timed out'
elif error.error_type == 'cancelled':
self.server_payload['onlineMind2WebEvaluationJudgement'] = 'Evaluation Cancelled'
self.server_payload['onlineMind2WebEvaluationError'] = 'Evaluation was cancelled'
else:
self.server_payload['onlineMind2WebEvaluationJudgement'] = 'Evaluation Process Error'
self.server_payload['onlineMind2WebEvaluationError'] = f'Evaluation Error: {error.message}'
def mark_cancelled(self):
"""Mark task as cancelled"""
self.server_payload.update(
{
'finalResultResponse': 'Task was cancelled',
'onlineMind2WebEvaluationJudgement': 'Task Cancelled',
'onlineMind2WebEvaluationError': 'Task was cancelled',
'onlineMind2WebEvaluationSuccess': False,
'onlineMind2WebEvaluationScore': 0.0,
}
)
self.local_error = 'Task cancelled'
def mark_critical_error(self, error_msg: str):
"""Mark task as having critical error"""
self.server_payload.update(
{
'finalResultResponse': f'Critical Error: {error_msg}',
'onlineMind2WebEvaluationJudgement': 'Critical System Error',
'onlineMind2WebEvaluationError': f'Critical flow error: {error_msg}',
'onlineMind2WebEvaluationSuccess': False,
'onlineMind2WebEvaluationScore': 0.0,
}
)
self.local_error = f'Critical flow error: {error_msg}'
def mark_server_save_failed(self, error_msg: str):
"""Mark server save as failed"""
if self.local_error:
self.local_error += f'; Server save failed: {error_msg}'
else:
self.local_error = f'Server save failed: {error_msg}'
def get_local_status(self) -> dict:
"""Return local processing status"""
success = self.execution_succeeded() and (
Stage.EVALUATE in self.completed_stages or not self.has_execution_data() or Stage.EVALUATE in self.failed_stages
)
return {'task_id': self.task_id, 'success': success and not self.local_error, 'error': self.local_error}
async def run_stage(stage: Stage, stage_func, timeout: int | None = None):
	"""Await *stage_func*, optionally bounded by *timeout* seconds.

	*stage* is accepted for call-site readability only; it is not used
	here. When a (truthy) timeout is given, the coroutine is wrapped in
	asyncio.wait_for, so TimeoutError propagates to the caller.
	"""
	if not timeout:
		return await stage_func()
	return await asyncio.wait_for(stage_func(), timeout)
async def load_existing_result(task_folder: Path) -> dict:
	"""Read a previously saved result.json from *task_folder*.

	Raises FileNotFoundError when no result file exists, so callers can
	fall through to the full execution pipeline. The returned dict is the
	parsed result, annotated with ``has_evaluation`` (and, when present,
	``evaluation_data``) describing any cached Online_Mind2Web evaluation.
	"""
	result_path = task_folder / 'result.json'
	if not result_path.exists():
		raise FileNotFoundError('No existing result found')
	async with await anyio.open_file(result_path) as handle:
		payload = json.loads(await handle.read())
	# Annotate the payload so callers can tell whether a cached
	# evaluation can be reused instead of re-running the judge.
	cached_eval = payload.get('Online_Mind2Web_evaluation')
	if cached_eval:
		payload['has_evaluation'] = True
		payload['evaluation_data'] = cached_eval
	else:
		payload['has_evaluation'] = False
	return payload
async def setup_browser_session(task: Task, headless: bool) -> BrowserSession:
	"""Create and start a BrowserSession with a per-task profile directory.

	Each task gets its own user_data_dir (a sibling of the default profile
	directory, named ``task_<task_id>``) so concurrent tasks never share
	browser state.
	"""
	# Derive the per-task profile path from the default profile's parent.
	profiles_root = Path(BrowserProfile().user_data_dir).parent
	task_profile_dir = profiles_root / f'task_{task.task_id}'
	task_profile_dir.mkdir(parents=True, exist_ok=True)
	profile = BrowserProfile(
		user_data_dir=str(task_profile_dir),
		headless=headless,
		chromium_sandbox=False,  # sandbox must stay off for CI runners (e.g. GitHub Actions)
	)
	session = BrowserSession(browser_profile=profile)
	await session.start()
	return session
async def run_agent_with_browser(
	browser_session: BrowserSession, task: Task, llm: BaseChatModel, max_steps: int, use_vision: bool
) -> AgentHistoryList:
	"""Drive an Agent through *task* inside *browser_session*.

	The agent's first action navigates to the task's website; it then runs
	for at most *max_steps* steps. Returns the recorded agent history.
	"""
	# Open the task's target page before handing control to the agent.
	start_actions = [{'go_to_url': {'url': task.website}}]
	task_agent = Agent(
		task=task.confirmed_task,
		llm=llm,
		browser_session=browser_session,
		initial_actions=start_actions,
		use_vision=use_vision,
		source='eval_platform',
	)
	await task_agent.run(max_steps=max_steps)
	return task_agent.state.history
async def evaluate_task_result(eval_model: BaseChatModel, task_folder: Path) -> dict:
	"""Judge the saved trajectory in *task_folder* with *eval_model*.

	Thin wrapper around judge_task_result using the pipeline's fixed
	score threshold.
	"""
	verdict = await judge_task_result(eval_model, task_folder, score_threshold=3)
	return verdict
def save_result_to_server(convex_url: str, secret_key: str, payload: dict) -> bool:
	"""Forward *payload* to the result server; returns True on success.

	Deliberately synchronous so callers can off-load it via
	asyncio.to_thread without blocking the event loop.
	"""
	succeeded = save_task_result_to_server(convex_url, secret_key, payload)
	return succeeded
async def cleanup_browser_safe(browser_session: BrowserSession):
	"""Close *browser_session*, swallowing any failure.

	The close is bounded to 30 seconds; a timeout or any other error is
	logged as a warning and never propagated, so cleanup cannot fail a task.
	"""
	try:
		close_op = browser_session.close()
		await asyncio.wait_for(close_op, timeout=30)
	except Exception as exc:
		logger.warning(f'Browser cleanup failed: {exc}')
def determine_current_stage(completed_stages: set) -> Stage:
	"""Map the set of completed stages to the furthest pipeline stage reached.

	Stages are checked from latest to earliest; the first one found in
	*completed_stages* wins. An empty set means the pipeline never got past
	the starting stage, so LOAD_EXISTING is returned as the default.
	"""
	# Latest-to-earliest pipeline order; first match is the furthest stage.
	precedence = (
		Stage.SAVE_SERVER,
		Stage.EVALUATE,
		Stage.FORMAT_HISTORY,
		Stage.RUN_AGENT,
		Stage.SETUP_BROWSER,
		Stage.LOAD_EXISTING,
	)
	for stage in precedence:
		if stage in completed_stages:
			return stage
	return Stage.LOAD_EXISTING  # nothing completed yet: default starting stage
async def run_task_with_semaphore(
task: Task,
run_id: str,
convex_url: str,
secret_key: str,
eval_model: BaseChatModel,
llm: BaseChatModel,
max_steps_per_task: int,
headless: bool,
use_vision: bool,
semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument
fresh_start: bool = True,
) -> dict:
"""Clean pipeline approach for running tasks"""
async with semaphore_runs:
task_result = TaskResult(task.task_id, run_id, task.confirmed_task, task, max_steps_per_task)
browser_session = None
task_folder = Path(f'saved_trajectories/{task.task_id}')
try:
# Stage 1: Try to load existing result
try:
existing_data = await run_stage(Stage.LOAD_EXISTING, lambda: load_existing_result(task_folder))
task_result.stage_completed(Stage.LOAD_EXISTING, existing_data)
# If evaluation is also present, mark it as completed
if existing_data.get('has_evaluation'):
task_result.stage_completed(Stage.EVALUATE, existing_data['evaluation_data'])
logger.info(f'Task {task.task_id}: Successfully loaded existing result. Skipping execution.')
except Exception:
# No existing result, need to execute full pipeline
logger.info(f'Task {task.task_id}: No existing result found. Starting execution pipeline.')
# Stage 2: Setup browser
browser_session = await run_stage(Stage.SETUP_BROWSER, lambda: setup_browser_session(task, headless), timeout=120)
task_result.stage_completed(Stage.SETUP_BROWSER)
logger.info(f'Task {task.task_id}: Browser session started.')
# Stage 3: Run agent
agent_history = await run_stage(
Stage.RUN_AGENT,
lambda: run_agent_with_browser(browser_session, task, llm, max_steps_per_task, use_vision),
timeout=600,
)
task_result.stage_completed(Stage.RUN_AGENT)
logger.info(f'Task {task.task_id}: Agent run completed.')
# Stage 4: Format history
formatted_data = await run_stage(
Stage.FORMAT_HISTORY, lambda: reformat_agent_history(agent_history, task.task_id, run_id, task.confirmed_task)
)
task_result.stage_completed(Stage.FORMAT_HISTORY, formatted_data)
logger.info(f'Task {task.task_id}: Agent history formatted.')
# Stage 5: Evaluate (if we have execution data and no existing evaluation)
if task_result.has_execution_data() and Stage.EVALUATE not in task_result.completed_stages:
try:
evaluation = await run_stage(
Stage.EVALUATE, lambda: evaluate_task_result(eval_model, task_folder), timeout=300
)
task_result.stage_completed(Stage.EVALUATE, evaluation)
logger.info(f'Task {task.task_id}: Evaluation completed.')
except Exception as e:
error = StageError(Stage.EVALUATE, 'exception', str(e))
task_result.stage_failed(Stage.EVALUATE, error)
logger.error(f'Task {task.task_id}: Evaluation failed: {str(e)}')
# Stage 6: Save to server (always attempt)
try:
await run_stage(
Stage.SAVE_SERVER,
lambda: asyncio.to_thread(save_result_to_server, convex_url, secret_key, task_result.server_payload),
timeout=60,
)
task_result.stage_completed(Stage.SAVE_SERVER)
logger.info(f'Task {task.task_id}: Successfully saved result to server.')
except Exception as e:
error = StageError(Stage.SAVE_SERVER, 'exception', str(e))
task_result.stage_failed(Stage.SAVE_SERVER, error)
task_result.mark_server_save_failed(str(e))
logger.error(f'Task {task.task_id}: Server save failed: {str(e)}')
except TimeoutError:
current_stage = determine_current_stage(task_result.completed_stages)
error = StageError(current_stage, 'timeout', 'Operation timed out')
task_result.stage_failed(current_stage, error)
logger.error(f'Task {task.task_id}: {current_stage.value} timed out')
except asyncio.CancelledError:
task_result.mark_cancelled()
logger.warning(f'Task {task.task_id}: Task was cancelled')
except Exception as e:
logger.error(f'Task {task.task_id}: Exception during attempt to save result to server: {type(e).__name__}: {str(e)}')
if local_processing_error:
local_processing_error += f'; Server save exception: {str(e)}'
else:
local_processing_error = f'Server save exception: {str(e)}'
task_result.mark_critical_error(str(e))
logger.critical(f'Task {task.task_id}: Critical error: {str(e)}', exc_info=True)
# --- Return Local Processing Status ---
local_task_status['success'] = execution_succeeded and evaluation_succeeded
local_task_status['error'] = local_processing_error
finally:
# Always cleanup browser if it was created
if browser_session:
await cleanup_browser_safe(browser_session)
return local_task_status
return task_result.get_local_status()
async def run_multiple_tasks(
@@ -1119,9 +1171,19 @@ async def run_multiple_tasks(
fresh_start=fresh_start,
)
for task in tasks_to_run
)
),
return_exceptions=True, # Prevent task cancellation cascade
)
# Process task results and handle any exceptions returned by gather
processed_results = []
for i, result in enumerate(task_results):
if isinstance(result, Exception):
logger.error(f'Task {i} failed with exception: {type(result).__name__}: {result}')
processed_results.append({'task_id': f'task_{i}', 'success': False, 'error': str(result)})
else:
processed_results.append(result)
# After all tasks are complete, calculate a local summary
logger.info('All tasks completed. Calculating result summary...')
summary = calculate_local_summary()
@@ -1131,7 +1193,7 @@ async def run_multiple_tasks(
logger.info(f'Success rate: {summary["success_rate"]:.2%}')
logger.info(f'Average score: {summary["average_score"]:.2f}')
return {'task_results': task_results, 'summary': summary}
return {'task_results': processed_results, 'summary': summary}
# Helper function to fetch tasks from the server