Files
browser-use/eval/task_types.py

229 lines
8.2 KiB
Python

import json
import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
from eval.utils import create_pydantic_model_from_schema, make_json_serializable
logger = logging.getLogger(__name__)
class Stage(Enum):
    """Pipeline stages of a single eval task run, in execution order."""

    SETUP_BROWSER = 'setup_browser'
    RUN_AGENT = 'run_agent'
    FORMAT_HISTORY = 'format_history'
    EVALUATE = 'evaluate'
    SAVE_SERVER = 'save_server'
@dataclass
class StageError:
    """Error captured while running one pipeline stage."""

    stage: Stage  # the pipeline stage in which the error occurred
    error_type: str  # machine-readable category (e.g. 'exception', 'server_save')
    message: str  # human-readable description of the failure
@dataclass
class TaskResult:
    """Mutable record of one eval task run.

    Tracks which pipeline stages completed, their per-stage output data,
    and any errors; exposes builders for the server submission payload
    (`server_payload`) and a local status summary (`get_local_status`).
    """

    # Required identity / configuration fields.
    task_id: str
    run_id: str
    confirmed_task: str
    task: Any  # the originating task object; kept opaque here
    max_steps: int
    # Optional external links.
    laminar_link: str | None = None
    github_workflow_url: str | None = None
    # Progress bookkeeping.
    completed_stages: set[Stage] = field(default_factory=set)
    stage_data: dict[Stage, Any] = field(default_factory=dict)  # per-stage output, keyed by Stage
    errors: list[StageError] = field(default_factory=list)
    cancelled: bool = False
    critical_error: str | None = None
    server_save_failed: bool = False

    def stage_completed(self, stage: Stage, data: Any = None) -> None:
        """Mark *stage* as completed, optionally storing its output data."""
        self.completed_stages.add(stage)
        if data is not None:
            self.stage_data[stage] = data

    def stage_failed(self, stage: Stage, error: StageError) -> None:
        """Record a failure for *stage* (the stage is NOT marked completed)."""
        self.errors.append(error)

    def mark_cancelled(self) -> None:
        """Flag this run as cancelled."""
        self.cancelled = True

    def mark_critical_error(self, error: str) -> None:
        """Record an unrecoverable error message for this run."""
        self.critical_error = error

    def mark_server_save_failed(self, error: str) -> None:
        """Record that persisting results to the server failed."""
        self.server_save_failed = True
        self.errors.append(StageError(Stage.SAVE_SERVER, 'server_save', error))

    def has_execution_data(self) -> bool:
        """True if the agent ran or its history was formatted."""
        return Stage.RUN_AGENT in self.completed_stages or Stage.FORMAT_HISTORY in self.completed_stages

    @property
    def server_payload(self) -> dict[str, Any]:
        """Generate payload for server submission"""
        # Always-present summary fields; stage-dependent sections are merged in below.
        payload = {
            'taskId': self.task_id,
            'runId': self.run_id,
            'task': self.confirmed_task,
            'completed_stages': [stage.value for stage in self.completed_stages],
            'has_errors': len(self.errors) > 0,
            'cancelled': self.cancelled,
            'critical_error': self.critical_error,
            'server_save_failed': self.server_save_failed,
            'laminarTaskLink': self.laminar_link,
            'githubWorkflowUrl': self.github_workflow_url,
        }
        # Add task execution data if available
        if Stage.FORMAT_HISTORY in self.completed_stages:
            format_data = self.stage_data.get(Stage.FORMAT_HISTORY, {})
            logger.info(f'format_data: {format_data}')
            # log token usage
            logger.info(f'tokensUsed: {format_data.get("tokensUsed")}')
            logger.info(f'usage: {format_data.get("usage")}')
            # Handle usage data - convert to JSON string if it's a dict
            usage_data = format_data.get('usage')
            if usage_data and isinstance(usage_data, dict):
                usage_data = json.dumps(usage_data)
            payload.update(
                {
                    'actionHistory': format_data.get('action_history', []),
                    'finalResultResponse': format_data.get('final_result_response', ''),
                    'selfReportCompleted': format_data.get('self_report_completed', False),
                    'selfReportSuccess': format_data.get('self_report_success', False),
                    'taskDuration': format_data.get('task_duration'),
                    'steps': format_data.get('steps'),
                    'maxSteps': self.max_steps,
                    'tokensUsed': format_data.get('tokensUsed'),
                    'usage': usage_data,  # Add usage data (JSON string if dict)
                    'completeHistory': format_data.get('complete_history', []),  # Add complete step history
                }
            )
        # Add evaluation data if available
        if Stage.EVALUATE in self.completed_stages:
            eval_data = self.stage_data.get(Stage.EVALUATE, {})
            # Handle comprehensive judge evaluation
            comp_eval = eval_data.get('comprehensive_evaluation') or eval_data.get('comprehensive_judge')
            if comp_eval:
                # Convert enum lists to string lists for database storage
                task_categories = comp_eval.get('task_categories', [])
                if task_categories and hasattr(task_categories[0], 'value'):
                    task_categories = [cat.value for cat in task_categories]
                error_categories = comp_eval.get('error_categories', [])
                if error_categories and hasattr(error_categories[0], 'value'):
                    error_categories = [err.value for err in error_categories]
                payload.update(
                    {
                        'comprehensiveJudgeEvaluationSummary': comp_eval.get('task_summary'),
                        'comprehensiveJudgeEvaluationReasoning': comp_eval.get('reasoning'),
                        'comprehensiveJudgeEvaluationPassed': comp_eval.get('passed'),
                        'comprehensiveJudgeEvaluationScore': comp_eval.get('final_score'),
                        'comprehensiveJudgeEvaluationCategories': task_categories,
                        'comprehensiveJudgeEvaluationErrors': error_categories,
                        'comprehensiveJudgeEvaluationTips': comp_eval.get('improvement_tips', []),
                        'comprehensiveJudgeEvaluationCriticalIssues': comp_eval.get('critical_issues', []),
                        'comprehensiveJudgeEvaluationScores': comp_eval.get('scores'),
                        'comprehensiveJudgeEvaluationFull': comp_eval,  # Include full comprehensive eval data
                    }
                )
            # Handle legacy Mind2Web evaluation (for compatibility)
            payload.update(
                {
                    'onlineMind2WebEvaluationJudgement': eval_data.get('judgement') or 'No evaluation available',
                    'onlineMind2WebEvaluationError': eval_data.get('error'),
                    'onlineMind2WebEvaluationSuccess': eval_data.get('success', False),
                    'onlineMind2WebEvaluationScore': eval_data.get('score', 0.0),
                }
            )
        # Ensure all data in payload is JSON serializable
        serialized_payload = make_json_serializable(payload)
        # Type assertion since we know payload is a dict and make_json_serializable preserves dict structure
        assert isinstance(serialized_payload, dict), 'Payload serialization should preserve dict structure'
        return serialized_payload

    def get_local_status(self) -> dict[str, Any]:
        """Get local status summary"""
        # Success = evaluation completed, run not cancelled, no critical error,
        # and no recorded errors of type 'exception'.
        success = (
            Stage.EVALUATE in self.completed_stages
            and not self.cancelled
            and self.critical_error is None
            and len([e for e in self.errors if e.error_type == 'exception']) == 0
        )
        return {
            'task_id': self.task_id,
            'success': success,
            # First recorded error message is the surfaced one, unless a critical error exists.
            'error': self.critical_error or (self.errors[0].message if self.errors else None),
            'completed_stages': [stage.value for stage in self.completed_stages],
        }
class Task:
    """A single eval task definition.

    Requires a non-empty ``task_id`` and ``confirmed_task``. All other
    constructor kwargs are optional: the fields listed in
    ``_OPTIONAL_FIELDS`` become attributes defaulting to ``None``; any
    unrecognized kwargs are kept in ``additional_fields`` and also set
    as attributes.

    Raises:
        ValueError: if ``task_id`` or ``confirmed_task`` is falsy.
    """

    # Single source of truth for the known optional kwargs. The original code
    # listed these three times (individual kwargs.get lines, a known_fields
    # set, and the __str__ format string), which could silently drift.
    # Order matters: it defines the field order in __str__.
    _OPTIONAL_FIELDS = (
        'website',
        'reference_length',
        'level',
        'cluster_id',
        'login_cookie',
        'login_type',
        'category',
        'output_schema',  # structured output schema (JSON schema dict)
        'auth_keys',  # list of auth keys to fetch from auth distribution
    )

    def __init__(self, task_id, confirmed_task, **kwargs):
        # Validate required fields
        if not task_id:
            raise ValueError('task_id is required and cannot be empty')
        if not confirmed_task:
            raise ValueError('confirmed_task is required and cannot be empty')
        # Set required fields
        self.task_id = task_id
        self.confirmed_task = confirmed_task
        # Known optional fields default to None when not supplied.
        for name in self._OPTIONAL_FIELDS:
            setattr(self, name, kwargs.get(name, None))
        if self.output_schema:
            # Convert JSON schema to Pydantic model class
            self.output_model = create_pydantic_model_from_schema(self.output_schema, f'Task_{self.task_id}_Output')
        else:
            self.output_model = None
        # Store any additional optional fields and expose them as attributes too.
        self.additional_fields = {k: v for k, v in kwargs.items() if k not in self._OPTIONAL_FIELDS}
        for key, value in self.additional_fields.items():
            setattr(self, key, value)

    def __str__(self):
        # Render required + known optional fields in declaration order,
        # followed by any additional fields (same output as before).
        field_names = ('task_id', 'confirmed_task', *self._OPTIONAL_FIELDS)
        parts = [f'{name}={getattr(self, name)}' for name in field_names]
        parts.extend(f'{k}={v}' for k, v in self.additional_fields.items())
        return f'Task({", ".join(parts)})'

    def __repr__(self):
        return self.__str__()