Improves evaluation robustness and reporting

Enhances evaluation by improving error handling, providing more detailed logging, and adding a local summary calculation.

The changes include:

- Adds a fallback from the comprehensive judge to the Mind2Web judge and keeps the evaluation script backwards compatible when the comprehensive judge system cannot be imported.
- Improves error handling during evaluation by capturing the full output and logging its last 100 lines on failure.
- Adds a new function that calculates a summary of local evaluation results, reporting the total number of tasks, the success rate, and the average score (see the usage sketch after this list).
- Includes the full comprehensive judge evaluation data and the complete step history in task results for debugging purposes.
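A minimal usage sketch of the new local summary (not part of this commit): the module name `eval` is an assumption, but the function name, default directory, and result keys match `calculate_local_summary()` in the diff below.

```python
# Hypothetical usage sketch: print the local summary after an evaluation run.
# `eval` as the module name is an assumption; the keys come from the diff below.
from eval import calculate_local_summary

summary = calculate_local_summary('saved_trajectories')
print(f"Total tasks:   {summary['total_tasks']}")
print(f"Successful:    {summary.get('successful_tasks', 0)}")
print(f"Success rate:  {summary['success_rate']:.1%}")
print(f"Average score: {summary['average_score']:.2f}")
print(summary['message'])
```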
Author: Magnus Müller
Date: 2025-06-23 00:08:14 +02:00
Parent: be170fb17a
Commit: d4a29c4b93
2 changed files with 89 additions and 6 deletions


@@ -250,21 +250,31 @@ jobs:
echo "=== STARTING EVALUATION ==="
echo "Command: ${{ steps.eval_command.outputs.FULL_COMMAND }}"
echo "Starting time: $(date)"
echo "Use Mind2Web Judge: ${{ github.event.client_payload.script_args.use_mind2web_judge }}"
echo "============================"
# Set up signal handlers and run the command
set -e
trap 'echo "=== EVALUATION INTERRUPTED ==="; echo "Time: $(date)"; echo "Last 50 lines of output:"; tail -50 eval_output.log; exit 130' INT TERM
# Run the evaluation with output capture
# Run the evaluation with output capture and better error handling
set +e # Don't exit on errors, we want to capture them
${{ steps.eval_command.outputs.FULL_COMMAND }} 2>&1 | tee eval_output.log
EVAL_EXIT_CODE=${PIPESTATUS[0]}
set -e # Re-enable exit on error
echo "=== EVALUATION COMPLETED ==="
echo "Exit code: $EVAL_EXIT_CODE"
echo "Completion time: $(date)"
echo "============================"
# Show last part of log for context
if [ $EVAL_EXIT_CODE -ne 0 ]; then
  echo "=== EVALUATION FAILED - LAST 100 LINES OF OUTPUT ==="
  tail -100 eval_output.log
  echo "=================================================="
fi
exit $EVAL_EXIT_CODE
- name: Post-execution Resource Check


@@ -493,8 +493,17 @@ from typing import Any
import requests
from dotenv import load_dotenv
# Import the new comprehensive judge system
from judge_system import evaluate_task_with_comprehensive_judge
# Import the new comprehensive judge system (conditional import for backwards compatibility)
try:
    from judge_system import evaluate_task_with_comprehensive_judge

    COMPREHENSIVE_JUDGE_AVAILABLE = True
except ImportError:
    logger.warning('Comprehensive judge system not available. Only Mind2Web judge will be available.')
    COMPREHENSIVE_JUDGE_AVAILABLE = False

    def evaluate_task_with_comprehensive_judge(*args, **kwargs):
        raise ImportError('Comprehensive judge system not available')
class Stage(Enum):
@@ -577,6 +586,7 @@ class TaskResult:
'steps': format_data.get('steps'),
'maxSteps': self.max_steps,
'tokensUsed': format_data.get('tokensUsed'),
'completeHistory': format_data.get('complete_history', []), # Add complete step history
}
)
@@ -597,6 +607,7 @@ class TaskResult:
'comprehensiveJudgeEvaluationErrors': comp_eval.get('error_categories', []),
'comprehensiveJudgeEvaluationTips': comp_eval.get('improvement_tips', []),
'comprehensiveJudgeEvaluationScores': comp_eval.get('scores'),
'comprehensiveJudgeEvaluationFull': comp_eval, # Include full comprehensive eval data
}
)
@@ -632,9 +643,66 @@ class TaskResult:
}
def calculate_local_summary():
    """Calculate summary of local evaluation results (stub implementation)"""
    return {'total_tasks': 0, 'success_rate': 0.0, 'average_score': 0.0, 'message': 'Local summary calculation not implemented'}

def calculate_local_summary(results_dir: str = 'saved_trajectories'):
    """Calculate summary of local evaluation results"""
    try:
        results_path = Path(results_dir)
        if not results_path.exists():
            return {'total_tasks': 0, 'success_rate': 0.0, 'average_score': 0.0, 'message': 'No results directory found'}

        # Find all task result folders
        task_folders = [f for f in results_path.iterdir() if f.is_dir() and (f / 'result.json').exists()]
        if not task_folders:
            return {'total_tasks': 0, 'success_rate': 0.0, 'average_score': 0.0, 'message': 'No task results found'}

        total_tasks = len(task_folders)
        successful_tasks = 0
        total_score = 0.0

        for task_folder in task_folders:
            result_file = task_folder / 'result.json'
            try:
                with open(result_file) as f:
                    result_data = json.load(f)

                # Check for evaluation results
                evaluation_success = False
                task_score = 0.0

                # Check comprehensive judge evaluation
                comp_eval = result_data.get('comprehensive_judge_evaluation')
                if comp_eval:
                    evaluation_success = comp_eval.get('passed', False)
                    task_score = comp_eval.get('final_score', 0) / 100.0
                else:
                    # Check Mind2Web evaluation
                    mind2web_eval = result_data.get('Online_Mind2Web_evaluation')
                    if mind2web_eval:
                        evaluation_success = mind2web_eval.get('success', False)
                        task_score = mind2web_eval.get('score', 0.0)

                if evaluation_success:
                    successful_tasks += 1
                total_score += task_score
            except Exception as e:
                logger.warning(f'Failed to read result for {task_folder.name}: {e}')

        success_rate = successful_tasks / total_tasks if total_tasks > 0 else 0.0
        average_score = total_score / total_tasks if total_tasks > 0 else 0.0

        return {
            'total_tasks': total_tasks,
            'success_rate': success_rate,
            'average_score': average_score,
            'successful_tasks': successful_tasks,
            'message': f'Processed {total_tasks} tasks successfully',
        }
    except Exception as e:
        logger.error(f'Failed to calculate local summary: {e}')
        return {'total_tasks': 0, 'success_rate': 0.0, 'average_score': 0.0, 'message': f'Error: {str(e)}'}
from langchain_anthropic import ChatAnthropic
@@ -1165,6 +1233,11 @@ async def judge_task_result(model, task_folder: Path, score_threshold: float = 3
# Use the new comprehensive judge system (default)
logger.info(f'Task {task_folder.name}: Using comprehensive judge evaluation')

# Check if comprehensive judge is available
if not COMPREHENSIVE_JUDGE_AVAILABLE:
    logger.warning(f'Task {task_folder.name}: Comprehensive judge not available, falling back to Mind2Web')
    return await judge_task_result(model, task_folder, score_threshold, use_mind2web=True)

# Check if comprehensive judge result already exists
if result.get('comprehensive_judge_evaluation'):
    existing_eval = result['comprehensive_judge_evaluation']
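For reference, a minimal sketch (not part of the commit) of the two per-task result.json shapes that `calculate_local_summary()` understands: a `comprehensive_judge_evaluation` block takes precedence and its `final_score` is on a 0-100 scale, while a fallback `Online_Mind2Web_evaluation` block carries `success` and a 0.0-1.0 `score`, as in the diff above. Task names and values below are illustrative only.

```python
# Illustrative only: write two result.json payloads in the shapes read by
# calculate_local_summary(). Task names and scores are made up.
import json
from pathlib import Path

results_dir = Path('saved_trajectories')  # default directory used by the function

# Task judged by the comprehensive judge: 'passed' and 'final_score' (0-100) are used.
comprehensive_result = {
    'comprehensive_judge_evaluation': {'passed': True, 'final_score': 85},
}

# Task judged only by the Mind2Web judge: 'success' and 'score' (0.0-1.0) are used.
mind2web_result = {
    'Online_Mind2Web_evaluation': {'success': False, 'score': 0.4},
}

for name, payload in [('task_a', comprehensive_result), ('task_b', mind2web_result)]:
    folder = results_dir / name
    folder.mkdir(parents=True, exist_ok=True)
    (folder / 'result.json').write_text(json.dumps(payload))
```

Running `calculate_local_summary()` over this directory would report 2 total tasks, 1 successful task, a 0.5 success rate, and an average score of (0.85 + 0.4) / 2 = 0.625.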