Improves evaluation robustness and reporting

Enhances evaluation by improving error handling, providing more detailed logging, and adding a local summary calculation.

The changes include:

- Adds a comprehensive fallback for the Mind2Web judge while preserving backward compatibility (see the first sketch after this list).
- Improves error handling during evaluation by capturing and logging the last part of the output on failure.
- Adds a new function that computes a summary of local evaluation results: total tasks, success rate, and average score (see the second sketch after this list).
- Includes comprehensive evaluation data for debugging purposes.
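
For context, the judge fallback in the first bullet follows a try-then-fall-back pattern, roughly as below. This is a minimal sketch: the function names and the shape of the result dict are illustrative assumptions, not the repository's actual API.

    def evaluate_with_mind2web_judge(result: dict) -> dict:
        # Placeholder: the real judge would score the task here.
        raise RuntimeError("Mind2Web judge unavailable")

    def evaluate_with_default_judge(result: dict) -> dict:
        # Placeholder: simple threshold-based scoring.
        score = result.get("score", 0.0)
        return {"success": score >= 0.5, "score": score}

    def judge_task(result: dict) -> dict:
        # Prefer the Mind2Web judge; fall back so older runs keep working.
        try:
            return evaluate_with_mind2web_judge(result)
        except Exception as exc:
            print(f"Mind2Web judge failed ({exc}); using default judge")
            return evaluate_with_default_judge(result)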
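
The local summary from the third bullet amounts to a small aggregation over per-task results. A sketch, assuming each result dict carries success and score fields (field names are assumptions, not the actual schema):

    def summarize_local_results(results: list[dict]) -> dict:
        # Aggregate total tasks, success rate, and average score.
        total = len(results)
        if total == 0:
            return {"total_tasks": 0, "success_rate": 0.0, "average_score": 0.0}
        successes = sum(1 for r in results if r.get("success"))
        average = sum(r.get("score", 0.0) for r in results) / total
        return {"total_tasks": total, "success_rate": successes / total, "average_score": average}

    # Example: two tasks, one successful
    # summarize_local_results([{"success": True, "score": 0.9}, {"success": False, "score": 0.2}])
    # -> {"total_tasks": 2, "success_rate": 0.5, "average_score": 0.55}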
Magnus Müller
2025-06-23 00:08:14 +02:00
parent be170fb17a
commit d4a29c4b93
2 changed files with 89 additions and 6 deletions


@@ -250,21 +250,31 @@ jobs:
echo "=== STARTING EVALUATION ==="
echo "Command: ${{ steps.eval_command.outputs.FULL_COMMAND }}"
echo "Starting time: $(date)"
echo "Use Mind2Web Judge: ${{ github.event.client_payload.script_args.use_mind2web_judge }}"
echo "============================"
# Set up signal handlers and run the command
set -e
trap 'echo "=== EVALUATION INTERRUPTED ==="; echo "Time: $(date)"; echo "Last 50 lines of output:"; tail -50 eval_output.log; exit 130' INT TERM
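# (Exit code 130 = 128 + SIGINT, so an interrupted run is distinguishable from a normal failure.)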
# Run the evaluation with output capture and better error handling
set +e # Don't exit on errors, we want to capture them
${{ steps.eval_command.outputs.FULL_COMMAND }} 2>&1 | tee eval_output.log
EVAL_EXIT_CODE=${PIPESTATUS[0]}
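# PIPESTATUS[0] holds the exit code of the evaluation command itself, not of tee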
set -e # Re-enable exit on error
echo "=== EVALUATION COMPLETED ==="
echo "Exit code: $EVAL_EXIT_CODE"
echo "Completion time: $(date)"
echo "============================"
# Show last part of log for context
if [ $EVAL_EXIT_CODE -ne 0 ]; then
echo "=== EVALUATION FAILED - LAST 100 LINES OF OUTPUT ==="
tail -100 eval_output.log
echo "=================================================="
fi
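# Propagate the original exit code so a failed evaluation fails the job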
exit $EVAL_EXIT_CODE
- name: Post-execution Resource Check