mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
Improves evaluation robustness and reporting
Enhances evaluation by improving error handling, providing more detailed logging, and adding a local summary calculation. The changes:
- Add a comprehensive judge fallback to the Mind2Web judge and ensure backward compatibility.
- Improve error handling during evaluation by capturing and logging the last part of the output on failure.
- Add a new function that calculates a summary of local evaluation results, displaying total tasks, success rate, and average score.
- Include comprehensive evaluation data for debugging purposes.
This commit is contained in:
12
.github/workflows/eval.yaml
vendored
12
.github/workflows/eval.yaml
vendored
@@ -250,21 +250,31 @@ jobs:
|
||||
echo "=== STARTING EVALUATION ==="
|
||||
echo "Command: ${{ steps.eval_command.outputs.FULL_COMMAND }}"
|
||||
echo "Starting time: $(date)"
|
||||
echo "Use Mind2Web Judge: ${{ github.event.client_payload.script_args.use_mind2web_judge }}"
|
||||
echo "============================"
|
||||
|
||||
# Set up signal handlers and run the command
|
||||
set -e
|
||||
trap 'echo "=== EVALUATION INTERRUPTED ==="; echo "Time: $(date)"; echo "Last 50 lines of output:"; tail -50 eval_output.log; exit 130' INT TERM
|
||||
|
||||
# Run the evaluation with output capture
|
||||
# Run the evaluation with output capture and better error handling
|
||||
set +e # Don't exit on errors, we want to capture them
|
||||
${{ steps.eval_command.outputs.FULL_COMMAND }} 2>&1 | tee eval_output.log
|
||||
EVAL_EXIT_CODE=${PIPESTATUS[0]}
|
||||
set -e # Re-enable exit on error
|
||||
|
||||
echo "=== EVALUATION COMPLETED ==="
|
||||
echo "Exit code: $EVAL_EXIT_CODE"
|
||||
echo "Completion time: $(date)"
|
||||
echo "============================"
|
||||
|
||||
# Show last part of log for context
|
||||
if [ $EVAL_EXIT_CODE -ne 0 ]; then
|
||||
echo "=== EVALUATION FAILED - LAST 100 LINES OF OUTPUT ==="
|
||||
tail -100 eval_output.log
|
||||
echo "=================================================="
|
||||
fi
|
||||
|
||||
exit $EVAL_EXIT_CODE
|
||||
|
||||
- name: Post-execution Resource Check
|
||||
|
||||
Reference in New Issue
Block a user