Files
browser-use/.github/workflows/eval.yaml
2025-06-23 15:32:28 -07:00

392 lines
15 KiB
YAML

# Evaluation workflow: started by a `run-eval` repository_dispatch event or
# manually through workflow_dispatch.
name: Run Evaluation Script

on:
  repository_dispatch:
    types: [run-eval]
  workflow_dispatch:

# Least privilege for GITHUB_TOKEN: the job only needs to read the repository
# for checkout. NOTE(review): widen this if a step ever has to write back to
# the repository or call other GitHub APIs with the token.
permissions:
  contents: read

jobs:
  run_evaluation:
    runs-on: ubuntu-latest
    # Upper bound for the whole job (6 hours).
    timeout-minutes: 360
    # Job-level environment: visible to every step below. String-typed flags
    # are quoted so YAML does not turn them into booleans.
    env:
      IN_DOCKER: 'true'
      ANONYMIZED_TELEMETRY: 'false'
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
      DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
      EVALUATION_TOOL_URL: ${{ secrets.EVALUATION_TOOL_URL }}
      EVALUATION_TOOL_SECRET_KEY: ${{ secrets.EVALUATION_TOOL_SECRET_KEY }}
      ANCHOR_BROWSER_API_KEY: ${{ secrets.ANCHOR_BROWSER_API_KEY }}
      SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
      LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_PROJECT_API_KEY }}
    steps:
- name: System Info and Resource Check
  run: |
    # Print a heading, then run the given command so its output follows it.
    report() {
      local heading="$1"
      shift
      printf '%s\n' "$heading"
      "$@"
    }

    # Snapshot of the runner before any work is done.
    printf '%s\n' "=== SYSTEM INFORMATION ==="
    printf 'Runner OS: %s\n' "$(uname -a)"
    printf 'CPU Info: %s cores\n' "$(nproc)"
    report "Memory Info:" free -h
    report "Disk Space:" df -h
    report "Load Average:" uptime
    printf '%s\n' "=========================="
- name: Determine ref to checkout
  id: determine_ref
  env:
    # The caller-supplied ref is handed over as an environment variable so the
    # shell only ever sees it as data. Splicing the expression straight into
    # the script would let a crafted ref run arbitrary commands.
    REQUESTED_REF: ${{ github.event.client_payload.ref }}
  run: |
    # Use the ref from client_payload or default to main.
    # This can be a branch name, tag, commit SHA, or any valid Git ref.
    REF="${REQUESTED_REF:-main}"

    # A step output is written as a single "name=value" line; a value with an
    # embedded newline could smuggle extra outputs, so reject it outright.
    if [[ "$REF" == *$'\n'* ]]; then
      echo "::error title=Ref Selection::Requested ref must not contain newlines"
      exit 1
    fi

    echo "REF=$REF" >> "$GITHUB_OUTPUT"
    echo "::notice title=Ref Selection::Will checkout and run evaluation on ref: $REF"
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    # Ref resolved by the determine_ref step (payload value or "main").
    ref: ${{ steps.determine_ref.outputs.REF }}
    # Later steps install and run code from this caller-chosen ref, and no
    # visible step pushes or fetches again, so do not leave the workflow
    # token behind in the local git configuration.
    persist-credentials: false
- name: Set up Python and uv
  uses: astral-sh/setup-uv@v6
  with:
    # Later steps call `playwright` and `python` directly rather than through
    # `uv run`; presumably this is what puts the project environment on PATH.
    # NOTE(review): confirm against the setup-uv documentation.
    activate-environment: true
    enable-cache: true
- name: Install dependencies
  run: |
    printf '%s\n' "Installing dependencies..."
    # Sync the project environment including the "eval" extra.
    uv sync --extra eval
    printf '%s\n' "Dependencies installed successfully"
- name: Detect installed Playwright version
  id: playwright_version
  run: |
    # Pick the "playwright" entry out of uv's JSON package listing and expose
    # its version as the VERSION step output (used for the browser cache key).
    printf 'VERSION=%s\n' "$(uv pip list --format json | jq --raw-output '.[] | select(.name == "playwright") | .version')" >> "$GITHUB_OUTPUT"
- name: Cache Playwright browsers
  uses: actions/cache@v4
  with:
    # Exact key: runner OS plus the Playwright version detected above.
    key: ${{ runner.os }}-playwright-${{ steps.playwright_version.outputs.VERSION }}
    # Fallback prefix: any Playwright cache for this OS.
    restore-keys: |
      ${{ runner.os }}-playwright-
    path: ~/.cache/ms-playwright
- name: Install Playwright browser dependencies
  run: |
    printf '%s\n' "Installing Playwright browsers..."
    # Only Chromium is installed, with the --no-shell option.
    playwright install --no-shell chromium
    printf '%s\n' "Playwright browsers installed successfully"
- name: Install Xvfb for headed mode
  # The payload may carry headless as the string "false" or as a JSON boolean.
  # Comparing a boolean with the string 'false' coerces both sides to numbers
  # and never matches, yet the command-construction step still sees the text
  # "false" and wraps the command in xvfb-run. The toJSON() arm covers the
  # boolean form so both steps agree on when headed mode is active.
  if: >-
    github.event.client_payload.script_args.headless == 'false' ||
    toJSON(github.event.client_payload.script_args.headless) == 'false'
  run: |
    echo "Installing Xvfb for headed mode..."
    sudo apt-get update
    sudo apt-get install -y xvfb
    echo "Xvfb installed successfully"
- name: Pre-execution Resource Check
  run: |
    # Emit a label followed by the output of the command passed after it.
    section() {
      echo "$1"
      shift
      "$@"
    }

    # Baseline taken right before the evaluation command is built and run.
    echo "=== PRE-EXECUTION RESOURCE CHECK ==="
    section "Memory usage:" free -h
    section "CPU load:" uptime
    section "Disk usage:" df -h
    echo "Process count:"
    ps aux | wc -l
    echo "================================="
- name: Construct eval command
  id: eval_command
  # Every client_payload value reaches the script through an environment
  # variable. Splicing `${{ ... }}` into the script text would let a crafted
  # payload value (quotes, `$(...)`, backticks) execute as shell code; as
  # environment variables the values are only ever treated as data.
  env:
    ARG_MODEL: ${{ github.event.client_payload.script_args.model }}
    ARG_EVAL_MODEL: ${{ github.event.client_payload.script_args.eval_model }}
    ARG_PARALLEL_RUNS: ${{ github.event.client_payload.script_args.parallel_runs }}
    ARG_MAX_STEPS: ${{ github.event.client_payload.script_args.max_steps }}
    ARG_START_INDEX: ${{ github.event.client_payload.script_args.start_index }}
    ARG_END_INDEX: ${{ github.event.client_payload.script_args.end_index }}
    ARG_EVAL_GROUP: ${{ github.event.client_payload.script_args.eval_group }}
    ARG_HEADLESS: ${{ github.event.client_payload.script_args.headless }}
    ARG_MEMORY_INTERVAL: ${{ github.event.client_payload.script_args.memory_interval }}
    ARG_MAX_ACTIONS_PER_STEP: ${{ github.event.client_payload.script_args.max_actions_per_step }}
    ARG_PLANNER_INTERVAL: ${{ github.event.client_payload.script_args.planner_interval }}
    ARG_TEST_CASE: ${{ github.event.client_payload.script_args.test_case }}
    ARG_USER_MESSAGE: ${{ github.event.client_payload.script_args.user_message }}
    ARG_DEVELOPER_ID: ${{ github.event.client_payload.script_args.developer_id }}
    ARG_PLANNER_MODEL: ${{ github.event.client_payload.script_args.planner_model }}
    ARG_RUN_ID: ${{ github.event.client_payload.script_args.run_id }}
    ARG_LAMINAR_EVAL_ID: ${{ github.event.client_payload.script_args.laminar_eval_id }}
    ARG_NO_VISION: ${{ github.event.client_payload.script_args.no_vision }}
    ARG_USE_SERP: ${{ github.event.client_payload.script_args.use_serp }}
    ARG_ENABLE_MEMORY: ${{ github.event.client_payload.script_args.enable_memory }}
    ARG_VALIDATE_OUTPUT: ${{ github.event.client_payload.script_args.validate_output }}
    ARG_INCLUDE_RESULT: ${{ github.event.client_payload.script_args.include_result }}
    ARG_HIGHLIGHT_ELEMENTS: ${{ github.event.client_payload.script_args.highlight_elements }}
    ARG_USE_MIND2WEB_JUDGE: ${{ github.event.client_payload.script_args.use_mind2web_judge }}
  run: |
    # Centralized defaults
    DEFAULT_MODEL="llama-4-maverick"
    DEFAULT_EVAL_MODEL="gpt-4o"
    DEFAULT_PARALLEL_RUNS="2"
    DEFAULT_MAX_STEPS="25"
    DEFAULT_START_INDEX="0"
    DEFAULT_END_INDEX="100"
    DEFAULT_EVAL_GROUP="PRTests"
    DEFAULT_HEADLESS="true"
    DEFAULT_MEMORY_INTERVAL="10"
    DEFAULT_MAX_ACTIONS_PER_STEP="10"
    DEFAULT_PLANNER_INTERVAL="1"
    DEFAULT_TEST_CASE="OnlineMind2Web"

    # Payload value if present and non-empty, otherwise the default
    MODEL="${ARG_MODEL:-$DEFAULT_MODEL}"
    EVAL_MODEL="${ARG_EVAL_MODEL:-$DEFAULT_EVAL_MODEL}"
    PARALLEL_RUNS="${ARG_PARALLEL_RUNS:-$DEFAULT_PARALLEL_RUNS}"
    MAX_STEPS="${ARG_MAX_STEPS:-$DEFAULT_MAX_STEPS}"
    START_INDEX="${ARG_START_INDEX:-$DEFAULT_START_INDEX}"
    END_INDEX="${ARG_END_INDEX:-$DEFAULT_END_INDEX}"
    EVAL_GROUP="${ARG_EVAL_GROUP:-$DEFAULT_EVAL_GROUP}"
    HEADLESS="${ARG_HEADLESS:-$DEFAULT_HEADLESS}"
    MEMORY_INTERVAL="${ARG_MEMORY_INTERVAL:-$DEFAULT_MEMORY_INTERVAL}"
    MAX_ACTIONS_PER_STEP="${ARG_MAX_ACTIONS_PER_STEP:-$DEFAULT_MAX_ACTIONS_PER_STEP}"
    PLANNER_INTERVAL="${ARG_PLANNER_INTERVAL:-$DEFAULT_PLANNER_INTERVAL}"
    TEST_CASE="${ARG_TEST_CASE:-$DEFAULT_TEST_CASE}"

    # Build the command as an array so each argument stays one word
    CMD_ARGS=(
      "python" "eval/service.py"
      "--model" "$MODEL"
      "--eval-model" "$EVAL_MODEL"
      "--parallel-runs" "$PARALLEL_RUNS"
      "--max-steps" "$MAX_STEPS"
      "--start" "$START_INDEX"
      "--end" "$END_INDEX"
      "--eval-group" "$EVAL_GROUP"
      "--memory-interval" "$MEMORY_INTERVAL"
      "--max-actions-per-step" "$MAX_ACTIONS_PER_STEP"
      "--planner-interval" "$PLANNER_INTERVAL"
      "--test-case" "$TEST_CASE"
    )

    # Append FLAG when VALUE equals EXPECTED.
    add_flag_when() {
      local value="$1" expected="$2" flag="$3"
      if [[ "$value" == "$expected" ]]; then
        CMD_ARGS+=("$flag")
      fi
    }

    # Append "OPTION VALUE" when VALUE is non-empty.
    add_option_if_set() {
      local option="$1" value="$2"
      if [[ -n "$value" ]]; then
        CMD_ARGS+=("$option" "$value")
      fi
    }

    # Boolean flags (highlight_elements is the only one keyed on "false")
    add_flag_when "$ARG_NO_VISION" "true" "--no-vision"
    add_flag_when "$HEADLESS" "true" "--headless"
    add_flag_when "$ARG_USE_SERP" "true" "--use-serp"
    add_flag_when "$ARG_ENABLE_MEMORY" "true" "--enable-memory"
    add_flag_when "$ARG_VALIDATE_OUTPUT" "true" "--validate-output"
    add_flag_when "$ARG_INCLUDE_RESULT" "true" "--include-result"
    add_flag_when "$ARG_HIGHLIGHT_ELEMENTS" "false" "--no-highlight-elements"
    add_flag_when "$ARG_USE_MIND2WEB_JUDGE" "true" "--use-mind2web-judge"

    # Optional string parameters (no defaults)
    add_option_if_set "--user-message" "$ARG_USER_MESSAGE"
    add_option_if_set "--developer-id" "$ARG_DEVELOPER_ID"
    add_option_if_set "--planner-model" "$ARG_PLANNER_MODEL"
    add_option_if_set "--run-id" "$ARG_RUN_ID"
    add_option_if_set "--laminar-eval-id" "$ARG_LAMINAR_EVAL_ID"

    # %q shell-escapes every argument so the resulting single-line string can
    # be re-parsed by the shell in the run step without changing its meaning.
    printf -v CMD_STRING '%q ' "${CMD_ARGS[@]}"

    # Headed mode needs a virtual display, so wrap the command in xvfb-run
    if [[ "$HEADLESS" == "false" ]]; then
      CMD_STRING="xvfb-run --auto-servernum --server-args='-screen 0 1280x1024x24' $CMD_STRING"
    fi

    echo "FULL_COMMAND=$CMD_STRING" >> "$GITHUB_OUTPUT"
    echo "::notice title=Eval Command::$CMD_STRING"
- name: Start Resource Monitoring
  run: |
    echo "Starting background resource monitoring..."

    # Write the sampler script. The quoted heredoc delimiter keeps every
    # $(...) below literal, so it is evaluated when the monitor runs rather
    # than while this file is being written.
    cat << 'EOF' > monitor_resources.sh
    #!/bin/bash
    # Print a resource snapshot every 30 seconds until killed.
    while :; do
      echo "=== RESOURCE MONITOR $(date) ==="
      echo "Memory:"
      free -h
      echo "CPU Load:"
      uptime
      echo "Top processes by CPU:"
      ps aux --sort=-%cpu | head -10
      echo "Top processes by Memory:"
      ps aux --sort=-%mem | head -10
      echo "Chrome/Chromium processes:"
      ps aux | grep -i chrome | grep -v grep || echo "No Chrome processes found"
      echo "Python processes:"
      ps aux | grep python | grep -v grep || echo "No Python processes found"
      echo "=================================="
      sleep 30
    done
    EOF
    chmod +x monitor_resources.sh

    # Launch detached, log to resource_monitor.log, and record the PID in
    # monitor_pid.txt so a later step can stop the monitor.
    nohup ./monitor_resources.sh > resource_monitor.log 2>&1 &
    MONITOR_PID=$!
    echo "$MONITOR_PID" > monitor_pid.txt
    echo "Resource monitoring started with PID: $MONITOR_PID"
- name: Run evaluation script
  id: run_eval
  env:
    # Both values are passed as data. The raw payload flag in particular must
    # never be spliced into the script text, where a crafted value would be
    # executed as shell code.
    FULL_COMMAND: ${{ steps.eval_command.outputs.FULL_COMMAND }}
    USE_MIND2WEB_JUDGE: ${{ github.event.client_payload.script_args.use_mind2web_judge }}
  run: |
    echo "=== STARTING EVALUATION ==="
    echo "Command: $FULL_COMMAND"
    echo "Starting time: $(date)"
    echo "Use Mind2Web Judge: $USE_MIND2WEB_JUDGE"
    echo "============================"

    # On interruption, show the tail of whatever was logged so far. The log
    # may not exist yet if the signal arrives early, so tolerate that.
    trap 'echo "=== EVALUATION INTERRUPTED ==="; echo "Time: $(date)"; echo "Last 50 lines of output:"; tail -50 eval_output.log 2>/dev/null || echo "No evaluation output log found"; exit 130' INT TERM

    # Run with errexit off so a failing evaluation still reaches the reporting
    # below. FULL_COMMAND was built with printf %q in the eval_command step,
    # so `eval` re-parses it into exactly the intended argument list.
    set +e
    eval "$FULL_COMMAND" 2>&1 | tee eval_output.log
    # PIPESTATUS[0] is the evaluation's own exit status, not tee's.
    EVAL_EXIT_CODE=${PIPESTATUS[0]}
    set -e

    echo "=== EVALUATION COMPLETED ==="
    echo "Exit code: $EVAL_EXIT_CODE"
    echo "Completion time: $(date)"
    echo "============================"

    # Show last part of log for context
    if [ "$EVAL_EXIT_CODE" -ne 0 ]; then
      echo "=== EVALUATION FAILED - LAST 100 LINES OF OUTPUT ==="
      tail -100 eval_output.log
      echo "=================================================="
    fi

    # Propagate the evaluation's result as the step result.
    exit "$EVAL_EXIT_CODE"
- name: Post-execution Resource Check
  if: always()
  run: |
    # Print the `ps aux` lines matching the given grep arguments (the grep
    # processes themselves excluded), or the fallback text when none match.
    list_procs() {
      local fallback="$1"
      shift
      ps aux | grep "$@" | grep -v grep || echo "$fallback"
    }

    # Same probes as the pre-execution check, plus leftover browser and
    # Python processes. Runs even when the evaluation step failed.
    echo "=== POST-EXECUTION RESOURCE CHECK ==="
    echo "Memory usage:"; free -h
    echo "CPU load:"; uptime
    echo "Disk usage:"; df -h
    echo "Process count:"; ps aux | wc -l
    echo "Chrome/Chromium processes still running:"
    list_procs "No Chrome processes found" -i chrome
    echo "Python processes still running:"
    list_procs "No Python processes found" python
    echo "==================================="
- name: Stop Resource Monitoring and Collect Logs
  if: always()
  run: |
    echo "Stopping resource monitoring..."
    # monitor_pid.txt is written by the step that starts the monitor; if that
    # step never ran there is nothing to stop.
    if [[ -f monitor_pid.txt ]]; then
      monitor_pid="$(< monitor_pid.txt)"
      # `kill -0` only probes whether the process still exists.
      if ! kill -0 "$monitor_pid" 2>/dev/null; then
        echo "Resource monitor was already stopped"
      else
        kill "$monitor_pid"
        echo "Resource monitor stopped"
      fi
    fi

    # Surface the most recent samples in the job log.
    echo "=== RESOURCE MONITORING LOG ==="
    if [[ ! -f resource_monitor.log ]]; then
      echo "No resource monitor log found"
    else
      tail -100 resource_monitor.log
    fi
    echo "==============================="
- name: Collect Debug Information
  if: always()
  run: |
    # Print captured text, or the fallback message when nothing was captured.
    # `find` and `... | tail` exit 0 even with no matches, so a plain
    # `cmd || echo fallback` never reaches the fallback; test the output.
    print_or_fallback() {
      local captured="$1" fallback="$2"
      if [[ -n "$captured" ]]; then
        printf '%s\n' "$captured"
      else
        echo "$fallback"
      fi
    }

    echo "=== COLLECTING DEBUG INFORMATION ==="

    # System information
    echo "Final system state:"
    uptime
    free -h
    df -h

    # Process information
    echo "All running processes:"
    ps aux --sort=-%cpu | head -20

    # Check for core dumps
    echo "Checking for core dumps:"
    core_files="$(find . -name "core*" -type f 2>/dev/null || true)"
    print_or_fallback "$core_files" "No core dumps found"

    # Check for any crash logs
    echo "Checking for crash logs:"
    crash_files="$(find . -name "*crash*" -type f 2>/dev/null || true)"
    print_or_fallback "$crash_files" "No crash logs found"

    # Check kernel messages for OOM kills
    echo "Checking for OOM kills in kernel messages:"
    oom_lines="$(sudo dmesg | grep -i "killed process" | tail -10 || true)"
    print_or_fallback "$oom_lines" "No OOM kills found"

    # Check evaluation output
    echo "Last 100 lines of evaluation output:"
    if [ -f eval_output.log ]; then
      tail -100 eval_output.log
    else
      echo "No evaluation output log found"
    fi

    # Check for saved trajectories
    echo "Saved trajectories directory:"
    if [ -d saved_trajectories ]; then
      find saved_trajectories -type f -name "*.json" | head -10
      echo "Total trajectory files: $(find saved_trajectories -type f -name "*.json" | wc -l)"
    else
      echo "No saved_trajectories directory found"
    fi

    echo "===================================="
- name: Upload Debug Artifacts
  if: always()
  uses: actions/upload-artifact@v4
  with:
    # The run id in the name keeps artifacts from different runs apart.
    name: debug-logs-${{ github.run_id }}
    retention-days: 7
    # Evaluation log, resource monitor log, and any saved trajectories.
    path: |
      eval_output.log
      resource_monitor.log
      saved_trajectories/
- name: Final Status Summary
  if: always()
  env:
    # Outcome of the step with id run_eval.
    EVAL_OUTCOME: ${{ steps.run_eval.outcome }}
  run: |
    echo "=== FINAL STATUS SUMMARY ==="
    echo "Workflow run ID: ${{ github.run_id }}"
    echo "Job completion time: $(date)"
    echo "Evaluation step status: $EVAL_OUTCOME"
    # Anything other than "success" is reported as a failure.
    case "$EVAL_OUTCOME" in
      success)
        echo "✅ Evaluation completed successfully"
        ;;
      *)
        echo "❌ Evaluation failed or was interrupted"
        echo "Check the debug artifacts and logs above for more information"
        ;;
    esac
    echo "==========================="