browser-use/.github/workflows/eval.yaml

name: Run Evaluation Script

on:
  repository_dispatch:
    types: [run-eval]
  workflow_dispatch:

jobs:
  run_evaluation:
    runs-on: ubuntu-latest
    timeout-minutes: 360
    env:
      IN_DOCKER: 'true'
      ANONYMIZED_TELEMETRY: 'false'
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
      DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
      EVALUATION_TOOL_URL: ${{ secrets.EVALUATION_TOOL_URL }}
      EVALUATION_TOOL_SECRET_KEY: ${{ secrets.EVALUATION_TOOL_SECRET_KEY }}
      ANCHOR_BROWSER_API_KEY: ${{ secrets.ANCHOR_BROWSER_API_KEY }}
      SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
      LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_PROJECT_API_KEY }}

    steps:
      - name: System Info and Resource Check
        run: |
          echo "=== SYSTEM INFORMATION ==="
          echo "Runner OS: $(uname -a)"
          echo "CPU Info: $(nproc) cores"
          echo "Memory Info:"
          free -h
          echo "Disk Space:"
          df -h
          echo "Load Average:"
          uptime
          echo "=========================="

      - name: Determine ref to checkout
        id: determine_ref
        run: |
          # Use the ref from client_payload or default to main
          # This can be a branch name, tag, commit SHA, or any valid Git ref
          REF="${{ github.event.client_payload.ref }}"
          REF="${REF:-main}"
          echo "REF=$REF" >> $GITHUB_OUTPUT
          echo "::notice title=Ref Selection::Will checkout and run evaluation on ref: $REF"

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ steps.determine_ref.outputs.REF }}

      - name: Set up Python and uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
          activate-environment: true

      - name: Install dependencies
        run: |
          echo "Installing dependencies..."
          uv sync --extra eval
          echo "Dependencies installed successfully"

      - name: Detect installed Playwright version
        id: playwright_version
        run: echo "VERSION=$(uv pip list --format json | jq -r '.[] | select(.name == "playwright") | .version')" >> $GITHUB_OUTPUT

      - name: Cache Playwright browsers
        uses: actions/cache@v4
        with:
          path: ~/.cache/ms-playwright
          key: ${{ runner.os }}-playwright-${{ steps.playwright_version.outputs.VERSION }}
          restore-keys: |
            ${{ runner.os }}-playwright-

      - name: Install Playwright browser dependencies
        run: |
          echo "Installing Playwright browsers..."
          playwright install --no-shell chromium
          echo "Playwright browsers installed successfully"

      - name: Install Xvfb for headed mode
        if: github.event.client_payload.script_args.headless == 'false'
        run: |
          echo "Installing Xvfb for headed mode..."
          sudo apt-get update
          sudo apt-get install -y xvfb
          echo "Xvfb installed successfully"

      - name: Pre-execution Resource Check
        run: |
          echo "=== PRE-EXECUTION RESOURCE CHECK ==="
          echo "Memory usage:"
          free -h
          echo "CPU load:"
          uptime
          echo "Disk usage:"
          df -h
          echo "Process count:"
          ps aux | wc -l
          echo "================================="

      - name: Construct eval command
        id: eval_command
        run: |
          # Centralized defaults
          DEFAULT_MODEL="llama-4-maverick"
          DEFAULT_EVAL_MODEL="gpt-4o"
          DEFAULT_PARALLEL_RUNS="2"
          DEFAULT_MAX_STEPS="25"
          DEFAULT_START_INDEX="0"
          DEFAULT_END_INDEX="100"
          DEFAULT_EVAL_GROUP="PRTests"
          DEFAULT_HEADLESS="true"

          DEFAULT_MEMORY_INTERVAL="10"
          DEFAULT_MAX_ACTIONS_PER_STEP="10"
          DEFAULT_PLANNER_INTERVAL="1"
          DEFAULT_TEST_CASE="OnlineMind2Web"

          # Extract and apply defaults using parameter expansion
          MODEL="${{ github.event.client_payload.script_args.model }}"
          MODEL="${MODEL:-$DEFAULT_MODEL}"

          EVAL_MODEL="${{ github.event.client_payload.script_args.eval_model }}"
          EVAL_MODEL="${EVAL_MODEL:-$DEFAULT_EVAL_MODEL}"

          PARALLEL_RUNS="${{ github.event.client_payload.script_args.parallel_runs }}"
          PARALLEL_RUNS="${PARALLEL_RUNS:-$DEFAULT_PARALLEL_RUNS}"

          MAX_STEPS="${{ github.event.client_payload.script_args.max_steps }}"
          MAX_STEPS="${MAX_STEPS:-$DEFAULT_MAX_STEPS}"

          START_INDEX="${{ github.event.client_payload.script_args.start_index }}"
          START_INDEX="${START_INDEX:-$DEFAULT_START_INDEX}"

          END_INDEX="${{ github.event.client_payload.script_args.end_index }}"
          END_INDEX="${END_INDEX:-$DEFAULT_END_INDEX}"

          EVAL_GROUP="${{ github.event.client_payload.script_args.eval_group }}"
          EVAL_GROUP="${EVAL_GROUP:-$DEFAULT_EVAL_GROUP}"

          HEADLESS="${{ github.event.client_payload.script_args.headless }}"
          HEADLESS="${HEADLESS:-$DEFAULT_HEADLESS}"


          MEMORY_INTERVAL="${{ github.event.client_payload.script_args.memory_interval }}"
          MEMORY_INTERVAL="${MEMORY_INTERVAL:-$DEFAULT_MEMORY_INTERVAL}"

          MAX_ACTIONS_PER_STEP="${{ github.event.client_payload.script_args.max_actions_per_step }}"
          MAX_ACTIONS_PER_STEP="${MAX_ACTIONS_PER_STEP:-$DEFAULT_MAX_ACTIONS_PER_STEP}"

          PLANNER_INTERVAL="${{ github.event.client_payload.script_args.planner_interval }}"
          PLANNER_INTERVAL="${PLANNER_INTERVAL:-$DEFAULT_PLANNER_INTERVAL}"

          TEST_CASE="${{ github.event.client_payload.script_args.test_case }}"
          TEST_CASE="${TEST_CASE:-$DEFAULT_TEST_CASE}"

          # Optional parameters (no defaults)
          USER_MESSAGE="${{ github.event.client_payload.script_args.user_message }}"
          DEVELOPER_ID="${{ github.event.client_payload.script_args.developer_id }}"
          PLANNER_MODEL="${{ github.event.client_payload.script_args.planner_model }}"
          RUN_ID="${{ github.event.client_payload.script_args.run_id }}"
          LAMINAR_EVAL_ID="${{ github.event.client_payload.script_args.laminar_eval_id }}"

          # Build command using array for cleaner construction
          CMD_ARGS=(
            "python" "eval/service.py"
            "--model" "$MODEL"
            "--eval-model" "$EVAL_MODEL"
            "--parallel-runs" "$PARALLEL_RUNS"
            "--max-steps" "$MAX_STEPS"
            "--start" "$START_INDEX"
            "--end" "$END_INDEX"

            "--eval-group" "$EVAL_GROUP"
            "--memory-interval" "$MEMORY_INTERVAL"
            "--max-actions-per-step" "$MAX_ACTIONS_PER_STEP"
            "--planner-interval" "$PLANNER_INTERVAL"
            "--test-case" "$TEST_CASE"
          )

          # Add boolean flags conditionally
          [[ "${{ github.event.client_payload.script_args.no_vision }}" == "true" ]] && CMD_ARGS+=("--no-vision")
          [[ "$HEADLESS" == "true" ]] && CMD_ARGS+=("--headless")
          [[ "${{ github.event.client_payload.script_args.use_serp }}" == "true" ]] && CMD_ARGS+=("--use-serp")
          [[ "${{ github.event.client_payload.script_args.enable_memory }}" == "true" ]] && CMD_ARGS+=("--enable-memory")
          [[ "${{ github.event.client_payload.script_args.validate_output }}" == "true" ]] && CMD_ARGS+=("--validate-output")
          [[ "${{ github.event.client_payload.script_args.include_result }}" == "true" ]] && CMD_ARGS+=("--include-result")
          [[ "${{ github.event.client_payload.script_args.highlight_elements }}" == "false" ]] && CMD_ARGS+=("--no-highlight-elements")
          [[ "${{ github.event.client_payload.script_args.use_mind2web_judge }}" == "true" ]] && CMD_ARGS+=("--use-mind2web-judge")

          # Add optional string parameters
          [[ -n "$USER_MESSAGE" ]] && CMD_ARGS+=("--user-message" "$USER_MESSAGE")
          [[ -n "$DEVELOPER_ID" ]] && CMD_ARGS+=("--developer-id" "$DEVELOPER_ID")
          [[ -n "$PLANNER_MODEL" ]] && CMD_ARGS+=("--planner-model" "$PLANNER_MODEL")
          [[ -n "$RUN_ID" ]] && CMD_ARGS+=("--run-id" "$RUN_ID")
          [[ -n "$LAMINAR_EVAL_ID" ]] && CMD_ARGS+=("--laminar-eval-id" "$LAMINAR_EVAL_ID")

          # Convert array to command string with proper escaping
          printf -v CMD_STRING '%q ' "${CMD_ARGS[@]}"

          # Add xvfb wrapper if needed
          if [[ "$HEADLESS" == "false" ]]; then
            CMD_STRING="xvfb-run --auto-servernum --server-args='-screen 0 1280x1024x24' $CMD_STRING"
          fi

          echo "FULL_COMMAND=$CMD_STRING" >> $GITHUB_OUTPUT
          echo "::notice title=Eval Command::$CMD_STRING"

      - name: Start Resource Monitoring
        run: |
          echo "Starting background resource monitoring..."
          # Create a background script that monitors resources every 30 seconds
          cat > monitor_resources.sh << 'EOF'
          #!/bin/bash
          while true; do
            echo "=== RESOURCE MONITOR $(date) ==="
            echo "Memory:"
            free -h
            echo "CPU Load:"
            uptime
            echo "Top processes by CPU:"
            ps aux --sort=-%cpu | head -10
            echo "Top processes by Memory:"
            ps aux --sort=-%mem | head -10
            echo "Chrome/Chromium processes:"
            ps aux | grep -i chrome | grep -v grep || echo "No Chrome processes found"
            echo "Python processes:"
            ps aux | grep python | grep -v grep || echo "No Python processes found"
            echo "=================================="
            sleep 30
          done
          EOF
          chmod +x monitor_resources.sh
          # Start the monitor in background and save PID
          nohup ./monitor_resources.sh > resource_monitor.log 2>&1 &
          echo $! > monitor_pid.txt
          echo "Resource monitoring started with PID: $(cat monitor_pid.txt)"

      - name: Run evaluation script
        id: run_eval
        run: |
          echo "=== STARTING EVALUATION ==="
          echo "Command: ${{ steps.eval_command.outputs.FULL_COMMAND }}"
          echo "Starting time: $(date)"
          echo "Use Mind2Web Judge: ${{ github.event.client_payload.script_args.use_mind2web_judge }}"
          echo "============================"

          # Set up signal handlers and run the command
          set -e
          trap 'echo "=== EVALUATION INTERRUPTED ==="; echo "Time: $(date)"; echo "Last 50 lines of output:"; tail -50 eval_output.log; exit 130' INT TERM

          # Run the evaluation with output capture and better error handling
          set +e  # Don't exit on errors, we want to capture them
          ${{ steps.eval_command.outputs.FULL_COMMAND }} 2>&1 | tee eval_output.log
          EVAL_EXIT_CODE=${PIPESTATUS[0]}
          set -e  # Re-enable exit on error

          echo "=== EVALUATION COMPLETED ==="
          echo "Exit code: $EVAL_EXIT_CODE"
          echo "Completion time: $(date)"
          echo "============================"

          # Show last part of log for context
          if [ $EVAL_EXIT_CODE -ne 0 ]; then
            echo "=== EVALUATION FAILED - LAST 100 LINES OF OUTPUT ==="
            tail -100 eval_output.log
            echo "=================================================="
          fi

          exit $EVAL_EXIT_CODE

      - name: Post-execution Resource Check
        if: always()
        run: |
          echo "=== POST-EXECUTION RESOURCE CHECK ==="
          echo "Memory usage:"
          free -h
          echo "CPU load:"
          uptime
          echo "Disk usage:"
          df -h
          echo "Process count:"
          ps aux | wc -l
          echo "Chrome/Chromium processes still running:"
          ps aux | grep -i chrome | grep -v grep || echo "No Chrome processes found"
          echo "Python processes still running:"
          ps aux | grep python | grep -v grep || echo "No Python processes found"
          echo "==================================="

      - name: Stop Resource Monitoring and Collect Logs
        if: always()
        run: |
          echo "Stopping resource monitoring..."
          if [ -f monitor_pid.txt ]; then
            MONITOR_PID=$(cat monitor_pid.txt)
            if kill -0 $MONITOR_PID 2>/dev/null; then
              kill $MONITOR_PID
              echo "Resource monitor stopped"
            else
              echo "Resource monitor was already stopped"
            fi
          fi

          echo "=== RESOURCE MONITORING LOG ==="
          if [ -f resource_monitor.log ]; then
            tail -100 resource_monitor.log
          else
            echo "No resource monitor log found"
          fi
          echo "==============================="

      - name: Collect Debug Information
        if: always()
        run: |
          echo "=== COLLECTING DEBUG INFORMATION ==="

          # System information
          echo "Final system state:"
          uptime
          free -h
          df -h

          # Process information
          echo "All running processes:"
          ps aux --sort=-%cpu | head -20

          # Check for core dumps
          echo "Checking for core dumps:"
          find . -name "core*" -type f 2>/dev/null || echo "No core dumps found"

          # Check for any crash logs
          echo "Checking for crash logs:"
          find . -name "*crash*" -type f 2>/dev/null || echo "No crash logs found"

          # Check kernel messages for OOM kills
          echo "Checking for OOM kills in kernel messages:"
          sudo dmesg | grep -i "killed process" | tail -10 || echo "No OOM kills found"

          # Check evaluation output
          echo "Last 100 lines of evaluation output:"
          if [ -f eval_output.log ]; then
            tail -100 eval_output.log
          else
            echo "No evaluation output log found"
          fi

          # Check for saved trajectories
          echo "Saved trajectories directory:"
          if [ -d saved_trajectories ]; then
            find saved_trajectories -type f -name "*.json" | head -10
            echo "Total trajectory files: $(find saved_trajectories -type f -name "*.json" | wc -l)"
          else
            echo "No saved_trajectories directory found"
          fi

          echo "===================================="

      - name: Upload Debug Artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: debug-logs-${{ github.run_id }}
          path: |
            eval_output.log
            resource_monitor.log
            saved_trajectories/
          retention-days: 7

      - name: Final Status Summary
        if: always()
        run: |
          echo "=== FINAL STATUS SUMMARY ==="
          echo "Workflow run ID: ${{ github.run_id }}"
          echo "Job completion time: $(date)"
          echo "Evaluation step status: ${{ steps.run_eval.outcome }}"

          if [ "${{ steps.run_eval.outcome }}" != "success" ]; then
            echo "❌ Evaluation failed or was interrupted"
            echo "Check the debug artifacts and logs above for more information"
          else
            echo "✅ Evaluation completed successfully"
          fi

          echo "==========================="