# mirror of https://github.com/browser-use/browser-use
# synced 2026-05-06 17:52:15 +02:00
name: Run Evaluation Script

# Triggered remotely via repository_dispatch (type "run-eval") with parameters
# in client_payload.script_args, or manually via workflow_dispatch.
on:
  repository_dispatch:
    types: [run-eval]
  workflow_dispatch:

jobs:
  run_evaluation:
    # Self-hosted runner pool dedicated to evaluation workloads.
    runs-on:
      group: eval
      labels: eval-2-core-500
    timeout-minutes: 60
    env:
      IN_DOCKER: 'true'
      ANONYMIZED_TELEMETRY: 'false'
      # LLM provider credentials (eval/service.py picks the one matching --model)
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
      DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
      # Progress-tracking / result-collection service
      EVALUATION_TOOL_URL: ${{ secrets.EVALUATION_TOOL_URL }}
      EVALUATION_TOOL_SECRET_KEY: ${{ secrets.EVALUATION_TOOL_SECRET_KEY }}
      # Remote browser providers
      ANCHOR_BROWSER_API_KEY: ${{ secrets.ANCHOR_BROWSER_API_KEY }}
      BRIGHTDATA_CDP_URL: ${{ secrets.BRIGHTDATA_CDP_URL }}
      HYPERBROWSER_API_KEY: ${{ secrets.HYPERBROWSER_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
      LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_PROJECT_API_KEY }}
      # Payload value wins, then repo secret, then 'info'.
      BROWSER_USE_LOGGING_LEVEL: ${{ github.event.client_payload.script_args.browser_logging_level || secrets.BROWSER_USE_LOGGING_LEVEL || 'info' }}
      EVAL_START_INDEX: ${{ github.event.client_payload.script_args.start_index }}
      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
steps:
|
|
- name: System Info and Resource Check
|
|
run: |
|
|
echo "=== SYSTEM INFORMATION ==="
|
|
echo "Runner OS: $(uname -a)"
|
|
echo "CPU Info: $(nproc) cores"
|
|
echo "Memory Info:"
|
|
free -h
|
|
echo "Disk Space:"
|
|
df -h
|
|
echo "Load Average:"
|
|
uptime
|
|
echo "=========================="
|
|
|
|
- name: Determine ref to checkout
|
|
id: determine_ref
|
|
run: |
|
|
# Use the ref from client_payload or default to main
|
|
# This can be a branch name, tag, commit SHA, or any valid Git ref
|
|
REF="${{ github.event.client_payload.ref }}"
|
|
REF="${REF:-main}"
|
|
echo "REF=$REF" >> $GITHUB_OUTPUT
|
|
echo "::notice title=Ref Selection::Will checkout and run evaluation on ref: $REF"
|
|
|
|
- name: Checkout repository
|
|
uses: actions/checkout@v4
|
|
with:
|
|
ref: ${{ steps.determine_ref.outputs.REF }}
|
|
|
|
- name: Set up Python and uv
|
|
uses: astral-sh/setup-uv@v6
|
|
with:
|
|
enable-cache: true
|
|
activate-environment: true
|
|
|
|
- name: Install dependencies
|
|
run: |
|
|
echo "Installing dependencies..."
|
|
uv sync --extra eval
|
|
echo "Dependencies installed successfully"
|
|
|
|
- name: Detect installed Playwright version
|
|
id: playwright_version
|
|
run: echo "VERSION=$(uv pip list --format json | jq -r '.[] | select(.name == "playwright") | .version')" >> $GITHUB_OUTPUT
|
|
|
|
- name: Cache Playwright browsers
|
|
uses: actions/cache@v4
|
|
with:
|
|
path: ~/.cache/ms-playwright
|
|
key: ${{ runner.os }}-playwright-${{ steps.playwright_version.outputs.VERSION }}
|
|
restore-keys: |
|
|
${{ runner.os }}-playwright-
|
|
|
|
- name: Install Playwright browser dependencies
|
|
run: |
|
|
echo "Installing Playwright browsers..."
|
|
playwright install --no-shell chromium
|
|
echo "Playwright browsers installed successfully"
|
|
|
|
- name: Install Xvfb for headed mode
|
|
if: github.event.client_payload.script_args.headless == 'false'
|
|
run: |
|
|
echo "Installing Xvfb for headed mode..."
|
|
sudo apt-get update
|
|
sudo apt-get install -y xvfb
|
|
echo "Xvfb installed successfully"
|
|
|
|
- name: Pre-execution Resource Check
|
|
run: |
|
|
echo "=== PRE-EXECUTION RESOURCE CHECK ==="
|
|
echo "Memory usage:"
|
|
free -h
|
|
echo "CPU load:"
|
|
uptime
|
|
echo "Disk usage:"
|
|
df -h
|
|
echo "Process count:"
|
|
ps aux | wc -l
|
|
echo "================================="
|
|
|
|
- name: Construct GitHub Workflow URL
|
|
id: github_url
|
|
run: |
|
|
GITHUB_WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
|
echo "GITHUB_WORKFLOW_URL=$GITHUB_WORKFLOW_URL" >> $GITHUB_OUTPUT
|
|
echo "::notice title=Workflow URL::Workflow URL: $GITHUB_WORKFLOW_URL"
|
|
|
|
- name: Register Runner Progress
|
|
id: register_runner
|
|
run: |
|
|
echo "Registering runner progress..."
|
|
|
|
# Extract parameters for runner identification with validation
|
|
RUN_ID="${{ github.event.client_payload.script_args.run_id }}"
|
|
GITHUB_RUN_ID="${{ github.run_id }}"
|
|
START_INDEX="${{ github.event.client_payload.script_args.start_index }}"
|
|
END_INDEX="${{ github.event.client_payload.script_args.end_index }}"
|
|
TOTAL_TASKS="${{ github.event.client_payload.script_args.total_tasks }}"
|
|
GITHUB_WORKFLOW_URL="${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}"
|
|
|
|
# Validate and set defaults for numeric variables to prevent arithmetic errors
|
|
START_INDEX="${START_INDEX:-0}"
|
|
TOTAL_TASKS="${TOTAL_TASKS:-100}"
|
|
|
|
# Validate START_INDEX is numeric
|
|
if ! [[ "$START_INDEX" =~ ^[0-9]+$ ]]; then
|
|
echo "⚠️ Warning: START_INDEX '$START_INDEX' is not numeric, defaulting to 0"
|
|
START_INDEX=0
|
|
fi
|
|
|
|
# Validate TOTAL_TASKS is numeric
|
|
if ! [[ "$TOTAL_TASKS" =~ ^[0-9]+$ ]]; then
|
|
echo "⚠️ Warning: TOTAL_TASKS '$TOTAL_TASKS' is not numeric, defaulting to 100"
|
|
TOTAL_TASKS=100
|
|
fi
|
|
|
|
# Calculate task range for this runner with safe arithmetic
|
|
if [ -n "$END_INDEX" ] && [ "$END_INDEX" != "null" ] && [[ "$END_INDEX" =~ ^[0-9]+$ ]]; then
|
|
TASK_RANGE="${START_INDEX}-${END_INDEX}"
|
|
TOTAL_ASSIGNED_TASKS=$((END_INDEX - START_INDEX))
|
|
else
|
|
TASK_RANGE="${START_INDEX}+"
|
|
TOTAL_ASSIGNED_TASKS=$((TOTAL_TASKS - START_INDEX))
|
|
fi
|
|
|
|
# Ensure TOTAL_ASSIGNED_TASKS is positive
|
|
if [ "$TOTAL_ASSIGNED_TASKS" -le 0 ]; then
|
|
echo "⚠️ Warning: Calculated TOTAL_ASSIGNED_TASKS is $TOTAL_ASSIGNED_TASKS, setting to 1"
|
|
TOTAL_ASSIGNED_TASKS=1
|
|
fi
|
|
|
|
# Generate consistent runner ID (matches Python service pattern)
|
|
RUNNER_ID="github_run_${GITHUB_RUN_ID}_batch_${START_INDEX}"
|
|
|
|
echo "Runner ID: $RUNNER_ID"
|
|
echo "Task Range: $TASK_RANGE"
|
|
echo "Total Assigned Tasks: $TOTAL_ASSIGNED_TASKS"
|
|
echo "Start Index: $START_INDEX (validated)"
|
|
echo "End Index: $END_INDEX"
|
|
echo "Total Tasks: $TOTAL_TASKS (validated)"
|
|
|
|
# Send registration to progress tracking API
|
|
curl -X POST "${{ secrets.EVALUATION_TOOL_URL }}/api/saveRunnerProgress" \
|
|
-H "Content-Type: application/json" \
|
|
-H "Authorization: Bearer ${{ secrets.EVALUATION_TOOL_SECRET_KEY }}" \
|
|
-d "{
|
|
\"runId\": \"${RUN_ID}\",
|
|
\"runnerId\": \"${RUNNER_ID}\",
|
|
\"githubWorkflowRunId\": \"${GITHUB_RUN_ID}\",
|
|
\"githubWorkflowUrl\": \"${GITHUB_WORKFLOW_URL}\",
|
|
\"taskId\": \"batch_${START_INDEX}\",
|
|
\"currentStage\": \"starting\",
|
|
\"status\": \"active\",
|
|
\"assignedTaskRange\": \"${TASK_RANGE}\",
|
|
\"totalAssignedTasks\": ${TOTAL_ASSIGNED_TASKS}
|
|
}" \
|
|
--max-time 10 \
|
|
--retry 2 || echo "⚠️ Runner registration failed, but continuing evaluation"
|
|
|
|
- name: Construct eval command
|
|
id: eval_command
|
|
run: |
|
|
# Centralized defaults
|
|
DEFAULT_MODEL="llama-4-maverick"
|
|
DEFAULT_EVAL_MODEL="gpt-4o"
|
|
DEFAULT_PARALLEL_RUNS="2"
|
|
DEFAULT_MAX_STEPS="25"
|
|
DEFAULT_START_INDEX="0"
|
|
DEFAULT_END_INDEX="100"
|
|
DEFAULT_EVAL_GROUP="PRTests"
|
|
DEFAULT_HEADLESS="true"
|
|
DEFAULT_THINKING="true"
|
|
|
|
DEFAULT_MEMORY_INTERVAL="10"
|
|
DEFAULT_MAX_ACTIONS_PER_STEP="10"
|
|
DEFAULT_PLANNER_INTERVAL="1"
|
|
DEFAULT_JUDGE_REPEAT_COUNT="1"
|
|
DEFAULT_TEST_CASE="OnlineMind2Web"
|
|
|
|
# Extract and apply defaults using parameter expansion
|
|
MODEL="${{ github.event.client_payload.script_args.model }}"
|
|
MODEL="${MODEL:-$DEFAULT_MODEL}"
|
|
|
|
EVAL_MODEL="${{ github.event.client_payload.script_args.eval_model }}"
|
|
EVAL_MODEL="${EVAL_MODEL:-$DEFAULT_EVAL_MODEL}"
|
|
|
|
PARALLEL_RUNS="${{ github.event.client_payload.script_args.parallel_runs }}"
|
|
PARALLEL_RUNS="${PARALLEL_RUNS:-$DEFAULT_PARALLEL_RUNS}"
|
|
|
|
MAX_STEPS="${{ github.event.client_payload.script_args.max_steps }}"
|
|
MAX_STEPS="${MAX_STEPS:-$DEFAULT_MAX_STEPS}"
|
|
|
|
START_INDEX="${{ github.event.client_payload.script_args.start_index }}"
|
|
START_INDEX="${START_INDEX:-$DEFAULT_START_INDEX}"
|
|
|
|
END_INDEX="${{ github.event.client_payload.script_args.end_index }}"
|
|
END_INDEX="${END_INDEX:-$DEFAULT_END_INDEX}"
|
|
|
|
EVAL_GROUP="${{ github.event.client_payload.script_args.eval_group }}"
|
|
EVAL_GROUP="${EVAL_GROUP:-$DEFAULT_EVAL_GROUP}"
|
|
|
|
HEADLESS="${{ github.event.client_payload.script_args.headless }}"
|
|
HEADLESS="${HEADLESS:-$DEFAULT_HEADLESS}"
|
|
|
|
THINKING="${{ github.event.client_payload.script_args.thinking }}"
|
|
THINKING="${THINKING:-$DEFAULT_THINKING}"
|
|
|
|
MEMORY_INTERVAL="${{ github.event.client_payload.script_args.memory_interval }}"
|
|
MEMORY_INTERVAL="${MEMORY_INTERVAL:-$DEFAULT_MEMORY_INTERVAL}"
|
|
|
|
MAX_ACTIONS_PER_STEP="${{ github.event.client_payload.script_args.max_actions_per_step }}"
|
|
MAX_ACTIONS_PER_STEP="${MAX_ACTIONS_PER_STEP:-$DEFAULT_MAX_ACTIONS_PER_STEP}"
|
|
|
|
PLANNER_INTERVAL="${{ github.event.client_payload.script_args.planner_interval }}"
|
|
PLANNER_INTERVAL="${PLANNER_INTERVAL:-$DEFAULT_PLANNER_INTERVAL}"
|
|
|
|
JUDGE_REPEAT_COUNT="${{ github.event.client_payload.script_args.judge_repeat_count }}"
|
|
JUDGE_REPEAT_COUNT="${JUDGE_REPEAT_COUNT:-$DEFAULT_JUDGE_REPEAT_COUNT}"
|
|
|
|
TEST_CASE="${{ github.event.client_payload.script_args.test_case }}"
|
|
TEST_CASE="${TEST_CASE:-$DEFAULT_TEST_CASE}"
|
|
|
|
# Optional parameters (no defaults)
|
|
USER_MESSAGE="${{ github.event.client_payload.script_args.user_message }}"
|
|
DEVELOPER_ID="${{ github.event.client_payload.script_args.developer_id }}"
|
|
PLANNER_MODEL="${{ github.event.client_payload.script_args.planner_model }}"
|
|
BROWSER="${{ github.event.client_payload.script_args.browser }}"
|
|
IMAGES_PER_STEP="${{ github.event.client_payload.script_args.images_per_step }}"
|
|
RUN_ID="${{ github.event.client_payload.script_args.run_id }}"
|
|
LAMINAR_EVAL_ID="${{ github.event.client_payload.script_args.laminar_eval_id }}"
|
|
# Pass raw GitHub Actions object to Python - no parsing in bash
|
|
GMAIL_2FA_TOKENS_RAW='${{ github.event.client_payload.script_args.gmail_2fa_tokens }}'
|
|
|
|
# Browser timeout and stealth parameters
|
|
DEFAULT_NAVIGATION_TIMEOUT="${{ github.event.client_payload.script_args.default_navigation_timeout }}"
|
|
DEFAULT_TIMEOUT="${{ github.event.client_payload.script_args.default_timeout }}"
|
|
MINIMUM_WAIT_PAGE_LOAD_TIME="${{ github.event.client_payload.script_args.minimum_wait_page_load_time }}"
|
|
WAIT_FOR_NETWORK_IDLE_PAGE_LOAD_TIME="${{ github.event.client_payload.script_args.wait_for_network_idle_page_load_time }}"
|
|
MAXIMUM_WAIT_PAGE_LOAD_TIME="${{ github.event.client_payload.script_args.maximum_wait_page_load_time }}"
|
|
WAIT_BETWEEN_ACTIONS="${{ github.event.client_payload.script_args.wait_between_actions }}"
|
|
STEALTH="${{ github.event.client_payload.script_args.stealth }}"
|
|
BLOCK_IMAGES="${{ github.event.client_payload.script_args.block_images }}"
|
|
BLOCK_CSS="${{ github.event.client_payload.script_args.block_css }}"
|
|
|
|
# Single task mode parameters
|
|
TASK_ID="${{ github.event.client_payload.script_args.task_id }}"
|
|
TASK_TEXT="${{ github.event.client_payload.script_args.task_text }}"
|
|
TASK_WEBSITE="${{ github.event.client_payload.script_args.task_website }}"
|
|
|
|
# Build command using array for cleaner construction
|
|
CMD_ARGS=(
|
|
"python" "eval/service.py"
|
|
"--model" "$MODEL"
|
|
"--eval-model" "$EVAL_MODEL"
|
|
"--parallel-runs" "$PARALLEL_RUNS"
|
|
"--max-steps" "$MAX_STEPS"
|
|
"--start" "$START_INDEX"
|
|
"--end" "$END_INDEX"
|
|
|
|
"--eval-group" "$EVAL_GROUP"
|
|
"--memory-interval" "$MEMORY_INTERVAL"
|
|
"--max-actions-per-step" "$MAX_ACTIONS_PER_STEP"
|
|
"--planner-interval" "$PLANNER_INTERVAL"
|
|
"--judge-repeat-count" "$JUDGE_REPEAT_COUNT"
|
|
"--test-case" "$TEST_CASE"
|
|
)
|
|
|
|
# Add boolean flags conditionally
|
|
[[ "${{ github.event.client_payload.script_args.no_vision }}" == "true" ]] && CMD_ARGS+=("--no-vision")
|
|
[[ "$HEADLESS" == "true" ]] && CMD_ARGS+=("--headless")
|
|
[[ "${{ github.event.client_payload.script_args.use_serp }}" == "true" ]] && CMD_ARGS+=("--use-serp")
|
|
[[ "${{ github.event.client_payload.script_args.enable_memory }}" == "true" ]] && CMD_ARGS+=("--enable-memory")
|
|
[[ "${{ github.event.client_payload.script_args.validate_output }}" == "true" ]] && CMD_ARGS+=("--validate-output")
|
|
[[ "${{ github.event.client_payload.script_args.include_result }}" == "true" ]] && CMD_ARGS+=("--include-result")
|
|
[[ "${{ github.event.client_payload.script_args.highlight_elements }}" == "false" ]] && CMD_ARGS+=("--no-highlight-elements")
|
|
[[ "${{ github.event.client_payload.script_args.use_mind2web_judge }}" == "true" ]] && CMD_ARGS+=("--use-mind2web-judge")
|
|
[[ "$THINKING" == "false" ]] && CMD_ARGS+=("--no-thinking")
|
|
[[ "$STEALTH" == "true" ]] && CMD_ARGS+=("--stealth")
|
|
[[ "$BLOCK_IMAGES" == "true" ]] && CMD_ARGS+=("--block-images")
|
|
[[ "$BLOCK_CSS" == "true" ]] && CMD_ARGS+=("--block-css")
|
|
|
|
# Add optional string parameters
|
|
[[ -n "$USER_MESSAGE" ]] && CMD_ARGS+=("--user-message" "$USER_MESSAGE")
|
|
[[ -n "$DEVELOPER_ID" ]] && CMD_ARGS+=("--developer-id" "$DEVELOPER_ID")
|
|
[[ -n "$PLANNER_MODEL" ]] && CMD_ARGS+=("--planner-model" "$PLANNER_MODEL")
|
|
[[ -n "$BROWSER" ]] && CMD_ARGS+=("--browser" "$BROWSER")
|
|
[[ -n "$IMAGES_PER_STEP" ]] && CMD_ARGS+=("--images-per-step" "$IMAGES_PER_STEP")
|
|
[[ -n "$RUN_ID" ]] && CMD_ARGS+=("--run-id" "$RUN_ID")
|
|
[[ -n "$LAMINAR_EVAL_ID" ]] && CMD_ARGS+=("--laminar-eval-id" "$LAMINAR_EVAL_ID")
|
|
[[ -n "$BROWSER_LOGGING_LEVEL" ]] && CMD_ARGS+=("--browser-logging-level" "$BROWSER_LOGGING_LEVEL")
|
|
|
|
# Add browser timeout parameters
|
|
[[ -n "$DEFAULT_NAVIGATION_TIMEOUT" ]] && CMD_ARGS+=("--default-navigation-timeout" "$DEFAULT_NAVIGATION_TIMEOUT")
|
|
[[ -n "$DEFAULT_TIMEOUT" ]] && CMD_ARGS+=("--default-timeout" "$DEFAULT_TIMEOUT")
|
|
[[ -n "$MINIMUM_WAIT_PAGE_LOAD_TIME" ]] && CMD_ARGS+=("--minimum-wait-page-load-time" "$MINIMUM_WAIT_PAGE_LOAD_TIME")
|
|
[[ -n "$WAIT_FOR_NETWORK_IDLE_PAGE_LOAD_TIME" ]] && CMD_ARGS+=("--wait-for-network-idle-page-load-time" "$WAIT_FOR_NETWORK_IDLE_PAGE_LOAD_TIME")
|
|
[[ -n "$MAXIMUM_WAIT_PAGE_LOAD_TIME" ]] && CMD_ARGS+=("--maximum-wait-page-load-time" "$MAXIMUM_WAIT_PAGE_LOAD_TIME")
|
|
[[ -n "$WAIT_BETWEEN_ACTIONS" ]] && CMD_ARGS+=("--wait-between-actions" "$WAIT_BETWEEN_ACTIONS")
|
|
|
|
if [[ -n "$GMAIL_2FA_TOKENS_RAW" && "$GMAIL_2FA_TOKENS_RAW" != "null" ]]; then
|
|
echo "DEBUG: ✅ GMAIL_2FA_TOKENS_RAW conditional passed - adding tokens to command"
|
|
CMD_ARGS+=("--gmail-2fa-tokens" "$GMAIL_2FA_TOKENS_RAW")
|
|
else
|
|
echo "DEBUG: ❌ GMAIL_2FA_TOKENS_RAW conditional failed - tokens are empty, null, or not set, skipping"
|
|
fi
|
|
|
|
# Add single task mode parameters
|
|
[[ -n "$TASK_ID" ]] && CMD_ARGS+=("--task-id" "$TASK_ID")
|
|
[[ -n "$TASK_TEXT" ]] && CMD_ARGS+=("--task-text" "$TASK_TEXT")
|
|
[[ -n "$TASK_WEBSITE" ]] && CMD_ARGS+=("--task-website" "$TASK_WEBSITE")
|
|
|
|
# Add GitHub workflow URL
|
|
[[ -n "${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}" ]] && CMD_ARGS+=("--github-workflow-url" "${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}")
|
|
|
|
# Handle xvfb wrapper if needed - construct differently to preserve arguments
|
|
if [[ "$HEADLESS" == "false" ]]; then
|
|
# For xvfb, we need to construct the command differently to preserve argument structure
|
|
XVFB_ARGS=("xvfb-run" "--auto-servernum" "--server-args=-screen 0 1920x1080x24")
|
|
FULL_CMD_ARGS=("${XVFB_ARGS[@]}" "${CMD_ARGS[@]}")
|
|
printf -v CMD_STRING '%q ' "${FULL_CMD_ARGS[@]}"
|
|
else
|
|
# For headless mode, just escape the python command args
|
|
printf -v CMD_STRING '%q ' "${CMD_ARGS[@]}"
|
|
fi
|
|
|
|
echo "FULL_COMMAND=$CMD_STRING" >> $GITHUB_OUTPUT
|
|
echo "::notice title=Eval Command::$CMD_STRING"
|
|
|
|
|
|
- name: Start Resource Monitoring
|
|
run: |
|
|
echo "Starting background resource monitoring..."
|
|
# Create a background script that monitors resources every 30 seconds
|
|
cat > monitor_resources.sh << 'EOF'
|
|
#!/bin/bash
|
|
while true; do
|
|
echo "=== RESOURCE MONITOR $(date) ==="
|
|
echo "Memory:"
|
|
free -h
|
|
echo "CPU Load:"
|
|
uptime
|
|
echo "Top processes by CPU:"
|
|
ps aux --sort=-%cpu | head -10
|
|
echo "Top processes by Memory:"
|
|
ps aux --sort=-%mem | head -10
|
|
echo "Chrome/Chromium processes:"
|
|
ps aux | grep -i chrome | grep -v grep || echo "No Chrome processes found"
|
|
echo "Python processes:"
|
|
ps aux | grep python | grep -v grep || echo "No Python processes found"
|
|
echo "=================================="
|
|
sleep 30
|
|
done
|
|
EOF
|
|
chmod +x monitor_resources.sh
|
|
# Start the monitor in background and save PID
|
|
nohup ./monitor_resources.sh > resource_monitor.log 2>&1 &
|
|
echo $! > monitor_pid.txt
|
|
echo "Resource monitoring started with PID: $(cat monitor_pid.txt)"
|
|
|
|
- name: Run evaluation
|
|
id: run_eval
|
|
run: |
|
|
echo "=== STARTING EVALUATION ==="
|
|
echo "🚀 Starting evaluation with command: ${{ steps.eval_command.outputs.FULL_COMMAND }}"
|
|
echo "📊 This may take several minutes depending on the number of tasks..."
|
|
echo "Starting time: $(date)"
|
|
echo "Use Mind2Web Judge: ${{ github.event.client_payload.script_args.use_mind2web_judge }}"
|
|
|
|
|
|
# Set up signal handlers for graceful interruption handling
|
|
set -e
|
|
trap 'echo "=== EVALUATION INTERRUPTED ==="; echo "Time: $(date)"; echo "Last 50 lines of output:"; tail -50 eval_output.log 2>/dev/null || echo "No output log available"; exit 130' INT TERM
|
|
|
|
# Run the evaluation with output capture and comprehensive error handling
|
|
set +e # Don't exit on errors, we want to capture them for debugging
|
|
${{ steps.eval_command.outputs.FULL_COMMAND }} 2>&1 | tee eval_output.log
|
|
EVAL_EXIT_CODE=${PIPESTATUS[0]}
|
|
set -e # Re-enable exit on error
|
|
|
|
echo "=== EVALUATION COMPLETED ==="
|
|
echo "✅ Evaluation finished"
|
|
echo "Exit code: $EVAL_EXIT_CODE"
|
|
echo "Completion time: $(date)"
|
|
echo "============================"
|
|
|
|
# Show last part of log for immediate context in GitHub Actions
|
|
if [ $EVAL_EXIT_CODE -ne 0 ]; then
|
|
echo "=== EVALUATION FAILED - LAST 100 LINES OF OUTPUT ==="
|
|
tail -100 eval_output.log 2>/dev/null || echo "No output log available for debugging"
|
|
echo "===================================================="
|
|
echo "❌ Evaluation failed with exit code $EVAL_EXIT_CODE"
|
|
echo "📋 Full debug information will be available in artifacts"
|
|
else
|
|
echo "✅ Evaluation completed successfully"
|
|
echo "📊 Last 20 lines of output for verification:"
|
|
tail -20 eval_output.log 2>/dev/null || echo "No output log available"
|
|
fi
|
|
|
|
# Ensure eval_output.log exists for artifact collection
|
|
if [ ! -f eval_output.log ]; then
|
|
echo "⚠️ Warning: eval_output.log was not created" > eval_output.log
|
|
echo "This may indicate the evaluation command failed to start properly" >> eval_output.log
|
|
echo "Check the GitHub Actions logs above for more information" >> eval_output.log
|
|
fi
|
|
|
|
# Exit with the original evaluation exit code to properly propagate failures
|
|
exit $EVAL_EXIT_CODE
|
|
env:
|
|
EVALUATION_TOOL_URL: ${{ secrets.EVALUATION_TOOL_URL }}
|
|
EVALUATION_TOOL_SECRET_KEY: ${{ secrets.EVALUATION_TOOL_SECRET_KEY }}
|
|
EVAL_START_INDEX: ${{ github.event.client_payload.script_args.start_index }}
|
|
|
|
- name: Post-execution Resource Check
|
|
if: always()
|
|
run: |
|
|
echo "=== POST-EXECUTION RESOURCE CHECK ==="
|
|
echo "Memory usage:"
|
|
free -h
|
|
echo "CPU load:"
|
|
uptime
|
|
echo "Disk usage:"
|
|
df -h
|
|
echo "Process count:"
|
|
ps aux | wc -l
|
|
echo "Chrome/Chromium processes still running:"
|
|
ps aux | grep -i chrome | grep -v grep || echo "No Chrome processes found"
|
|
echo "Python processes still running:"
|
|
ps aux | grep python | grep -v grep || echo "No Python processes found"
|
|
echo "==================================="
|
|
|
|
- name: Stop Resource Monitoring and Collect Logs
|
|
if: always()
|
|
run: |
|
|
echo "Stopping resource monitoring..."
|
|
if [ -f monitor_pid.txt ]; then
|
|
MONITOR_PID=$(cat monitor_pid.txt)
|
|
if kill -0 $MONITOR_PID 2>/dev/null; then
|
|
kill $MONITOR_PID
|
|
echo "Resource monitor stopped"
|
|
else
|
|
echo "Resource monitor was already stopped"
|
|
fi
|
|
fi
|
|
|
|
echo "=== RESOURCE MONITORING LOG ==="
|
|
if [ -f resource_monitor.log ]; then
|
|
tail -100 resource_monitor.log
|
|
else
|
|
echo "No resource monitor log found"
|
|
fi
|
|
echo "==============================="
|
|
|
|
- name: Collect Debug Information
|
|
if: always()
|
|
run: |
|
|
echo "=== COLLECTING DEBUG INFORMATION ==="
|
|
|
|
# System information
|
|
echo "Final system state:"
|
|
uptime
|
|
free -h
|
|
df -h
|
|
|
|
# Process information
|
|
echo "All running processes:"
|
|
ps aux --sort=-%cpu | head -20
|
|
|
|
# Check for core dumps
|
|
echo "Checking for core dumps:"
|
|
find . -name "core*" -type f 2>/dev/null || echo "No core dumps found"
|
|
|
|
# Check for any crash logs
|
|
echo "Checking for crash logs:"
|
|
find . -name "*crash*" -type f 2>/dev/null || echo "No crash logs found"
|
|
|
|
# Check kernel messages for OOM kills
|
|
echo "Checking for OOM kills in kernel messages:"
|
|
sudo dmesg | grep -i "killed process" | tail -10 || echo "No OOM kills found"
|
|
|
|
# Check evaluation output (now properly captured)
|
|
echo "Last 100 lines of evaluation output:"
|
|
if [ -f eval_output.log ]; then
|
|
echo "📄 eval_output.log found ($(wc -l < eval_output.log) lines total)"
|
|
tail -100 eval_output.log
|
|
else
|
|
echo "⚠️ No evaluation output log found - this indicates a critical failure"
|
|
echo "The evaluation may have failed to start or crashed immediately"
|
|
fi
|
|
|
|
# Show evaluation log statistics
|
|
if [ -f eval_output.log ]; then
|
|
echo "=== EVALUATION LOG STATISTICS ==="
|
|
echo "Total lines: $(wc -l < eval_output.log)"
|
|
echo "Log size: $(ls -lh eval_output.log | awk '{print $5}')"
|
|
echo "Error indicators:"
|
|
grep -c -i "error\|exception\|failed\|traceback" eval_output.log 2>/dev/null || echo "0 error patterns found"
|
|
echo "Task completion indicators:"
|
|
grep -c -i "task.*completed\|evaluation.*complete" eval_output.log 2>/dev/null || echo "0 completion patterns found"
|
|
echo "================================"
|
|
fi
|
|
|
|
# Check for saved trajectories
|
|
echo "Saved trajectories directory:"
|
|
if [ -d saved_trajectories ]; then
|
|
find saved_trajectories -type f -name "*.json" | head -10
|
|
echo "Total trajectory files: $(find saved_trajectories -type f -name "*.json" | wc -l)"
|
|
else
|
|
echo "No saved_trajectories directory found"
|
|
fi
|
|
|
|
echo "===================================="
|
|
|
|
- name: Upload Debug Artifacts
|
|
if: always()
|
|
uses: actions/upload-artifact@v4
|
|
with:
|
|
name: debug-logs-${{ github.run_id }}
|
|
path: |
|
|
eval_output.log
|
|
resource_monitor.log
|
|
saved_trajectories/
|
|
retention-days: 7
|
|
|
|
- name: Final Status Summary
|
|
if: always()
|
|
run: |
|
|
echo "=== FINAL STATUS SUMMARY ==="
|
|
echo "Workflow run ID: ${{ github.run_id }}"
|
|
echo "Job completion time: $(date)"
|
|
echo "Evaluation step status: ${{ steps.run_eval.outcome }}"
|
|
|
|
if [ "${{ steps.run_eval.outcome }}" != "success" ]; then
|
|
echo "❌ Evaluation failed or was interrupted"
|
|
echo "Check the debug artifacts and logs above for more information"
|
|
else
|
|
echo "✅ Evaluation completed successfully"
|
|
fi
|
|
|
|
echo "==========================="
|
|
|
|
- name: Update Runner Completion Status
|
|
if: always()
|
|
run: |
|
|
echo "Updating runner completion status..."
|
|
|
|
# Extract parameters for runner identification with validation
|
|
RUN_ID="${{ github.event.client_payload.script_args.run_id }}"
|
|
GITHUB_RUN_ID="${{ github.run_id }}"
|
|
START_INDEX="${{ github.event.client_payload.script_args.start_index }}"
|
|
END_INDEX="${{ github.event.client_payload.script_args.end_index }}"
|
|
TOTAL_TASKS="${{ github.event.client_payload.script_args.total_tasks }}"
|
|
GITHUB_WORKFLOW_URL="${{ steps.github_url.outputs.GITHUB_WORKFLOW_URL }}"
|
|
|
|
# Validate and set defaults for numeric variables to prevent arithmetic errors
|
|
START_INDEX="${START_INDEX:-0}"
|
|
TOTAL_TASKS="${TOTAL_TASKS:-100}"
|
|
|
|
# Validate START_INDEX is numeric
|
|
if ! [[ "$START_INDEX" =~ ^[0-9]+$ ]]; then
|
|
echo "⚠️ Warning: START_INDEX '$START_INDEX' is not numeric, defaulting to 0"
|
|
START_INDEX=0
|
|
fi
|
|
|
|
# Validate TOTAL_TASKS is numeric
|
|
if ! [[ "$TOTAL_TASKS" =~ ^[0-9]+$ ]]; then
|
|
echo "⚠️ Warning: TOTAL_TASKS '$TOTAL_TASKS' is not numeric, defaulting to 100"
|
|
TOTAL_TASKS=100
|
|
fi
|
|
|
|
# Calculate task range for this runner with safe arithmetic
|
|
if [ -n "$END_INDEX" ] && [ "$END_INDEX" != "null" ] && [[ "$END_INDEX" =~ ^[0-9]+$ ]]; then
|
|
TASK_RANGE="${START_INDEX}-${END_INDEX}"
|
|
TOTAL_ASSIGNED_TASKS=$((END_INDEX - START_INDEX))
|
|
else
|
|
TASK_RANGE="${START_INDEX}+"
|
|
TOTAL_ASSIGNED_TASKS=$((TOTAL_TASKS - START_INDEX))
|
|
fi
|
|
|
|
# Ensure TOTAL_ASSIGNED_TASKS is positive
|
|
if [ "$TOTAL_ASSIGNED_TASKS" -le 0 ]; then
|
|
echo "⚠️ Warning: Calculated TOTAL_ASSIGNED_TASKS is $TOTAL_ASSIGNED_TASKS, setting to 1"
|
|
TOTAL_ASSIGNED_TASKS=1
|
|
fi
|
|
|
|
# Generate consistent runner ID (matches Python service pattern)
|
|
RUNNER_ID="github_run_${GITHUB_RUN_ID}_batch_${START_INDEX}"
|
|
|
|
echo "Runner ID: $RUNNER_ID"
|
|
echo "Task Range: $TASK_RANGE"
|
|
echo "Total Assigned Tasks: $TOTAL_ASSIGNED_TASKS"
|
|
echo "Start Index: $START_INDEX (validated)"
|
|
echo "End Index: $END_INDEX"
|
|
echo "Total Tasks: $TOTAL_TASKS (validated)"
|
|
|
|
# Determine final status based on evaluation outcome
|
|
FINAL_STATUS="completed"
|
|
FINAL_STAGE="completed"
|
|
ERROR_MESSAGE=""
|
|
|
|
if [ "${{ steps.run_eval.outcome }}" != "success" ]; then
|
|
FINAL_STATUS="failed"
|
|
FINAL_STAGE="failed"
|
|
ERROR_MESSAGE="Evaluation step failed or was interrupted"
|
|
fi
|
|
|
|
# Send update to progress tracking API
|
|
curl -X POST "${{ secrets.EVALUATION_TOOL_URL }}/api/saveRunnerProgress" \
|
|
-H "Content-Type: application/json" \
|
|
-H "Authorization: Bearer ${{ secrets.EVALUATION_TOOL_SECRET_KEY }}" \
|
|
-d "{
|
|
\"runId\": \"${RUN_ID}\",
|
|
\"runnerId\": \"${RUNNER_ID}\",
|
|
\"githubWorkflowRunId\": \"${GITHUB_RUN_ID}\",
|
|
\"githubWorkflowUrl\": \"${GITHUB_WORKFLOW_URL}\",
|
|
\"taskId\": \"batch_${START_INDEX}\",
|
|
\"currentStage\": \"${FINAL_STAGE}\",
|
|
\"status\": \"${FINAL_STATUS}\",
|
|
\"errorMessage\": \"${ERROR_MESSAGE}\"
|
|
}" \
|
|
--max-time 10 \
|
|
--retry 2 || echo "⚠️ Runner completion update failed, but continuing"
|