mirror of
https://github.com/browser-use/browser-use
synced 2026-05-13 17:56:35 +02:00
134 lines
5.4 KiB
YAML
134 lines
5.4 KiB
YAML
# Display name of this workflow.
name: Run Evaluation Script

# The workflow is started only by a `repository_dispatch` event whose
# event type is `run-eval`. The job below reads its parameters from
# `github.event.client_payload.script_args`.
on:
  repository_dispatch:
    types:
      - run-eval

jobs:
  # Single job: checks out the "new-eval" ref, installs dependencies and a
  # Chromium build for Playwright, turns the dispatch payload into an
  # `eval/service.py` command line, and runs it.
  run_evaluation:
    runs-on: ubuntu-latest
    # 360 minutes = 6 hours.
    timeout-minutes: 360
    # Repository secrets exposed to every step as environment variables.
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
      DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
      EVALUATION_TOOL_URL: ${{ secrets.EVALUATION_TOOL_URL }}
      EVALUATION_TOOL_SECRET_KEY: ${{ secrets.EVALUATION_TOOL_SECRET_KEY }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: "new-eval"

      - name: Set up Python and uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
          activate-environment: true

      - name: Install dependencies
        run: uv sync

      # Publishes the installed Playwright version as step output `VERSION`
      # so the browser cache below can be keyed on it.
      - name: Detect installed Playwright version
        id: playwright_version
        run: |
          echo "VERSION=$(uv pip list --format json | jq -r '.[] | select(.name == "playwright") | .version')" >> "$GITHUB_OUTPUT"

      # Cache key is OS + Playwright version; `restore-keys` falls back to
      # any cache sharing the OS/playwright prefix.
      - name: Cache Playwright browsers
        uses: actions/cache@v4
        with:
          path: ~/.cache/ms-playwright
          key: ${{ runner.os }}-playwright-${{ steps.playwright_version.outputs.VERSION }}
          restore-keys: |
            ${{ runner.os }}-playwright-

      - name: Install Playwright browser dependencies
        run: playwright install --no-shell chromium

      # Only needed when the payload asks for headed mode. `format` turns the
      # payload value into a string first, so a JSON boolean `false` and the
      # string "false" are treated alike (the command-construction step below
      # also works on the string form).
      - name: Install Xvfb for headed mode
        if: format('{0}', github.event.client_payload.script_args.headless) == 'false'
        run: |
          sudo apt-get update
          sudo apt-get install -y xvfb

      # Builds the eval command line and publishes it as step output
      # `FULL_COMMAND`.
      #
      # Payload values are handed to the script through `env:` instead of
      # being expanded with `${{ }}` inside the script text, so the payload
      # is always treated as data and never parsed as shell source.
      - name: Construct eval command
        id: eval_command
        env:
          MODEL_PAYLOAD: ${{ github.event.client_payload.script_args.model }}
          EVAL_MODEL_PAYLOAD: ${{ github.event.client_payload.script_args.eval_model }}
          PARALLEL_RUNS_PAYLOAD: ${{ github.event.client_payload.script_args.parallel_runs }}
          MAX_STEPS_PAYLOAD: ${{ github.event.client_payload.script_args.max_steps }}
          START_INDEX_PAYLOAD: ${{ github.event.client_payload.script_args.start_index }}
          END_INDEX_PAYLOAD: ${{ github.event.client_payload.script_args.end_index }}
          NO_VISION_PAYLOAD: ${{ github.event.client_payload.script_args.no_vision }}
          HEADLESS_PAYLOAD: ${{ github.event.client_payload.script_args.headless }}
          FRESH_START_PAYLOAD: ${{ github.event.client_payload.script_args.fresh_start }}
          EVAL_GROUP_PAYLOAD: ${{ github.event.client_payload.script_args.eval_group }}
          USER_MESSAGE_PAYLOAD: ${{ github.event.client_payload.script_args.user_message }}
          DEVELOPER_ID_PAYLOAD: ${{ github.event.client_payload.script_args.developer_id }}
        run: |
          # Apply defaults for values missing from (or empty in) the payload.
          MODEL="${MODEL_PAYLOAD:-llama-4-maverick}"
          EVAL_MODEL="${EVAL_MODEL_PAYLOAD:-gpt-4o}"
          PARALLEL_RUNS="${PARALLEL_RUNS_PAYLOAD:-2}"
          MAX_STEPS="${MAX_STEPS_PAYLOAD:-25}"
          START_INDEX="${START_INDEX_PAYLOAD:-0}"
          END_INDEX="${END_INDEX_PAYLOAD:-100}"
          EVAL_GROUP="${EVAL_GROUP_PAYLOAD:-PRTests}"
          FRESH_START_VALUE="${FRESH_START_PAYLOAD:-true}"
          HEADLESS_EFFECTIVE_VALUE="${HEADLESS_PAYLOAD:-true}"

          # Build the command as an argv array: each value stays exactly one
          # argument regardless of spaces or quotes it may contain.
          cmd=(
            python eval/service.py
            --model "$MODEL"
            --eval-model "$EVAL_MODEL"
            --parallel-runs "$PARALLEL_RUNS"
            --max-steps "$MAX_STEPS"
            --start "$START_INDEX"
            --end "$END_INDEX"
          )

          # Boolean flags are added only when their value is "true".
          if [[ "$NO_VISION_PAYLOAD" == "true" ]]; then
            cmd+=(--no-vision)
          fi

          # --headless is action='store_true'; it defaults to on when the
          # payload does not specify it.
          if [[ "$HEADLESS_EFFECTIVE_VALUE" == "true" ]]; then
            cmd+=(--headless)
          fi

          cmd+=(--fresh-start "$FRESH_START_VALUE" --eval-group "$EVAL_GROUP")

          # Optional arguments: only forwarded when present in the payload.
          if [[ -n "$USER_MESSAGE_PAYLOAD" ]]; then
            cmd+=(--user-message "$USER_MESSAGE_PAYLOAD")
          fi

          if [[ -n "$DEVELOPER_ID_PAYLOAD" ]]; then
            cmd+=(--developer-id "$DEVELOPER_ID_PAYLOAD")
          fi

          # Headed mode: wrap the command in xvfb-run.
          if [[ "$HEADLESS_EFFECTIVE_VALUE" == "false" ]]; then
            cmd=(xvfb-run --auto-servernum "--server-args=-screen 0 1280x1024x24" "${cmd[@]}")
          fi

          # Serialise the array as a single line of shell-quoted words (%q),
          # which the next step can re-parse without any word being
          # interpreted as shell syntax.
          printf -v FULL_COMMAND '%q ' "${cmd[@]}"
          FULL_COMMAND="${FULL_COMMAND% }"

          printf 'FULL_COMMAND=%s\n' "$FULL_COMMAND" >> "$GITHUB_OUTPUT"
          echo "::notice title=Eval Command::$FULL_COMMAND"

      # FULL_COMMAND arrives via the environment (not spliced into the script
      # text). It was produced with printf %q above, so `eval` re-parses it
      # into exactly the original argument vector.
      - name: Run evaluation script
        env:
          FULL_COMMAND: ${{ steps.eval_command.outputs.FULL_COMMAND }}
        run: eval "$FULL_COMMAND"