# Files
# browser-use/.github/workflows/eval.yaml
# 2025-05-22 15:53:49 -07:00
#
# 134 lines
# 5.4 KiB
# YAML

name: Run Evaluation Script

# Started through the repository_dispatch API with event type "run-eval".
# Script options are read from client_payload.script_args; every option is
# optional and falls back to the defaults applied in "Construct eval command".
on:
  repository_dispatch:
    types: [run-eval]

jobs:
  run_evaluation:
    runs-on: ubuntu-latest
    timeout-minutes: 360
    # Provider and evaluation-tool credentials, exposed to every step.
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
      DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
      EVALUATION_TOOL_URL: ${{ secrets.EVALUATION_TOOL_URL }}
      EVALUATION_TOOL_SECRET_KEY: ${{ secrets.EVALUATION_TOOL_SECRET_KEY }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: "new-eval"

      - name: Set up Python and uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
          activate-environment: true

      - name: Install dependencies
        run: uv sync

      # Publishes the installed playwright version as step output VERSION so
      # the browser cache below is keyed on it.
      - name: Detect installed Playwright version
        id: playwright_version
        run: |
          version="$(uv pip list --format json | jq -r '.[] | select(.name == "playwright") | .version')"
          echo "VERSION=$version" >> "$GITHUB_OUTPUT"

      - name: Cache Playwright browsers
        uses: actions/cache@v4
        with:
          path: ~/.cache/ms-playwright
          key: ${{ runner.os }}-playwright-${{ steps.playwright_version.outputs.VERSION }}
          restore-keys: |
            ${{ runner.os }}-playwright-

      - name: Install Playwright browser dependencies
        run: playwright install --no-shell chromium

      # format() stringifies the value first, so the condition matches both the
      # string "false" and a JSON boolean false in the payload. This keeps it in
      # step with the shell check below, which only ever sees the rendered text.
      - name: Install Xvfb for headed mode
        if: ${{ format('{0}', github.event.client_payload.script_args.headless) == 'false' }}
        run: |
          sudo apt-get update
          sudo apt-get install -y xvfb

      - name: Construct eval command
        id: eval_command
        # Payload values are handed over as environment variables instead of
        # being interpolated into the script text, so quotes or shell
        # metacharacters inside a value cannot alter the script itself.
        env:
          MODEL_PAYLOAD: ${{ github.event.client_payload.script_args.model }}
          EVAL_MODEL_PAYLOAD: ${{ github.event.client_payload.script_args.eval_model }}
          PARALLEL_RUNS_PAYLOAD: ${{ github.event.client_payload.script_args.parallel_runs }}
          MAX_STEPS_PAYLOAD: ${{ github.event.client_payload.script_args.max_steps }}
          START_INDEX_PAYLOAD: ${{ github.event.client_payload.script_args.start_index }}
          END_INDEX_PAYLOAD: ${{ github.event.client_payload.script_args.end_index }}
          NO_VISION_PAYLOAD: ${{ github.event.client_payload.script_args.no_vision }}
          HEADLESS_PAYLOAD: ${{ github.event.client_payload.script_args.headless }}
          FRESH_START_PAYLOAD: ${{ github.event.client_payload.script_args.fresh_start }}
          EVAL_GROUP_PAYLOAD: ${{ github.event.client_payload.script_args.eval_group }}
          USER_MESSAGE_PAYLOAD: ${{ github.event.client_payload.script_args.user_message }}
          DEVELOPER_ID_PAYLOAD: ${{ github.event.client_payload.script_args.developer_id }}
        run: |
          # Apply defaults for options that were absent (empty) in the payload.
          MODEL="${MODEL_PAYLOAD:-llama-4-maverick}"
          EVAL_MODEL="${EVAL_MODEL_PAYLOAD:-gpt-4o}"
          PARALLEL_RUNS="${PARALLEL_RUNS_PAYLOAD:-2}"
          MAX_STEPS="${MAX_STEPS_PAYLOAD:-25}"
          START_INDEX="${START_INDEX_PAYLOAD:-0}"
          END_INDEX="${END_INDEX_PAYLOAD:-100}"
          EVAL_GROUP="${EVAL_GROUP_PAYLOAD:-PRTests}"
          FRESH_START_VALUE="${FRESH_START_PAYLOAD:-true}"
          HEADLESS_EFFECTIVE_VALUE="${HEADLESS_PAYLOAD:-true}"

          # Build the command as an argument vector; it is serialized with
          # shell escaping at the end, so values may contain spaces or quotes.
          ARGS=(
            python eval/service.py
            --model "$MODEL"
            --eval-model "$EVAL_MODEL"
            --parallel-runs "$PARALLEL_RUNS"
            --max-steps "$MAX_STEPS"
            --start "$START_INDEX"
            --end "$END_INDEX"
          )

          # Boolean switches are only appended when enabled.
          if [[ "$NO_VISION_PAYLOAD" == "true" ]]; then
            ARGS+=(--no-vision)
          fi
          # --headless is passed as a bare switch; it is on unless the payload
          # explicitly set headless to something other than "true".
          if [[ "$HEADLESS_EFFECTIVE_VALUE" == "true" ]]; then
            ARGS+=(--headless)
          fi

          ARGS+=(--fresh-start "$FRESH_START_VALUE" --eval-group "$EVAL_GROUP")

          # Only add user-message if it was provided in the payload.
          if [[ -n "$USER_MESSAGE_PAYLOAD" ]]; then
            ARGS+=(--user-message "$USER_MESSAGE_PAYLOAD")
          fi
          # Only add developer-id if it was provided in the payload.
          if [[ -n "$DEVELOPER_ID_PAYLOAD" ]]; then
            ARGS+=(--developer-id "$DEVELOPER_ID_PAYLOAD")
          fi

          # Headed mode needs a display: wrap the command in xvfb-run.
          if [[ "$HEADLESS_EFFECTIVE_VALUE" == "false" ]]; then
            ARGS=(xvfb-run --auto-servernum "--server-args=-screen 0 1280x1024x24" "${ARGS[@]}")
          fi

          # %q escapes each word so the shell in the next step re-reads it as
          # exactly one argument; the trailing separator space is trimmed.
          printf -v CMD '%q ' "${ARGS[@]}"
          CMD="${CMD% }"

          echo "FULL_COMMAND=$CMD" >> "$GITHUB_OUTPUT"
          echo "::notice title=Eval Command::$CMD"

      # FULL_COMMAND is substituted into the script text as-is; this is safe
      # because every word in it was shell-escaped by the previous step.
      - name: Run evaluation script
        run: ${{ steps.eval_command.outputs.FULL_COMMAND }}