diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml index 43e1964b0..24873b780 100644 --- a/.github/workflows/eval.yaml +++ b/.github/workflows/eval.yaml @@ -23,6 +23,7 @@ jobs: EVALUATION_TOOL_URL: ${{ secrets.EVALUATION_TOOL_URL }} EVALUATION_TOOL_SECRET_KEY: ${{ secrets.EVALUATION_TOOL_SECRET_KEY }} ANCHOR_BROWSER_API_KEY: ${{ secrets.ANCHOR_BROWSER_API_KEY }} + BRIGHTDATA_CDP_URL: ${{ secrets.BRIGHTDATA_CDP_URL }} SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }} LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_PROJECT_API_KEY }} BROWSER_USE_LOGGING_LEVEL: ${{ secrets.BROWSER_USE_LOGGING_LEVEL }} @@ -251,6 +252,7 @@ jobs: USER_MESSAGE="${{ github.event.client_payload.script_args.user_message }}" DEVELOPER_ID="${{ github.event.client_payload.script_args.developer_id }}" PLANNER_MODEL="${{ github.event.client_payload.script_args.planner_model }}" + BROWSER="${{ github.event.client_payload.script_args.browser }}" RUN_ID="${{ github.event.client_payload.script_args.run_id }}" LAMINAR_EVAL_ID="${{ github.event.client_payload.script_args.laminar_eval_id }}" # Pass raw GitHub Actions object to Python - no parsing in bash @@ -282,7 +284,6 @@ jobs: [[ "${{ github.event.client_payload.script_args.no_vision }}" == "true" ]] && CMD_ARGS+=("--no-vision") [[ "$HEADLESS" == "true" ]] && CMD_ARGS+=("--headless") [[ "${{ github.event.client_payload.script_args.use_serp }}" == "true" ]] && CMD_ARGS+=("--use-serp") - [[ "${{ github.event.client_payload.script_args.use_anchor }}" == "true" ]] && CMD_ARGS+=("--use-anchor") [[ "${{ github.event.client_payload.script_args.enable_memory }}" == "true" ]] && CMD_ARGS+=("--enable-memory") [[ "${{ github.event.client_payload.script_args.validate_output }}" == "true" ]] && CMD_ARGS+=("--validate-output") [[ "${{ github.event.client_payload.script_args.include_result }}" == "true" ]] && CMD_ARGS+=("--include-result") @@ -294,6 +295,7 @@ jobs: [[ -n "$USER_MESSAGE" ]] && CMD_ARGS+=("--user-message" "$USER_MESSAGE") [[ -n "$DEVELOPER_ID" ]] && CMD_ARGS+=("--developer-id" "$DEVELOPER_ID") [[ -n "$PLANNER_MODEL" ]] && CMD_ARGS+=("--planner-model" "$PLANNER_MODEL") + [[ -n "$BROWSER" ]] && CMD_ARGS+=("--browser" "$BROWSER") [[ -n "$RUN_ID" ]] && CMD_ARGS+=("--run-id" "$RUN_ID") [[ -n "$LAMINAR_EVAL_ID" ]] && CMD_ARGS+=("--laminar-eval-id" "$LAMINAR_EVAL_ID") diff --git a/eval/service.py b/eval/service.py index 49faba030..ba8e0d410 100644 --- a/eval/service.py +++ b/eval/service.py @@ -44,6 +44,7 @@ import base64 import io import json import logging +import os import re import signal import sys @@ -54,6 +55,8 @@ from uuid import UUID import anyio import psutil +import requests +from dotenv import load_dotenv from lmnr import AsyncLaminarClient, Laminar, observe from PIL import Image from pydantic import BaseModel @@ -71,6 +74,60 @@ MAX_IMAGE = 5 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s: %(message)s') logger = logging.getLogger(__name__) +# Load dotenv +load_dotenv() + +# Check for Anchor Browser API key +ANCHOR_BROWSER_API_KEY = os.getenv('ANCHOR_BROWSER_API_KEY') +if ANCHOR_BROWSER_API_KEY: + logger.info('ANCHOR_BROWSER_API_KEY is set. Tasks can use Anchor Browser.') +else: + logger.warning('ANCHOR_BROWSER_API_KEY is not set. Anchor Browser will not be available.') + +# Check for Brightdata CDP URL +BRIGHTDATA_CDP_URL = os.getenv('BRIGHTDATA_CDP_URL') +if BRIGHTDATA_CDP_URL: + logger.info('BRIGHTDATA_CDP_URL is set. Tasks can use Brightdata browser.') +else: + logger.warning('BRIGHTDATA_CDP_URL is not set. Brightdata browser will not be available.') + + +def create_anchor_browser_session(headless: bool = False) -> str: + """Create an Anchor Browser session and return CDP URL""" + browser_configuration = { + 'session': {'proxy': {'type': 'anchor_mobile', 'active': True, 'country_code': 'us'}}, + 'browser': { + 'adblock': {'active': True}, + 'captcha_solver': {'active': True}, + 'headless': {'active': headless}, + 'extra_stealth': {'active': True}, + }, + } + + try: + response = requests.post( + 'https://api.anchorbrowser.io/v1/sessions', + headers={ + 'anchor-api-key': ANCHOR_BROWSER_API_KEY, + 'Content-Type': 'application/json', + }, + json=browser_configuration, + ) + response.raise_for_status() + session_data = response.json()['data'] + session_id = session_data['id'] + + # Return only the CDP URL + return f'wss://connect.anchorbrowser.io?apiKey={ANCHOR_BROWSER_API_KEY}&sessionId={session_id}' + + except requests.RequestException as e: + logger.error(f'Failed to create Anchor Browser session: {type(e).__name__}: {e}') + raise + except KeyError as e: + logger.error(f'Unexpected response format from Anchor Browser API: {e}') + raise + + Laminar.initialize() laminar_client = AsyncLaminarClient() @@ -493,7 +550,6 @@ from dataclasses import dataclass, field from enum import Enum from typing import Any -import requests from dotenv import load_dotenv # Import the new comprehensive judge system (conditional import for backwards compatibility) @@ -1181,6 +1237,7 @@ class Task: self.login_type = kwargs.get('login_type', None) self.category = kwargs.get('category', None) self.output_schema = kwargs.get('output_schema', None) # Add structured output schema support + self.auth_keys = kwargs.get('auth_keys', None) # List of auth keys to fetch from auth distribution if self.output_schema: # Convert JSON schema to Pydantic model class self.output_model = create_pydantic_model_from_schema(self.output_schema, f'Task_{self.task_id}_Output') @@ -1197,6 +1254,7 @@ class Task: 'login_type', 'category', 'output_schema', + 'auth_keys', } self.additional_fields = {k: v for k, v in kwargs.items() if k not in known_fields} @@ -1206,7 +1264,7 @@ class Task: def __str__(self): # Include main fields and indicate if there are additional fields - base_str = f'Task(task_id={self.task_id}, confirmed_task={self.confirmed_task}, website={self.website}, reference_length={self.reference_length}, level={self.level}, cluster_id={self.cluster_id}, login_cookie={self.login_cookie}, login_type={self.login_type}, category={self.category}, output_schema={self.output_schema}' + base_str = f'Task(task_id={self.task_id}, confirmed_task={self.confirmed_task}, website={self.website}, reference_length={self.reference_length}, level={self.level}, cluster_id={self.cluster_id}, login_cookie={self.login_cookie}, login_type={self.login_type}, category={self.category}, output_schema={self.output_schema}, auth_keys={self.auth_keys}' if self.additional_fields: additional_str = ', '.join(f'{k}={v}' for k, v in self.additional_fields.items()) base_str += f', {additional_str}' @@ -1390,11 +1448,45 @@ async def run_stage(stage: Stage, stage_func, timeout: int | None = None): return await stage_func() -async def setup_browser_session(task: Task, headless: bool, highlight_elements: bool = True) -> BrowserSession: +async def setup_browser_session( + task: Task, headless: bool, highlight_elements: bool = True, browser: str = 'local' +) -> BrowserSession: """Setup browser session for the task""" - logger.debug(f'Browser setup: Initializing BrowserSession for task {task.task_id}') - # Use incognito mode (user_data_dir=None) for evaluations to avoid state pollution + # Validate browser option + valid_browsers = ['local', 'anchor-browser', 'brightdata', 'browser-use'] + if browser not in valid_browsers: + logger.warning(f'Browser setup: Invalid browser option "{browser}". Falling back to local browser.') + browser = 'local' + + cdp_url = None + + if browser == 'anchor-browser': + if ANCHOR_BROWSER_API_KEY: + try: + logger.debug(f'Browser setup: Creating Anchor Browser session for task {task.task_id}') + cdp_url = await asyncio.to_thread(create_anchor_browser_session, headless) + except Exception as e: + logger.error( + f'Browser setup: Failed to create Anchor Browser session for task {task.task_id}: {type(e).__name__}: {e}' + ) + logger.info(f'Browser setup: Falling back to local browser for task {task.task_id}') + cdp_url = None + else: + logger.warning( + f'Browser setup: Anchor Browser requested but ANCHOR_BROWSER_API_KEY not set. Using local browser for task {task.task_id}' + ) + elif browser == 'brightdata': + if BRIGHTDATA_CDP_URL: + logger.debug(f'Browser setup: Using Brightdata CDP URL for task {task.task_id}') + cdp_url = BRIGHTDATA_CDP_URL + else: + logger.warning( + f'Browser setup: Brightdata requested but BRIGHTDATA_CDP_URL not set. Using local browser for task {task.task_id}' + ) + elif browser == 'browser-use': + logger.warning(f'Browser setup: Browser-use not implemented yet. Falling back to local browser for task {task.task_id}') + profile_kwargs = { 'user_data_dir': None, # Incognito mode - no persistent state 'headless': headless, @@ -1433,10 +1525,16 @@ async def setup_browser_session(task: Task, headless: bool, highlight_elements: logger.debug(f'Login task {task.task_id}: Configured to save cookies to {storage_state_path}') profile = BrowserProfile(**profile_kwargs) - browser_session = BrowserSession(browser_profile=profile) + + if cdp_url: + logger.debug(f'Browser setup: Using CDP Browser for task {task.task_id}') + browser_session = BrowserSession(browser_profile=profile, cdp_url=cdp_url) + else: + # Use local browser + logger.debug(f'Browser setup: Initializing BrowserSession for task {task.task_id}') + browser_session = BrowserSession(browser_profile=profile) # Start browser session - logger.debug(f'Browser setup: Starting browser session for task {task.task_id}') await browser_session.start() logger.debug(f'Browser setup: Browser session started for task {task.task_id}') @@ -1570,11 +1668,11 @@ async def evaluate_task_with_login_cookie(login_cookie: str, task_folder: Path) If not found in tracking, falls back to checking end-state cookies. Args: - login_cookie: String identifier that should appear in cookies if login was successful - task_folder: Path to the task result folder containing saved cookies + login_cookie: String identifier that should appear in cookies if login was successful + task_folder: Path to the task result folder containing saved cookies Returns: - Dictionary containing evaluation results similar to Online_Mind2Web_eval format + Dictionary containing evaluation results similar to Online_Mind2Web_eval format """ task_id = task_folder.name @@ -1767,8 +1865,10 @@ async def run_task_with_semaphore( headless: bool, use_vision: bool, semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument + auth_distribution: dict | None = None, # Pre-fetched auth distribution github_workflow_url: str | None = None, use_serp: bool = False, + browser: str = 'local', enable_memory: bool = False, memory_interval: int = 10, max_actions_per_step: int = 10, @@ -1864,7 +1964,9 @@ async def run_task_with_semaphore( ) browser_session = await run_stage( - Stage.SETUP_BROWSER, lambda: setup_browser_session(task, headless, highlight_elements), timeout=120 + Stage.SETUP_BROWSER, + lambda: setup_browser_session(task, headless, highlight_elements, browser), + timeout=120, ) task_result.stage_completed(Stage.SETUP_BROWSER) logger.info(f'Task {task.task_id}: Browser session started successfully.') @@ -1892,6 +1994,38 @@ async def run_task_with_semaphore( convex_url, secret_key, run_id, task.task_id, 'run_agent', 'active', github_workflow_url ) + # Handle auth information if task requires it + task_with_auth = task + if hasattr(task, 'auth_keys') and task.auth_keys: + # Validate auth_keys is a list + if isinstance(task.auth_keys, list) and len(task.auth_keys) > 0: + if auth_distribution: + logger.info( + f'Task {task.task_id}: Using pre-fetched auth distribution for auth_keys: {task.auth_keys}' + ) + auth_info_text = format_auth_info_for_agent(auth_distribution, task.auth_keys) + if auth_info_text: + # Create a modified task with auth info appended + class TaskWithAuth(Task): + def __init__(self, original_task: Task, auth_text: str): + # Copy all attributes from original task + for attr_name in dir(original_task): + if not attr_name.startswith('__'): + setattr(self, attr_name, getattr(original_task, attr_name)) + # Modify the confirmed_task to include auth info + self.confirmed_task = original_task.confirmed_task + auth_text + + task_with_auth = TaskWithAuth(task, auth_info_text) + logger.info(f'Task {task.task_id}: Auth info added to task description') + else: + logger.warning( + f'Task {task.task_id}: No matching auth info found for keys: {task.auth_keys}' + ) + else: + logger.warning(f'Task {task.task_id}: Auth keys specified but no auth distribution available') + else: + logger.warning(f'Task {task.task_id}: auth_keys is not a valid list: {task.auth_keys}') + # Start timing for agent execution only agent_start_time = time.time() @@ -1899,7 +2033,7 @@ async def run_task_with_semaphore( Stage.RUN_AGENT, lambda: run_agent_with_browser( browser_session, - task, + task_with_auth, llm, max_steps_per_task, use_vision, @@ -2163,6 +2297,7 @@ async def run_multiple_tasks( convex_url: str, secret_key: str, eval_model: BaseChatModel, + auth_distribution: dict | None = None, github_workflow_url: str | None = None, max_parallel_runs: int = 3, max_steps_per_task: int = 25, @@ -2171,6 +2306,7 @@ async def run_multiple_tasks( headless: bool = False, use_vision: bool = True, use_serp: bool = False, + browser: str = 'local', enable_memory: bool = False, memory_interval: int = 10, max_actions_per_step: int = 10, @@ -2249,8 +2385,10 @@ async def run_multiple_tasks( headless=headless, use_vision=use_vision, semaphore_runs=semaphore_runs, # Pass the semaphore + auth_distribution=auth_distribution, # Pass the pre-fetched auth distribution github_workflow_url=github_workflow_url, use_serp=use_serp, + browser=browser, enable_memory=enable_memory, memory_interval=memory_interval, max_actions_per_step=max_actions_per_step, @@ -2373,6 +2511,109 @@ def fetch_tasks_from_server(convex_url: str, secret_key: str, test_case_name: st return None +# Helper function to fetch auth distribution from the server +def fetch_auth_distribution_from_server(convex_url: str, secret_key: str): + """Fetches an available auth distribution from the Convex HTTP endpoint.""" + + if not convex_url: + logger.error('Error: EVALUATION_TOOL_URL environment variable not set.') + return None + + if not secret_key: + logger.error('Error: EVALUATION_TOOL_SECRET_KEY environment variable not set.') + return None + + endpoint_url = f'{convex_url}/api/getAuthDistribution' + headers = { + 'Authorization': f'Bearer {secret_key}', + 'Content-Type': 'application/json', + } + + logger.info(f'Fetching auth distribution from {endpoint_url}...') + + try: + response = requests.post(endpoint_url, headers=headers, json={}) + + logger.info(f'Fetch Auth Distribution Status Code: {response.status_code}') + + if response.status_code == 200: + try: + data = response.json() + logger.info('Successfully fetched auth distribution data.') + # Verify the response has the expected structure + if isinstance(data, dict) and 'id' in data and 'loginInfo' in data: + return data + else: + logger.error( + f'Error: Fetched auth distribution data has unexpected structure. Keys: {list(data.keys()) if isinstance(data, dict) else "Not a dict"}' + ) + logger.error(f'Raw response: {response.text}') + return None + + except json.JSONDecodeError: + logger.error('Error: Failed to decode JSON response for auth distribution.') + logger.error(f'Raw response text: {response.text}') + return None + elif response.status_code == 404: + logger.warning('No available auth distribution found on server.') + return None + else: + logger.error(f'Error: Failed to fetch auth distribution. Status: {response.status_code}') + logger.error(f'Response: {response.text}') + return None + + except requests.exceptions.RequestException as e: + logger.error(f'Error during request to fetch auth distribution: {type(e).__name__}: {e}') + return None + + +# Helper function to format auth information for the agent +def format_auth_info_for_agent(auth_distribution: dict, auth_keys: list[str]) -> str: + """ + Formats auth information from auth distribution for the agent task description. + + Args: + auth_distribution: Dict with 'loginInfo' key containing auth data + auth_keys: List of auth keys to extract (e.g., ['google', 'facebook']) + + Returns: + Formatted string with login credentials or empty string if no matching keys + """ + if not auth_distribution or not auth_keys: + return '' + + login_info = auth_distribution.get('loginInfo', {}) + if not login_info: + logger.warning('Auth distribution has no loginInfo') + return '' + + # Extract relevant auth information based on auth_keys + relevant_auths = [] + for auth_key in auth_keys: + if auth_key in login_info: + auth_data = login_info[auth_key] + if isinstance(auth_data, dict): + # Format the auth data for this key + auth_details = [] + for key, value in auth_data.items(): + auth_details.append(f'{key}: {value}') + + if auth_details: + relevant_auths.append(f'{auth_key} with {", ".join(auth_details)}') + else: + logger.warning(f"Auth data for key '{auth_key}' is not a dictionary: {type(auth_data)}") + else: + logger.warning(f"Auth key '{auth_key}' not found in available login info. Available keys: {list(login_info.keys())}") + + if relevant_auths: + auth_text = f'\n\nThe following login credentials can be used to complete this task: {"; ".join(relevant_auths)}.' + logger.info(f'Formatted auth info: {auth_text}') + return auth_text + else: + logger.warning(f'No matching auth keys found. Requested: {auth_keys}, Available: {list(login_info.keys())}') + return '' + + # Helper function to get git information def get_git_info(): """Retrieves git branch, commit hash, commit timestamp, and repository URL using subprocess.""" @@ -2602,6 +2843,7 @@ async def run_evaluation_pipeline( convex_url: str, secret_key: str, eval_model: BaseChatModel, + auth_distribution: dict | None = None, github_workflow_url: str | None = None, max_parallel_runs: int = 3, max_steps_per_task: int = 25, @@ -2610,6 +2852,7 @@ async def run_evaluation_pipeline( headless: bool = False, use_vision: bool = True, use_serp: bool = False, + browser: str = 'local', enable_memory: bool = False, memory_interval: int = 10, max_actions_per_step: int = 10, @@ -2656,6 +2899,7 @@ async def run_evaluation_pipeline( convex_url=convex_url, secret_key=secret_key, eval_model=eval_model, + auth_distribution=auth_distribution, github_workflow_url=github_workflow_url, max_parallel_runs=max_parallel_runs, max_steps_per_task=max_steps_per_task, @@ -2664,6 +2908,7 @@ async def run_evaluation_pipeline( headless=headless, use_vision=use_vision, use_serp=use_serp, + browser=browser, enable_memory=enable_memory, memory_interval=memory_interval, max_actions_per_step=max_actions_per_step, @@ -2796,6 +3041,12 @@ if __name__ == '__main__': parser.add_argument('--eval-group', type=str, default='', help='Evaluation group to include in the run') parser.add_argument('--developer-id', type=str, default=None, help='Name of the developer starting the run') parser.add_argument('--use-serp', action='store_true', help='Use SERP search instead of Google search') + parser.add_argument( + '--browser', + type=str, + default='local', + help='Browser to use: local, anchor-browser, brightdata, browser-use (default: local)', + ) parser.add_argument('--enable-memory', action='store_true', help='Enable mem0 memory system for agents') parser.add_argument('--memory-interval', type=int, default=10, help='Memory creation interval (default: 10 steps)') parser.add_argument('--max-actions-per-step', type=int, default=10, help='Maximum number of actions per step (default: 10)') @@ -2837,7 +3088,6 @@ if __name__ == '__main__': ) parser.add_argument('--use-mind2web-judge', action='store_true', help='Use original judge') parser.add_argument('--no-thinking', action='store_true', help='Disable thinking in agent system prompt') - parser.add_argument('--use-anchor', action='store_true', help='Use anchor to navigate to the page') parser.add_argument('--github-workflow-url', type=str, default=None, help='GitHub workflow URL for tracking') # Gmail 2FA support arguments @@ -2934,6 +3184,7 @@ if __name__ == '__main__': # --- Load Tasks (Either Single Task or from Server) --- tasks = [] task_id = None # Initialize for proper scoping + auth_distribution = None # Initialize auth distribution # Check if this is single task mode if args.task_text: @@ -2968,10 +3219,28 @@ if __name__ == '__main__': logger.info(f'Successfully loaded {len(tasks)} tasks from the server.') except (TypeError, ValueError) as e: logger.error( - f'Error creating Task objects from fetched data. Ensure the data structure includes required fields (task_id, confirmed_task). Known optional fields: website, reference_length, level, cluster_id, login_cookie, login_type, category. Any additional fields will be accepted dynamically. Error: {type(e).__name__}: {e}' + f'Error creating Task objects from fetched data. Ensure the data structure includes required fields (task_id, confirmed_task). Known optional fields: website, reference_length, level, cluster_id, login_cookie, login_type, category, auth_keys. Any additional fields will be accepted dynamically. Error: {type(e).__name__}: {e}' ) logger.error(f'First item in fetched data: {fetched_task_data[0] if fetched_task_data else "None"}') exit(1) + + # --- Fetch Auth Distribution Once (if any tasks need auth) --- + tasks_with_auth = [ + task + for task in tasks + if hasattr(task, 'auth_keys') and task.auth_keys and isinstance(task.auth_keys, list) and len(task.auth_keys) > 0 + ] + if tasks_with_auth and CONVEX_URL and SECRET_KEY: + logger.info(f'Found {len(tasks_with_auth)} tasks requiring auth. Fetching auth distribution...') + auth_distribution = fetch_auth_distribution_from_server(CONVEX_URL, SECRET_KEY) + if auth_distribution: + logger.info( + f'Successfully fetched auth distribution with login info for: {list(auth_distribution.get("loginInfo", {}).keys())}' + ) + else: + logger.warning('Failed to fetch auth distribution. Tasks requiring auth may fail.') + elif tasks_with_auth: + logger.warning(f'Found {len(tasks_with_auth)} tasks requiring auth but no server config available') # ----------------------------- # --- Start Run on Server (with optional existing Run ID) --- @@ -3048,6 +3317,22 @@ if __name__ == '__main__': else: logger.info('🔍 Using default Google search') + # Log browser mode being used + if args.browser == 'anchor-browser': + if ANCHOR_BROWSER_API_KEY: + logger.info('🌐 Using Anchor Browser (remote browser service)') + else: + logger.warning('⚠️ --browser anchor-browser provided but ANCHOR_BROWSER_API_KEY not set. Will use local browser!') + elif args.browser == 'brightdata': + if BRIGHTDATA_CDP_URL: + logger.info('🌐 Using Brightdata browser (remote browser service)') + else: + logger.warning('⚠️ --browser brightdata provided but BRIGHTDATA_CDP_URL not set. Will use local browser!') + elif args.browser == 'browser-use': + logger.warning('🌐 Browser-use not implemented yet. Will use local browser!') + else: + logger.info('🌐 Using local browser') + # Log memory configuration if args.enable_memory: logger.info(f'🧠 Memory enabled: mem0 system with interval={args.memory_interval} steps') @@ -3137,6 +3422,7 @@ if __name__ == '__main__': convex_url=convex_url, secret_key=secret_key, eval_model=eval_model, + auth_distribution=auth_distribution, github_workflow_url=args.github_workflow_url, max_parallel_runs=parallel_runs, max_steps_per_task=args.max_steps, @@ -3145,6 +3431,7 @@ if __name__ == '__main__': headless=args.headless, use_vision=not args.no_vision, use_serp=args.use_serp, + browser=args.browser, enable_memory=args.enable_memory, memory_interval=args.memory_interval, max_actions_per_step=args.max_actions_per_step,