From b6482bd1942b75cc06641f2b974a9f69dd72dddd Mon Sep 17 00:00:00 2001 From: BroskyBrowser Date: Sun, 1 Jun 2025 11:39:10 +0200 Subject: [PATCH 01/14] feat: add support for anchor browser when running evals --- eval/service.py | 82 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/eval/service.py b/eval/service.py index ca502fd5c..fce8e41d1 100644 --- a/eval/service.py +++ b/eval/service.py @@ -42,10 +42,12 @@ import asyncio import base64 import io import logging +import os import re import shutil import anyio +import requests from PIL import Image MAX_IMAGE = 5 @@ -54,6 +56,37 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %( logger = logging.getLogger(__name__) +def create_anchor_browser_session(api_key: str, headless: bool = False) -> str: + """Create an Anchor Browser session and return CDP URL""" + browser_configuration = { + 'session': {'proxy': {'type': 'anchor_residential', 'active': True}}, + 'browser': {'adblock': {'active': True}, 'captcha_solver': {'active': True}, 'headless': {'active': headless}}, + } + + try: + response = requests.post( + 'https://api.anchorbrowser.io/v1/sessions', + headers={ + 'anchor-api-key': api_key, + 'Content-Type': 'application/json', + }, + json=browser_configuration, + ) + response.raise_for_status() + session_data = response.json()['data'] + session_id = session_data['id'] + + # Return only the CDP URL + return f'wss://connect.anchorbrowser.io?apiKey={api_key}&sessionId={session_id}' + + except requests.RequestException as e: + logger.error(f'Failed to create Anchor Browser session: {type(e).__name__}: {e}') + raise + except KeyError as e: + logger.error(f'Unexpected response format from Anchor Browser API: {e}') + raise + + def encode_image(image): """Convert a PIL image to base64 string.""" if image.mode == 'RGBA': @@ -273,13 +306,11 @@ async def Online_Mind2Web_eval_with_retry(task, last_actions, images_path, model import argparse import http.client import json -import os import subprocess import time from datetime import datetime from pathlib import Path -import requests from dotenv import load_dotenv from langchain_anthropic import ChatAnthropic from langchain_core.language_models.chat_models import BaseChatModel @@ -1009,20 +1040,41 @@ async def load_existing_result(task_folder: Path) -> dict: async def setup_browser_session(task: Task, headless: bool) -> BrowserSession: """Setup browser session for the task""" - logger.debug(f'Browser setup: Creating unique user data directory for task {task.task_id}') - # Create unique user data directory - base_user_data_dir = Path(BrowserProfile().user_data_dir).parent - unique_user_data_dir = base_user_data_dir / f'task_{task.task_id}' - unique_user_data_dir.mkdir(parents=True, exist_ok=True) - logger.debug(f'Browser setup: Initializing BrowserSession for task {task.task_id}') - browser_session = BrowserSession( - browser_profile=BrowserProfile( - user_data_dir=str(unique_user_data_dir), - headless=headless, - chromium_sandbox=False, - ), - ) + # Check for Anchor Browser API key + anchor_api_key = os.getenv('ANCHOR_API_KEY') + cdp_url = None + + if anchor_api_key: + try: + logger.debug(f'Browser setup: Creating Anchor Browser session for task {task.task_id}') + cdp_url = await asyncio.to_thread(create_anchor_browser_session, anchor_api_key, headless) + except Exception as e: + logger.error( + f'Browser setup: Failed to create Anchor Browser session for task {task.task_id}: {type(e).__name__}: {e}' + ) + logger.info(f'Browser setup: Falling back to local browser for task {task.task_id}') + cdp_url = None + + if cdp_url: + # Use Anchor Browser + browser_session = BrowserSession(cdp_url=cdp_url) + else: + # Use local browser + logger.debug(f'Browser setup: Creating unique user data directory for task {task.task_id}') + # Create unique user data directory + base_user_data_dir = Path(BrowserProfile().user_data_dir).parent + unique_user_data_dir = base_user_data_dir / f'task_{task.task_id}' + unique_user_data_dir.mkdir(parents=True, exist_ok=True) + + logger.debug(f'Browser setup: Initializing BrowserSession for task {task.task_id}') + browser_session = BrowserSession( + browser_profile=BrowserProfile( + user_data_dir=str(unique_user_data_dir), + headless=headless, + chromium_sandbox=False, + ) + ) # Start browser session logger.debug(f'Browser setup: Starting browser session for task {task.task_id}') From 6f438845da2a521e4534022b2201db482a3c82eb Mon Sep 17 00:00:00 2001 From: BroskyBrowser Date: Sun, 1 Jun 2025 11:50:41 +0200 Subject: [PATCH 02/14] fix: improve anchor browser variable name --- eval/service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval/service.py b/eval/service.py index fce8e41d1..3a43595de 100644 --- a/eval/service.py +++ b/eval/service.py @@ -1042,7 +1042,7 @@ async def setup_browser_session(task: Task, headless: bool) -> BrowserSession: """Setup browser session for the task""" # Check for Anchor Browser API key - anchor_api_key = os.getenv('ANCHOR_API_KEY') + anchor_api_key = os.getenv('ANCHOR_BROWSER_API_KEY') cdp_url = None if anchor_api_key: From 6208be68b0113b6bb30a244cbef24a75eaeac0de Mon Sep 17 00:00:00 2001 From: BroskyBrowser Date: Sun, 1 Jun 2025 12:27:09 +0200 Subject: [PATCH 03/14] fix: improved anchor browser variable logic --- eval/service.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/eval/service.py b/eval/service.py index 3a43595de..5615f184d 100644 --- a/eval/service.py +++ b/eval/service.py @@ -55,8 +55,12 @@ MAX_IMAGE = 5 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +# Check for Anchor Browser API key +ANCHOR_BROWSER_API_KEY = os.getenv('ANCHOR_BROWSER_API_KEY') +if not ANCHOR_BROWSER_API_KEY: + logger.warning('ANCHOR_BROWSER_API_KEY is not set. Tasks will use local browser.') -def create_anchor_browser_session(api_key: str, headless: bool = False) -> str: +def create_anchor_browser_session(headless: bool = False) -> str: """Create an Anchor Browser session and return CDP URL""" browser_configuration = { 'session': {'proxy': {'type': 'anchor_residential', 'active': True}}, @@ -67,7 +71,7 @@ def create_anchor_browser_session(api_key: str, headless: bool = False) -> str: response = requests.post( 'https://api.anchorbrowser.io/v1/sessions', headers={ - 'anchor-api-key': api_key, + 'anchor-api-key': ANCHOR_BROWSER_API_KEY, 'Content-Type': 'application/json', }, json=browser_configuration, @@ -77,7 +81,7 @@ def create_anchor_browser_session(api_key: str, headless: bool = False) -> str: session_id = session_data['id'] # Return only the CDP URL - return f'wss://connect.anchorbrowser.io?apiKey={api_key}&sessionId={session_id}' + return f'wss://connect.anchorbrowser.io?apiKey={ANCHOR_BROWSER_API_KEY}&sessionId={session_id}' except requests.RequestException as e: logger.error(f'Failed to create Anchor Browser session: {type(e).__name__}: {e}') @@ -1042,13 +1046,12 @@ async def setup_browser_session(task: Task, headless: bool) -> BrowserSession: """Setup browser session for the task""" # Check for Anchor Browser API key - anchor_api_key = os.getenv('ANCHOR_BROWSER_API_KEY') cdp_url = None - if anchor_api_key: + if ANCHOR_BROWSER_API_KEY: try: logger.debug(f'Browser setup: Creating Anchor Browser session for task {task.task_id}') - cdp_url = await asyncio.to_thread(create_anchor_browser_session, anchor_api_key, headless) + cdp_url = await asyncio.to_thread(create_anchor_browser_session, headless) except Exception as e: logger.error( f'Browser setup: Failed to create Anchor Browser session for task {task.task_id}: {type(e).__name__}: {e}' From bc284f19eb4417a29310e076ef2af57571e127c7 Mon Sep 17 00:00:00 2001 From: BroskyBrowser Date: Sun, 1 Jun 2025 12:29:50 +0200 Subject: [PATCH 04/14] fix: improved anchor browser variable log --- eval/service.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/eval/service.py b/eval/service.py index 5615f184d..f5c3ae2a0 100644 --- a/eval/service.py +++ b/eval/service.py @@ -57,7 +57,9 @@ logger = logging.getLogger(__name__) # Check for Anchor Browser API key ANCHOR_BROWSER_API_KEY = os.getenv('ANCHOR_BROWSER_API_KEY') -if not ANCHOR_BROWSER_API_KEY: +if ANCHOR_BROWSER_API_KEY: + logger.info('ANCHOR_BROWSER_API_KEY is set. Tasks will use Anchor Browser.') +else: logger.warning('ANCHOR_BROWSER_API_KEY is not set. Tasks will use local browser.') def create_anchor_browser_session(headless: bool = False) -> str: From 0b02c590a7a327817c933b3f0b09f8ff83aec059 Mon Sep 17 00:00:00 2001 From: BroskyBrowser Date: Sun, 1 Jun 2025 12:47:27 +0200 Subject: [PATCH 05/14] fix: load dotenv call --- eval/service.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/eval/service.py b/eval/service.py index f5c3ae2a0..fe4b2ee52 100644 --- a/eval/service.py +++ b/eval/service.py @@ -48,6 +48,7 @@ import shutil import anyio import requests +from dotenv import load_dotenv from PIL import Image MAX_IMAGE = 5 @@ -55,6 +56,9 @@ MAX_IMAGE = 5 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +# Load dotenv +load_dotenv() + # Check for Anchor Browser API key ANCHOR_BROWSER_API_KEY = os.getenv('ANCHOR_BROWSER_API_KEY') if ANCHOR_BROWSER_API_KEY: From ad3359a299e03e28a95b484309bcf09f66c6421d Mon Sep 17 00:00:00 2001 From: BroskyBrowser Date: Sun, 1 Jun 2025 12:52:03 +0200 Subject: [PATCH 06/14] fix: expose ANCHOR_BROWSER_API_KEY variable in the workflow .yaml --- .github/workflows/eval.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml index d01853a62..c285f81a7 100644 --- a/.github/workflows/eval.yaml +++ b/.github/workflows/eval.yaml @@ -17,6 +17,7 @@ jobs: XAI_API_KEY: ${{ secrets.XAI_API_KEY }} EVALUATION_TOOL_URL: ${{ secrets.EVALUATION_TOOL_URL }} EVALUATION_TOOL_SECRET_KEY: ${{ secrets.EVALUATION_TOOL_SECRET_KEY }} + ANCHOR_BROWSER_API_KEY: ${{ secrets.ANCHOR_BROWSER_API_KEY }} steps: - name: Determine branch to checkout From c14a36329929d6f55588109b265bee5fad99bdc3 Mon Sep 17 00:00:00 2001 From: BroskyBrowser Date: Mon, 30 Jun 2025 11:35:14 +0200 Subject: [PATCH 07/14] feat: added --use-anchor flag support --- eval/service.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/eval/service.py b/eval/service.py index fe4b2ee52..0a925c346 100644 --- a/eval/service.py +++ b/eval/service.py @@ -66,6 +66,7 @@ if ANCHOR_BROWSER_API_KEY: else: logger.warning('ANCHOR_BROWSER_API_KEY is not set. Tasks will use local browser.') + def create_anchor_browser_session(headless: bool = False) -> str: """Create an Anchor Browser session and return CDP URL""" browser_configuration = { @@ -1048,13 +1049,13 @@ async def load_existing_result(task_folder: Path) -> dict: return existing_result -async def setup_browser_session(task: Task, headless: bool) -> BrowserSession: +async def setup_browser_session(task: Task, headless: bool, use_anchor: bool = False) -> BrowserSession: """Setup browser session for the task""" - # Check for Anchor Browser API key + # Check for Anchor Browser API key and flag cdp_url = None - if ANCHOR_BROWSER_API_KEY: + if use_anchor and ANCHOR_BROWSER_API_KEY: try: logger.debug(f'Browser setup: Creating Anchor Browser session for task {task.task_id}') cdp_url = await asyncio.to_thread(create_anchor_browser_session, headless) @@ -1064,6 +1065,10 @@ async def setup_browser_session(task: Task, headless: bool) -> BrowserSession: ) logger.info(f'Browser setup: Falling back to local browser for task {task.task_id}') cdp_url = None + elif use_anchor and not ANCHOR_BROWSER_API_KEY: + logger.warning( + f'Browser setup: Anchor Browser requested but ANCHOR_BROWSER_API_KEY not set. Using local browser for task {task.task_id}' + ) if cdp_url: # Use Anchor Browser @@ -1194,6 +1199,7 @@ async def run_task_with_semaphore( semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument fresh_start: bool = True, use_serp: bool = False, + use_anchor: bool = False, enable_memory: bool = False, memory_interval: int = 10, max_actions_per_step: int = 10, @@ -1236,7 +1242,7 @@ async def run_task_with_semaphore( try: logger.info(f'Task {task.task_id}: Browser setup starting.') browser_session = await run_stage( - Stage.SETUP_BROWSER, lambda: setup_browser_session(task, headless), timeout=120 + Stage.SETUP_BROWSER, lambda: setup_browser_session(task, headless, use_anchor), timeout=120 ) task_result.stage_completed(Stage.SETUP_BROWSER) logger.info(f'Task {task.task_id}: Browser session started successfully.') @@ -1431,6 +1437,7 @@ async def run_multiple_tasks( use_vision: bool = True, fresh_start: bool = True, use_serp: bool = False, + use_anchor: bool = False, enable_memory: bool = False, memory_interval: int = 10, max_actions_per_step: int = 10, @@ -1463,6 +1470,7 @@ async def run_multiple_tasks( semaphore_runs=semaphore_runs, # Pass the semaphore fresh_start=fresh_start, use_serp=use_serp, + use_anchor=use_anchor, enable_memory=enable_memory, memory_interval=memory_interval, max_actions_per_step=max_actions_per_step, @@ -1704,6 +1712,7 @@ if __name__ == '__main__': parser.add_argument('--eval-group', type=str, default='', help='Evaluation group to include in the run') parser.add_argument('--developer-id', type=str, default=None, help='Name of the developer starting the run') parser.add_argument('--use-serp', action='store_true', help='Use SERP search instead of Google search') + parser.add_argument('--use-anchor', action='store_true', help='Use Anchor Browser (requires ANCHOR_BROWSER_API_KEY)') parser.add_argument('--enable-memory', action='store_true', help='Enable mem0 memory system for agents') parser.add_argument('--memory-interval', type=int, default=10, help='Memory creation interval (default: 10 steps)') parser.add_argument('--max-actions-per-step', type=int, default=10, help='Maximum number of actions per step (default: 10)') @@ -1840,6 +1849,15 @@ if __name__ == '__main__': else: logger.info('🔍 Using default Google search') + # Log browser mode being used + if args.use_anchor: + if ANCHOR_BROWSER_API_KEY: + logger.info('🌐 Using Anchor Browser (remote browser service)') + else: + logger.warning('⚠️ --use-anchor flag provided but ANCHOR_BROWSER_API_KEY not set. Will use local browser!') + else: + logger.info('🌐 Using local browser') + # Log memory configuration if args.enable_memory: logger.info(f'🧠 Memory enabled: mem0 system with interval={args.memory_interval} steps') @@ -1912,6 +1930,7 @@ if __name__ == '__main__': use_vision=not args.no_vision, fresh_start=args.fresh_start, use_serp=args.use_serp, + use_anchor=args.use_anchor, enable_memory=args.enable_memory, memory_interval=args.memory_interval, max_actions_per_step=args.max_actions_per_step, From 3b38d956016d349e04b6558ded31e8d7ff79afa4 Mon Sep 17 00:00:00 2001 From: BroskyBrowser Date: Mon, 30 Jun 2025 20:26:45 +0200 Subject: [PATCH 08/14] chore: linter --- eval/service.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/eval/service.py b/eval/service.py index 5c32cd544..0ea745eff 100644 --- a/eval/service.py +++ b/eval/service.py @@ -2333,6 +2333,7 @@ async def run_evaluation_pipeline( headless: bool = False, use_vision: bool = True, use_serp: bool = False, + use_anchor: bool = False, enable_memory: bool = False, memory_interval: int = 10, max_actions_per_step: int = 10, @@ -2385,6 +2386,7 @@ async def run_evaluation_pipeline( headless=headless, use_vision=use_vision, use_serp=use_serp, + use_anchor=use_anchor, enable_memory=enable_memory, memory_interval=memory_interval, max_actions_per_step=max_actions_per_step, From e8db375401582cc83bc1d4d86cdb0ec84dfb4576 Mon Sep 17 00:00:00 2001 From: Alezander9 Date: Wed, 2 Jul 2025 16:30:48 -0700 Subject: [PATCH 09/14] make eval service fetch rotating auth info from server --- eval/service.py | 168 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 165 insertions(+), 3 deletions(-) diff --git a/eval/service.py b/eval/service.py index 0a2a63018..d5386a1e4 100644 --- a/eval/service.py +++ b/eval/service.py @@ -1119,6 +1119,7 @@ class Task: self.login_type = kwargs.get('login_type', None) self.category = kwargs.get('category', None) self.output_schema = kwargs.get('output_schema', None) # Add structured output schema support + self.auth_keys = kwargs.get('auth_keys', None) # List of auth keys to fetch from auth distribution if self.output_schema: # Convert JSON schema to Pydantic model class self.output_model = create_pydantic_model_from_schema(self.output_schema, f'Task_{self.task_id}_Output') @@ -1135,6 +1136,7 @@ class Task: 'login_type', 'category', 'output_schema', + 'auth_keys', } self.additional_fields = {k: v for k, v in kwargs.items() if k not in known_fields} @@ -1144,7 +1146,7 @@ class Task: def __str__(self): # Include main fields and indicate if there are additional fields - base_str = f'Task(task_id={self.task_id}, confirmed_task={self.confirmed_task}, website={self.website}, reference_length={self.reference_length}, level={self.level}, cluster_id={self.cluster_id}, login_cookie={self.login_cookie}, login_type={self.login_type}, category={self.category}, output_schema={self.output_schema}' + base_str = f'Task(task_id={self.task_id}, confirmed_task={self.confirmed_task}, website={self.website}, reference_length={self.reference_length}, level={self.level}, cluster_id={self.cluster_id}, login_cookie={self.login_cookie}, login_type={self.login_type}, category={self.category}, output_schema={self.output_schema}, auth_keys={self.auth_keys}' if self.additional_fields: additional_str = ', '.join(f'{k}={v}' for k, v in self.additional_fields.items()) base_str += f', {additional_str}' @@ -1603,6 +1605,7 @@ async def run_task_with_semaphore( headless: bool, use_vision: bool, semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument + auth_distribution: dict | None = None, # Pre-fetched auth distribution github_workflow_url: str | None = None, use_serp: bool = False, enable_memory: bool = False, @@ -1727,6 +1730,38 @@ async def run_task_with_semaphore( convex_url, secret_key, run_id, task.task_id, 'run_agent', 'active', github_workflow_url ) + # Handle auth information if task requires it + task_with_auth = task + if hasattr(task, 'auth_keys') and task.auth_keys: + # Validate auth_keys is a list + if isinstance(task.auth_keys, list) and len(task.auth_keys) > 0: + if auth_distribution: + logger.info( + f'Task {task.task_id}: Using pre-fetched auth distribution for auth_keys: {task.auth_keys}' + ) + auth_info_text = format_auth_info_for_agent(auth_distribution, task.auth_keys) + if auth_info_text: + # Create a modified task with auth info appended + class TaskWithAuth: + def __init__(self, original_task, auth_text): + # Copy all attributes from original task + for attr_name in dir(original_task): + if not attr_name.startswith('__'): + setattr(self, attr_name, getattr(original_task, attr_name)) + # Modify the confirmed_task to include auth info + self.confirmed_task = original_task.confirmed_task + auth_text + + task_with_auth = TaskWithAuth(task, auth_info_text) + logger.info(f'Task {task.task_id}: Auth info added to task description') + else: + logger.warning( + f'Task {task.task_id}: No matching auth info found for keys: {task.auth_keys}' + ) + else: + logger.warning(f'Task {task.task_id}: Auth keys specified but no auth distribution available') + else: + logger.warning(f'Task {task.task_id}: auth_keys is not a valid list: {task.auth_keys}') + # Start timing for agent execution only agent_start_time = time.time() @@ -1734,7 +1769,7 @@ async def run_task_with_semaphore( Stage.RUN_AGENT, lambda: run_agent_with_browser( browser_session, - task, + task_with_auth, llm, max_steps_per_task, use_vision, @@ -1987,6 +2022,7 @@ async def run_multiple_tasks( convex_url: str, secret_key: str, eval_model: BaseChatModel, + auth_distribution: dict | None = None, github_workflow_url: str | None = None, max_parallel_runs: int = 3, max_steps_per_task: int = 25, @@ -2072,6 +2108,7 @@ async def run_multiple_tasks( headless=headless, use_vision=use_vision, semaphore_runs=semaphore_runs, # Pass the semaphore + auth_distribution=auth_distribution, # Pass the pre-fetched auth distribution github_workflow_url=github_workflow_url, use_serp=use_serp, enable_memory=enable_memory, @@ -2195,6 +2232,109 @@ def fetch_tasks_from_server(convex_url: str, secret_key: str, test_case_name: st return None +# Helper function to fetch auth distribution from the server +def fetch_auth_distribution_from_server(convex_url: str, secret_key: str): + """Fetches an available auth distribution from the Convex HTTP endpoint.""" + + if not convex_url: + logger.error('Error: EVALUATION_TOOL_URL environment variable not set.') + return None + + if not secret_key: + logger.error('Error: EVALUATION_TOOL_SECRET_KEY environment variable not set.') + return None + + endpoint_url = f'{convex_url}/api/getAuthDistribution' + headers = { + 'Authorization': f'Bearer {secret_key}', + 'Content-Type': 'application/json', + } + + logger.info(f'Fetching auth distribution from {endpoint_url}...') + + try: + response = requests.post(endpoint_url, headers=headers, json={}) + + logger.info(f'Fetch Auth Distribution Status Code: {response.status_code}') + + if response.status_code == 200: + try: + data = response.json() + logger.info('Successfully fetched auth distribution data.') + # Verify the response has the expected structure + if isinstance(data, dict) and 'id' in data and 'loginInfo' in data: + return data + else: + logger.error( + f'Error: Fetched auth distribution data has unexpected structure. Keys: {list(data.keys()) if isinstance(data, dict) else "Not a dict"}' + ) + logger.error(f'Raw response: {response.text}') + return None + + except json.JSONDecodeError: + logger.error('Error: Failed to decode JSON response for auth distribution.') + logger.error(f'Raw response text: {response.text}') + return None + elif response.status_code == 404: + logger.warning('No available auth distribution found on server.') + return None + else: + logger.error(f'Error: Failed to fetch auth distribution. Status: {response.status_code}') + logger.error(f'Response: {response.text}') + return None + + except requests.exceptions.RequestException as e: + logger.error(f'Error during request to fetch auth distribution: {type(e).__name__}: {e}') + return None + + +# Helper function to format auth information for the agent +def format_auth_info_for_agent(auth_distribution: dict, auth_keys: list[str]) -> str: + """ + Formats auth information from auth distribution for the agent task description. + + Args: + auth_distribution: Dict with 'loginInfo' key containing auth data + auth_keys: List of auth keys to extract (e.g., ['google', 'facebook']) + + Returns: + Formatted string with login credentials or empty string if no matching keys + """ + if not auth_distribution or not auth_keys: + return '' + + login_info = auth_distribution.get('loginInfo', {}) + if not login_info: + logger.warning('Auth distribution has no loginInfo') + return '' + + # Extract relevant auth information based on auth_keys + relevant_auths = [] + for auth_key in auth_keys: + if auth_key in login_info: + auth_data = login_info[auth_key] + if isinstance(auth_data, dict): + # Format the auth data for this key + auth_details = [] + for key, value in auth_data.items(): + auth_details.append(f'{key}: {value}') + + if auth_details: + relevant_auths.append(f'{auth_key} with {", ".join(auth_details)}') + else: + logger.warning(f"Auth data for key '{auth_key}' is not a dictionary: {type(auth_data)}") + else: + logger.warning(f"Auth key '{auth_key}' not found in available login info. Available keys: {list(login_info.keys())}") + + if relevant_auths: + auth_text = f'\n\nThe following login credentials can be used to complete this task: {"; ".join(relevant_auths)}.' + logger.info(f'Formatted auth info: {auth_text}') + return auth_text + else: + logger.warning(f'No matching auth keys found. Requested: {auth_keys}, Available: {list(login_info.keys())}') + return '' + + # Helper function to get git information def get_git_info(): """Retrieves git branch, commit hash, commit timestamp, and repository URL using subprocess.""" @@ -2424,6 +2564,7 @@ async def run_evaluation_pipeline( convex_url: str, secret_key: str, eval_model: BaseChatModel, + auth_distribution: dict | None = None, github_workflow_url: str | None = None, max_parallel_runs: int = 3, max_steps_per_task: int = 25, @@ -2477,6 +2618,7 @@ async def run_evaluation_pipeline( convex_url=convex_url, secret_key=secret_key, eval_model=eval_model, + auth_distribution=auth_distribution, github_workflow_url=github_workflow_url, max_parallel_runs=max_parallel_runs, max_steps_per_task=max_steps_per_task, @@ -2585,6 +2727,7 @@ if __name__ == '__main__': # --- Load Tasks (Either Single Task or from Server) --- tasks = [] task_id = None # Initialize for proper scoping + auth_distribution = None # Initialize auth distribution # Check if this is single task mode if args.task_text: @@ -2619,10 +2762,28 @@ if __name__ == '__main__': logger.info(f'Successfully loaded {len(tasks)} tasks from the server.') except (TypeError, ValueError) as e: logger.error( - f'Error creating Task objects from fetched data. Ensure the data structure includes required fields (task_id, confirmed_task). Known optional fields: website, reference_length, level, cluster_id, login_cookie, login_type, category. Any additional fields will be accepted dynamically. Error: {type(e).__name__}: {e}' + f'Error creating Task objects from fetched data. Ensure the data structure includes required fields (task_id, confirmed_task). Known optional fields: website, reference_length, level, cluster_id, login_cookie, login_type, category, auth_keys. Any additional fields will be accepted dynamically. Error: {type(e).__name__}: {e}' ) logger.error(f'First item in fetched data: {fetched_task_data[0] if fetched_task_data else "None"}') exit(1) + + # --- Fetch Auth Distribution Once (if any tasks need auth) --- + tasks_with_auth = [ + task + for task in tasks + if hasattr(task, 'auth_keys') and task.auth_keys and isinstance(task.auth_keys, list) and len(task.auth_keys) > 0 + ] + if tasks_with_auth and CONVEX_URL and SECRET_KEY: + logger.info(f'Found {len(tasks_with_auth)} tasks requiring auth. Fetching auth distribution...') + auth_distribution = fetch_auth_distribution_from_server(CONVEX_URL, SECRET_KEY) + if auth_distribution: + logger.info( + f'Successfully fetched auth distribution with login info for: {list(auth_distribution.get("loginInfo", {}).keys())}' + ) + else: + logger.warning('Failed to fetch auth distribution. Tasks requiring auth may fail.') + elif tasks_with_auth: + logger.warning(f'Found {len(tasks_with_auth)} tasks requiring auth but no server config available') # ----------------------------- # --- Start Run on Server (with optional existing Run ID) --- @@ -2788,6 +2949,7 @@ if __name__ == '__main__': convex_url=convex_url, secret_key=secret_key, eval_model=eval_model, + auth_distribution=auth_distribution, github_workflow_url=args.github_workflow_url, max_parallel_runs=parallel_runs, max_steps_per_task=args.max_steps, From 8beabf6970f3ad5ae92373ca34c076e4901a1be5 Mon Sep 17 00:00:00 2001 From: Alezander9 Date: Wed, 2 Jul 2025 17:01:35 -0700 Subject: [PATCH 10/14] fix typing --- eval/service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eval/service.py b/eval/service.py index d5386a1e4..2c46be696 100644 --- a/eval/service.py +++ b/eval/service.py @@ -1742,8 +1742,8 @@ async def run_task_with_semaphore( auth_info_text = format_auth_info_for_agent(auth_distribution, task.auth_keys) if auth_info_text: # Create a modified task with auth info appended - class TaskWithAuth: - def __init__(self, original_task, auth_text): + class TaskWithAuth(Task): + def __init__(self, original_task: Task, auth_text: str): # Copy all attributes from original task for attr_name in dir(original_task): if not attr_name.startswith('__'): From 18fe7620decc4c84c3292b69569f181f0f97d385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 4 Jul 2025 21:21:15 +0200 Subject: [PATCH 11/14] Remove memory logging --- eval/service.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/eval/service.py b/eval/service.py index a82cb4974..8ec72f4e0 100644 --- a/eval/service.py +++ b/eval/service.py @@ -3130,12 +3130,6 @@ if __name__ == '__main__': else: logger.info('🔍 Using default Google search') - # Log memory configuration - if args.enable_memory: - logger.info(f'🧠 Memory enabled: mem0 system with interval={args.memory_interval} steps') - else: - logger.info('🧠 Memory disabled') - # Log browser mode being used if args.use_anchor: if ANCHOR_BROWSER_API_KEY: From 2c564009f7d63c6fd664699fddfbc88b1386e057 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com> Date: Fri, 4 Jul 2025 21:37:18 +0200 Subject: [PATCH 12/14] Remove unused anchor navigation argument from eval service --- eval/service.py | 1 - 1 file changed, 1 deletion(-) diff --git a/eval/service.py b/eval/service.py index 8ec72f4e0..e8421209b 100644 --- a/eval/service.py +++ b/eval/service.py @@ -2919,7 +2919,6 @@ if __name__ == '__main__': ) parser.add_argument('--use-mind2web-judge', action='store_true', help='Use original judge') parser.add_argument('--no-thinking', action='store_true', help='Disable thinking in agent system prompt') - parser.add_argument('--use-anchor', action='store_true', help='Use anchor to navigate to the page') parser.add_argument('--github-workflow-url', type=str, default=None, help='GitHub workflow URL for tracking') # Gmail 2FA support arguments From 8754e22ce3beb987e50884a633b53cced3339851 Mon Sep 17 00:00:00 2001 From: reformedot Date: Fri, 4 Jul 2025 23:40:12 +0200 Subject: [PATCH 13/14] feat: added browser arg to the eval script --- .github/workflows/eval.yaml | 4 +- eval/service.py | 89 ++++++++++++++++++++++++++----------- 2 files changed, 65 insertions(+), 28 deletions(-) diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml index 43e1964b0..24873b780 100644 --- a/.github/workflows/eval.yaml +++ b/.github/workflows/eval.yaml @@ -23,6 +23,7 @@ jobs: EVALUATION_TOOL_URL: ${{ secrets.EVALUATION_TOOL_URL }} EVALUATION_TOOL_SECRET_KEY: ${{ secrets.EVALUATION_TOOL_SECRET_KEY }} ANCHOR_BROWSER_API_KEY: ${{ secrets.ANCHOR_BROWSER_API_KEY }} + BRIGHTDATA_CDP_URL: ${{ secrets.BRIGHTDATA_CDP_URL }} SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }} LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_PROJECT_API_KEY }} BROWSER_USE_LOGGING_LEVEL: ${{ secrets.BROWSER_USE_LOGGING_LEVEL }} @@ -251,6 +252,7 @@ jobs: USER_MESSAGE="${{ github.event.client_payload.script_args.user_message }}" DEVELOPER_ID="${{ github.event.client_payload.script_args.developer_id }}" PLANNER_MODEL="${{ github.event.client_payload.script_args.planner_model }}" + BROWSER="${{ github.event.client_payload.script_args.browser }}" RUN_ID="${{ github.event.client_payload.script_args.run_id }}" LAMINAR_EVAL_ID="${{ github.event.client_payload.script_args.laminar_eval_id }}" # Pass raw GitHub Actions object to Python - no parsing in bash @@ -282,7 +284,6 @@ jobs: [[ "${{ github.event.client_payload.script_args.no_vision }}" == "true" ]] && CMD_ARGS+=("--no-vision") [[ "$HEADLESS" == "true" ]] && CMD_ARGS+=("--headless") [[ "${{ github.event.client_payload.script_args.use_serp }}" == "true" ]] && CMD_ARGS+=("--use-serp") - [[ "${{ github.event.client_payload.script_args.use_anchor }}" == "true" ]] && CMD_ARGS+=("--use-anchor") [[ "${{ github.event.client_payload.script_args.enable_memory }}" == "true" ]] && CMD_ARGS+=("--enable-memory") [[ "${{ github.event.client_payload.script_args.validate_output }}" == "true" ]] && CMD_ARGS+=("--validate-output") [[ "${{ github.event.client_payload.script_args.include_result }}" == "true" ]] && CMD_ARGS+=("--include-result") @@ -294,6 +295,7 @@ jobs: [[ -n "$USER_MESSAGE" ]] && CMD_ARGS+=("--user-message" "$USER_MESSAGE") [[ -n "$DEVELOPER_ID" ]] && CMD_ARGS+=("--developer-id" "$DEVELOPER_ID") [[ -n "$PLANNER_MODEL" ]] && CMD_ARGS+=("--planner-model" "$PLANNER_MODEL") + [[ -n "$BROWSER" ]] && CMD_ARGS+=("--browser" "$BROWSER") [[ -n "$RUN_ID" ]] && CMD_ARGS+=("--run-id" "$RUN_ID") [[ -n "$LAMINAR_EVAL_ID" ]] && CMD_ARGS+=("--laminar-eval-id" "$LAMINAR_EVAL_ID") diff --git a/eval/service.py b/eval/service.py index e8421209b..20b0a3e03 100644 --- a/eval/service.py +++ b/eval/service.py @@ -81,9 +81,16 @@ load_dotenv() # Check for Anchor Browser API key ANCHOR_BROWSER_API_KEY = os.getenv('ANCHOR_BROWSER_API_KEY') if ANCHOR_BROWSER_API_KEY: - logger.info('ANCHOR_BROWSER_API_KEY is set. Tasks will use Anchor Browser.') + logger.info('ANCHOR_BROWSER_API_KEY is set. Tasks can use Anchor Browser.') else: - logger.warning('ANCHOR_BROWSER_API_KEY is not set. Tasks will use local browser.') + logger.warning('ANCHOR_BROWSER_API_KEY is not set. Anchor Browser will not be available.') + +# Check for Brightdata CDP URL +BRIGHTDATA_CDP_URL = os.getenv('BRIGHTDATA_CDP_URL') +if BRIGHTDATA_CDP_URL: + logger.info('BRIGHTDATA_CDP_URL is set. Tasks can use Brightdata browser.') +else: + logger.warning('BRIGHTDATA_CDP_URL is not set. Brightdata browser will not be available.') def create_anchor_browser_session(headless: bool = False) -> str: @@ -1441,27 +1448,43 @@ async def run_stage(stage: Stage, stage_func, timeout: int | None = None): async def setup_browser_session( - task: Task, headless: bool, highlight_elements: bool = True, use_anchor: bool = False + task: Task, headless: bool, highlight_elements: bool = True, browser: str = 'local' ) -> BrowserSession: """Setup browser session for the task""" - # Check for Anchor Browser API key and flag + # Validate browser option + valid_browsers = ['local', 'anchor-browser', 'brightdata', 'browser-use'] + if browser not in valid_browsers: + logger.warning(f'Browser setup: Invalid browser option "{browser}". Falling back to local browser.') + browser = 'local' + cdp_url = None - if use_anchor and ANCHOR_BROWSER_API_KEY: - try: - logger.debug(f'Browser setup: Creating Anchor Browser session for task {task.task_id}') - cdp_url = await asyncio.to_thread(create_anchor_browser_session, headless) - except Exception as e: - logger.error( - f'Browser setup: Failed to create Anchor Browser session for task {task.task_id}: {type(e).__name__}: {e}' + if browser == 'anchor-browser': + if ANCHOR_BROWSER_API_KEY: + try: + logger.debug(f'Browser setup: Creating Anchor Browser session for task {task.task_id}') + cdp_url = await asyncio.to_thread(create_anchor_browser_session, headless) + except Exception as e: + logger.error( + f'Browser setup: Failed to create Anchor Browser session for task {task.task_id}: {type(e).__name__}: {e}' + ) + logger.info(f'Browser setup: Falling back to local browser for task {task.task_id}') + cdp_url = None + else: + logger.warning( + f'Browser setup: Anchor Browser requested but ANCHOR_BROWSER_API_KEY not set. Using local browser for task {task.task_id}' ) - logger.info(f'Browser setup: Falling back to local browser for task {task.task_id}') - cdp_url = None - elif use_anchor and not ANCHOR_BROWSER_API_KEY: - logger.warning( - f'Browser setup: Anchor Browser requested but ANCHOR_BROWSER_API_KEY not set. Using local browser for task {task.task_id}' - ) + elif browser == 'brightdata': + if BRIGHTDATA_CDP_URL: + logger.debug(f'Browser setup: Using Brightdata CDP URL for task {task.task_id}') + cdp_url = BRIGHTDATA_CDP_URL + else: + logger.warning( + f'Browser setup: Brightdata requested but BRIGHTDATA_CDP_URL not set. Using local browser for task {task.task_id}' + ) + elif browser == 'browser-use': + logger.warning(f'Browser setup: Browser-use not implemented yet. Falling back to local browser for task {task.task_id}') profile_kwargs = { 'user_data_dir': None, # Incognito mode - no persistent state @@ -1843,7 +1866,7 @@ async def run_task_with_semaphore( semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument github_workflow_url: str | None = None, use_serp: bool = False, - use_anchor: bool = False, + browser: str = 'local', enable_memory: bool = False, memory_interval: int = 10, max_actions_per_step: int = 10, @@ -1940,7 +1963,7 @@ async def run_task_with_semaphore( browser_session = await run_stage( Stage.SETUP_BROWSER, - lambda: setup_browser_session(task, headless, highlight_elements, use_anchor), + lambda: setup_browser_session(task, headless, highlight_elements, browser), timeout=120, ) task_result.stage_completed(Stage.SETUP_BROWSER) @@ -2248,7 +2271,7 @@ async def run_multiple_tasks( headless: bool = False, use_vision: bool = True, use_serp: bool = False, - use_anchor: bool = False, + browser: str = 'local', enable_memory: bool = False, memory_interval: int = 10, max_actions_per_step: int = 10, @@ -2329,7 +2352,7 @@ async def run_multiple_tasks( semaphore_runs=semaphore_runs, # Pass the semaphore github_workflow_url=github_workflow_url, use_serp=use_serp, - use_anchor=use_anchor, + browser=browser, enable_memory=enable_memory, memory_interval=memory_interval, max_actions_per_step=max_actions_per_step, @@ -2689,7 +2712,7 @@ async def run_evaluation_pipeline( headless: bool = False, use_vision: bool = True, use_serp: bool = False, - use_anchor: bool = False, + browser: str = 'local', enable_memory: bool = False, memory_interval: int = 10, max_actions_per_step: int = 10, @@ -2744,7 +2767,7 @@ async def run_evaluation_pipeline( headless=headless, use_vision=use_vision, use_serp=use_serp, - use_anchor=use_anchor, + browser=browser, enable_memory=enable_memory, memory_interval=memory_interval, max_actions_per_step=max_actions_per_step, @@ -2877,7 +2900,12 @@ if __name__ == '__main__': parser.add_argument('--eval-group', type=str, default='', help='Evaluation group to include in the run') parser.add_argument('--developer-id', type=str, default=None, help='Name of the developer starting the run') parser.add_argument('--use-serp', action='store_true', help='Use SERP search instead of Google search') - parser.add_argument('--use-anchor', action='store_true', help='Use Anchor Browser (requires ANCHOR_BROWSER_API_KEY)') + parser.add_argument( + '--browser', + type=str, + default='local', + help='Browser to use: local, anchor-browser, brightdata, browser-use (default: local)', + ) parser.add_argument('--enable-memory', action='store_true', help='Enable mem0 memory system for agents') parser.add_argument('--memory-interval', type=int, default=10, help='Memory creation interval (default: 10 steps)') parser.add_argument('--max-actions-per-step', type=int, default=10, help='Maximum number of actions per step (default: 10)') @@ -3130,11 +3158,18 @@ if __name__ == '__main__': logger.info('🔍 Using default Google search') # Log browser mode being used - if args.use_anchor: + if args.browser == 'anchor-browser': if ANCHOR_BROWSER_API_KEY: logger.info('🌐 Using Anchor Browser (remote browser service)') else: - logger.warning('⚠️ --use-anchor flag provided but ANCHOR_BROWSER_API_KEY not set. Will use local browser!') + logger.warning('⚠️ --browser anchor-browser provided but ANCHOR_BROWSER_API_KEY not set. Will use local browser!') + elif args.browser == 'brightdata': + if BRIGHTDATA_CDP_URL: + logger.info('🌐 Using Brightdata browser (remote browser service)') + else: + logger.warning('⚠️ --browser brightdata provided but BRIGHTDATA_CDP_URL not set. Will use local browser!') + elif args.browser == 'browser-use': + logger.warning('🌐 Browser-use not implemented yet. Will use local browser!') else: logger.info('🌐 Using local browser') @@ -3235,7 +3270,7 @@ if __name__ == '__main__': headless=args.headless, use_vision=not args.no_vision, use_serp=args.use_serp, - use_anchor=args.use_anchor, + browser=args.browser, enable_memory=args.enable_memory, memory_interval=args.memory_interval, max_actions_per_step=args.max_actions_per_step, From 3dbaea1729c0fa4642d40f2c11c1cb8e6dc1b81a Mon Sep 17 00:00:00 2001 From: reformedot Date: Fri, 4 Jul 2025 23:43:15 +0200 Subject: [PATCH 14/14] fix: improved anchor browser session creation --- eval/service.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/eval/service.py b/eval/service.py index 20b0a3e03..547119a0d 100644 --- a/eval/service.py +++ b/eval/service.py @@ -96,8 +96,13 @@ else: def create_anchor_browser_session(headless: bool = False) -> str: """Create an Anchor Browser session and return CDP URL""" browser_configuration = { - 'session': {'proxy': {'type': 'anchor_residential', 'active': True}}, - 'browser': {'adblock': {'active': True}, 'captcha_solver': {'active': True}, 'headless': {'active': headless}}, + 'session': {'proxy': {'type': 'anchor_mobile', 'active': True, 'country_code': 'us'}}, + 'browser': { + 'adblock': {'active': True}, + 'captcha_solver': {'active': True}, + 'headless': {'active': headless}, + 'extra_stealth': {'active': True}, + }, } try: