Merge branch 'main' into fix-browser-error-recovery

This commit is contained in:
Magnus Müller
2025-07-05 10:26:21 +02:00
committed by GitHub
2 changed files with 304 additions and 15 deletions

View File

@@ -23,6 +23,7 @@ jobs:
EVALUATION_TOOL_URL: ${{ secrets.EVALUATION_TOOL_URL }}
EVALUATION_TOOL_SECRET_KEY: ${{ secrets.EVALUATION_TOOL_SECRET_KEY }}
ANCHOR_BROWSER_API_KEY: ${{ secrets.ANCHOR_BROWSER_API_KEY }}
BRIGHTDATA_CDP_URL: ${{ secrets.BRIGHTDATA_CDP_URL }}
SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_PROJECT_API_KEY }}
BROWSER_USE_LOGGING_LEVEL: ${{ secrets.BROWSER_USE_LOGGING_LEVEL }}
@@ -251,6 +252,7 @@ jobs:
USER_MESSAGE="${{ github.event.client_payload.script_args.user_message }}"
DEVELOPER_ID="${{ github.event.client_payload.script_args.developer_id }}"
PLANNER_MODEL="${{ github.event.client_payload.script_args.planner_model }}"
BROWSER="${{ github.event.client_payload.script_args.browser }}"
RUN_ID="${{ github.event.client_payload.script_args.run_id }}"
LAMINAR_EVAL_ID="${{ github.event.client_payload.script_args.laminar_eval_id }}"
# Pass raw GitHub Actions object to Python - no parsing in bash
@@ -282,7 +284,6 @@ jobs:
[[ "${{ github.event.client_payload.script_args.no_vision }}" == "true" ]] && CMD_ARGS+=("--no-vision")
[[ "$HEADLESS" == "true" ]] && CMD_ARGS+=("--headless")
[[ "${{ github.event.client_payload.script_args.use_serp }}" == "true" ]] && CMD_ARGS+=("--use-serp")
[[ "${{ github.event.client_payload.script_args.use_anchor }}" == "true" ]] && CMD_ARGS+=("--use-anchor")
[[ "${{ github.event.client_payload.script_args.enable_memory }}" == "true" ]] && CMD_ARGS+=("--enable-memory")
[[ "${{ github.event.client_payload.script_args.validate_output }}" == "true" ]] && CMD_ARGS+=("--validate-output")
[[ "${{ github.event.client_payload.script_args.include_result }}" == "true" ]] && CMD_ARGS+=("--include-result")
@@ -294,6 +295,7 @@ jobs:
[[ -n "$USER_MESSAGE" ]] && CMD_ARGS+=("--user-message" "$USER_MESSAGE")
[[ -n "$DEVELOPER_ID" ]] && CMD_ARGS+=("--developer-id" "$DEVELOPER_ID")
[[ -n "$PLANNER_MODEL" ]] && CMD_ARGS+=("--planner-model" "$PLANNER_MODEL")
[[ -n "$BROWSER" ]] && CMD_ARGS+=("--browser" "$BROWSER")
[[ -n "$RUN_ID" ]] && CMD_ARGS+=("--run-id" "$RUN_ID")
[[ -n "$LAMINAR_EVAL_ID" ]] && CMD_ARGS+=("--laminar-eval-id" "$LAMINAR_EVAL_ID")

View File

@@ -44,6 +44,7 @@ import base64
import io
import json
import logging
import os
import re
import signal
import sys
@@ -54,6 +55,8 @@ from uuid import UUID
import anyio
import psutil
import requests
from dotenv import load_dotenv
from lmnr import AsyncLaminarClient, Laminar, observe
from PIL import Image
from pydantic import BaseModel
@@ -71,6 +74,60 @@ MAX_IMAGE = 5
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s: %(message)s')
logger = logging.getLogger(__name__)
# Load dotenv
load_dotenv()
# Check for Anchor Browser API key
ANCHOR_BROWSER_API_KEY = os.getenv('ANCHOR_BROWSER_API_KEY')
if ANCHOR_BROWSER_API_KEY:
logger.info('ANCHOR_BROWSER_API_KEY is set. Tasks can use Anchor Browser.')
else:
logger.warning('ANCHOR_BROWSER_API_KEY is not set. Anchor Browser will not be available.')
# Check for Brightdata CDP URL
BRIGHTDATA_CDP_URL = os.getenv('BRIGHTDATA_CDP_URL')
if BRIGHTDATA_CDP_URL:
logger.info('BRIGHTDATA_CDP_URL is set. Tasks can use Brightdata browser.')
else:
logger.warning('BRIGHTDATA_CDP_URL is not set. Brightdata browser will not be available.')
def create_anchor_browser_session(headless: bool = False) -> str:
    """Create a remote Anchor Browser session and return its CDP websocket URL.

    Args:
        headless: Whether the remote browser should run headless.

    Returns:
        A ``wss://`` CDP URL embedding the API key and the new session id.

    Raises:
        requests.RequestException: If the session-creation HTTP request fails
            (connection error, timeout, or non-2xx status).
        KeyError: If the API response does not contain the expected
            ``data``/``id`` structure.
    """
    # Session configuration: US mobile proxy with adblock, captcha solving and
    # extra stealth enabled, mirroring the evaluation defaults.
    browser_configuration = {
        'session': {'proxy': {'type': 'anchor_mobile', 'active': True, 'country_code': 'us'}},
        'browser': {
            'adblock': {'active': True},
            'captcha_solver': {'active': True},
            'headless': {'active': headless},
            'extra_stealth': {'active': True},
        },
    }
    try:
        response = requests.post(
            'https://api.anchorbrowser.io/v1/sessions',
            headers={
                'anchor-api-key': ANCHOR_BROWSER_API_KEY,
                'Content-Type': 'application/json',
            },
            json=browser_configuration,
            # Without a timeout a stalled API call would hang the whole task setup.
            timeout=30,
        )
        response.raise_for_status()
        session_data = response.json()['data']
        session_id = session_data['id']
        # Return only the CDP URL
        return f'wss://connect.anchorbrowser.io?apiKey={ANCHOR_BROWSER_API_KEY}&sessionId={session_id}'
    except requests.RequestException as e:
        logger.error(f'Failed to create Anchor Browser session: {type(e).__name__}: {e}')
        raise
    except KeyError as e:
        logger.error(f'Unexpected response format from Anchor Browser API: {e}')
        raise
Laminar.initialize()
laminar_client = AsyncLaminarClient()
@@ -493,7 +550,6 @@ from dataclasses import dataclass, field
from enum import Enum
from typing import Any
import requests
from dotenv import load_dotenv
# Import the new comprehensive judge system (conditional import for backwards compatibility)
@@ -1181,6 +1237,7 @@ class Task:
self.login_type = kwargs.get('login_type', None)
self.category = kwargs.get('category', None)
self.output_schema = kwargs.get('output_schema', None) # Add structured output schema support
self.auth_keys = kwargs.get('auth_keys', None) # List of auth keys to fetch from auth distribution
if self.output_schema:
# Convert JSON schema to Pydantic model class
self.output_model = create_pydantic_model_from_schema(self.output_schema, f'Task_{self.task_id}_Output')
@@ -1197,6 +1254,7 @@ class Task:
'login_type',
'category',
'output_schema',
'auth_keys',
}
self.additional_fields = {k: v for k, v in kwargs.items() if k not in known_fields}
@@ -1206,7 +1264,7 @@ class Task:
def __str__(self):
# Include main fields and indicate if there are additional fields
base_str = f'Task(task_id={self.task_id}, confirmed_task={self.confirmed_task}, website={self.website}, reference_length={self.reference_length}, level={self.level}, cluster_id={self.cluster_id}, login_cookie={self.login_cookie}, login_type={self.login_type}, category={self.category}, output_schema={self.output_schema}'
base_str = f'Task(task_id={self.task_id}, confirmed_task={self.confirmed_task}, website={self.website}, reference_length={self.reference_length}, level={self.level}, cluster_id={self.cluster_id}, login_cookie={self.login_cookie}, login_type={self.login_type}, category={self.category}, output_schema={self.output_schema}, auth_keys={self.auth_keys}'
if self.additional_fields:
additional_str = ', '.join(f'{k}={v}' for k, v in self.additional_fields.items())
base_str += f', {additional_str}'
@@ -1390,11 +1448,45 @@ async def run_stage(stage: Stage, stage_func, timeout: int | None = None):
return await stage_func()
async def setup_browser_session(task: Task, headless: bool, highlight_elements: bool = True) -> BrowserSession:
async def setup_browser_session(
task: Task, headless: bool, highlight_elements: bool = True, browser: str = 'local'
) -> BrowserSession:
"""Setup browser session for the task"""
logger.debug(f'Browser setup: Initializing BrowserSession for task {task.task_id}')
# Use incognito mode (user_data_dir=None) for evaluations to avoid state pollution
# Validate browser option
valid_browsers = ['local', 'anchor-browser', 'brightdata', 'browser-use']
if browser not in valid_browsers:
logger.warning(f'Browser setup: Invalid browser option "{browser}". Falling back to local browser.')
browser = 'local'
cdp_url = None
if browser == 'anchor-browser':
if ANCHOR_BROWSER_API_KEY:
try:
logger.debug(f'Browser setup: Creating Anchor Browser session for task {task.task_id}')
cdp_url = await asyncio.to_thread(create_anchor_browser_session, headless)
except Exception as e:
logger.error(
f'Browser setup: Failed to create Anchor Browser session for task {task.task_id}: {type(e).__name__}: {e}'
)
logger.info(f'Browser setup: Falling back to local browser for task {task.task_id}')
cdp_url = None
else:
logger.warning(
f'Browser setup: Anchor Browser requested but ANCHOR_BROWSER_API_KEY not set. Using local browser for task {task.task_id}'
)
elif browser == 'brightdata':
if BRIGHTDATA_CDP_URL:
logger.debug(f'Browser setup: Using Brightdata CDP URL for task {task.task_id}')
cdp_url = BRIGHTDATA_CDP_URL
else:
logger.warning(
f'Browser setup: Brightdata requested but BRIGHTDATA_CDP_URL not set. Using local browser for task {task.task_id}'
)
elif browser == 'browser-use':
logger.warning(f'Browser setup: Browser-use not implemented yet. Falling back to local browser for task {task.task_id}')
profile_kwargs = {
'user_data_dir': None, # Incognito mode - no persistent state
'headless': headless,
@@ -1433,10 +1525,16 @@ async def setup_browser_session(task: Task, headless: bool, highlight_elements:
logger.debug(f'Login task {task.task_id}: Configured to save cookies to {storage_state_path}')
profile = BrowserProfile(**profile_kwargs)
browser_session = BrowserSession(browser_profile=profile)
if cdp_url:
logger.debug(f'Browser setup: Using CDP Browser for task {task.task_id}')
browser_session = BrowserSession(browser_profile=profile, cdp_url=cdp_url)
else:
# Use local browser
logger.debug(f'Browser setup: Initializing BrowserSession for task {task.task_id}')
browser_session = BrowserSession(browser_profile=profile)
# Start browser session
logger.debug(f'Browser setup: Starting browser session for task {task.task_id}')
await browser_session.start()
logger.debug(f'Browser setup: Browser session started for task {task.task_id}')
@@ -1570,11 +1668,11 @@ async def evaluate_task_with_login_cookie(login_cookie: str, task_folder: Path)
If not found in tracking, falls back to checking end-state cookies.
Args:
login_cookie: String identifier that should appear in cookies if login was successful
task_folder: Path to the task result folder containing saved cookies
login_cookie: String identifier that should appear in cookies if login was successful
task_folder: Path to the task result folder containing saved cookies
Returns:
Dictionary containing evaluation results similar to Online_Mind2Web_eval format
Dictionary containing evaluation results similar to Online_Mind2Web_eval format
"""
task_id = task_folder.name
@@ -1767,8 +1865,10 @@ async def run_task_with_semaphore(
headless: bool,
use_vision: bool,
semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument
auth_distribution: dict | None = None, # Pre-fetched auth distribution
github_workflow_url: str | None = None,
use_serp: bool = False,
browser: str = 'local',
enable_memory: bool = False,
memory_interval: int = 10,
max_actions_per_step: int = 10,
@@ -1864,7 +1964,9 @@ async def run_task_with_semaphore(
)
browser_session = await run_stage(
Stage.SETUP_BROWSER, lambda: setup_browser_session(task, headless, highlight_elements), timeout=120
Stage.SETUP_BROWSER,
lambda: setup_browser_session(task, headless, highlight_elements, browser),
timeout=120,
)
task_result.stage_completed(Stage.SETUP_BROWSER)
logger.info(f'Task {task.task_id}: Browser session started successfully.')
@@ -1892,6 +1994,38 @@ async def run_task_with_semaphore(
convex_url, secret_key, run_id, task.task_id, 'run_agent', 'active', github_workflow_url
)
# Handle auth information if task requires it
task_with_auth = task
if hasattr(task, 'auth_keys') and task.auth_keys:
# Validate auth_keys is a list
if isinstance(task.auth_keys, list) and len(task.auth_keys) > 0:
if auth_distribution:
logger.info(
f'Task {task.task_id}: Using pre-fetched auth distribution for auth_keys: {task.auth_keys}'
)
auth_info_text = format_auth_info_for_agent(auth_distribution, task.auth_keys)
if auth_info_text:
# Create a modified task with auth info appended
class TaskWithAuth(Task):
def __init__(self, original_task: Task, auth_text: str):
# Copy all attributes from original task
for attr_name in dir(original_task):
if not attr_name.startswith('__'):
setattr(self, attr_name, getattr(original_task, attr_name))
# Modify the confirmed_task to include auth info
self.confirmed_task = original_task.confirmed_task + auth_text
task_with_auth = TaskWithAuth(task, auth_info_text)
logger.info(f'Task {task.task_id}: Auth info added to task description')
else:
logger.warning(
f'Task {task.task_id}: No matching auth info found for keys: {task.auth_keys}'
)
else:
logger.warning(f'Task {task.task_id}: Auth keys specified but no auth distribution available')
else:
logger.warning(f'Task {task.task_id}: auth_keys is not a valid list: {task.auth_keys}')
# Start timing for agent execution only
agent_start_time = time.time()
@@ -1899,7 +2033,7 @@ async def run_task_with_semaphore(
Stage.RUN_AGENT,
lambda: run_agent_with_browser(
browser_session,
task,
task_with_auth,
llm,
max_steps_per_task,
use_vision,
@@ -2163,6 +2297,7 @@ async def run_multiple_tasks(
convex_url: str,
secret_key: str,
eval_model: BaseChatModel,
auth_distribution: dict | None = None,
github_workflow_url: str | None = None,
max_parallel_runs: int = 3,
max_steps_per_task: int = 25,
@@ -2171,6 +2306,7 @@ async def run_multiple_tasks(
headless: bool = False,
use_vision: bool = True,
use_serp: bool = False,
browser: str = 'local',
enable_memory: bool = False,
memory_interval: int = 10,
max_actions_per_step: int = 10,
@@ -2249,8 +2385,10 @@ async def run_multiple_tasks(
headless=headless,
use_vision=use_vision,
semaphore_runs=semaphore_runs, # Pass the semaphore
auth_distribution=auth_distribution, # Pass the pre-fetched auth distribution
github_workflow_url=github_workflow_url,
use_serp=use_serp,
browser=browser,
enable_memory=enable_memory,
memory_interval=memory_interval,
max_actions_per_step=max_actions_per_step,
@@ -2373,6 +2511,109 @@ def fetch_tasks_from_server(convex_url: str, secret_key: str, test_case_name: st
return None
# Helper function to fetch auth distribution from the server
def fetch_auth_distribution_from_server(convex_url: str, secret_key: str):
    """Fetches an available auth distribution from the Convex HTTP endpoint.

    Args:
        convex_url: Base URL of the evaluation tool server (EVALUATION_TOOL_URL).
        secret_key: Bearer token for the server (EVALUATION_TOOL_SECRET_KEY).

    Returns:
        A dict with at least 'id' and 'loginInfo' keys on success, or None on
        any failure (missing config, HTTP error, 404 "none available", bad JSON,
        or unexpected response structure).
    """
    if not convex_url:
        logger.error('Error: EVALUATION_TOOL_URL environment variable not set.')
        return None
    if not secret_key:
        logger.error('Error: EVALUATION_TOOL_SECRET_KEY environment variable not set.')
        return None

    endpoint_url = f'{convex_url}/api/getAuthDistribution'
    headers = {
        'Authorization': f'Bearer {secret_key}',
        'Content-Type': 'application/json',
    }

    logger.info(f'Fetching auth distribution from {endpoint_url}...')
    try:
        # Timeout prevents the evaluation run from hanging on an unresponsive server.
        response = requests.post(endpoint_url, headers=headers, json={}, timeout=30)
        logger.info(f'Fetch Auth Distribution Status Code: {response.status_code}')

        if response.status_code == 200:
            try:
                data = response.json()
                logger.info('Successfully fetched auth distribution data.')
                # Verify the response has the expected structure
                if isinstance(data, dict) and 'id' in data and 'loginInfo' in data:
                    return data
                else:
                    logger.error(
                        f'Error: Fetched auth distribution data has unexpected structure. Keys: {list(data.keys()) if isinstance(data, dict) else "Not a dict"}'
                    )
                    logger.error(f'Raw response: {response.text}')
                    return None
            except json.JSONDecodeError:
                logger.error('Error: Failed to decode JSON response for auth distribution.')
                logger.error(f'Raw response text: {response.text}')
                return None
        elif response.status_code == 404:
            # 404 is the server's "no distribution currently available" signal, not a hard error.
            logger.warning('No available auth distribution found on server.')
            return None
        else:
            logger.error(f'Error: Failed to fetch auth distribution. Status: {response.status_code}')
            logger.error(f'Response: {response.text}')
            return None
    except requests.exceptions.RequestException as e:
        logger.error(f'Error during request to fetch auth distribution: {type(e).__name__}: {e}')
        return None
# Helper function to format auth information for the agent
def format_auth_info_for_agent(auth_distribution: dict, auth_keys: list[str]) -> str:
    """
    Formats auth information from auth distribution for the agent task description.

    Args:
        auth_distribution: Dict with 'loginInfo' key containing auth data
        auth_keys: List of auth keys to extract (e.g., ['google', 'facebook'])

    Returns:
        Formatted string with login credentials or empty string if no matching keys
    """
    if not auth_distribution or not auth_keys:
        return ''

    login_info = auth_distribution.get('loginInfo', {})
    if not login_info:
        logger.warning('Auth distribution has no loginInfo')
        return ''

    # Extract relevant auth information based on auth_keys
    relevant_auths = []
    matched_keys = []
    for auth_key in auth_keys:
        if auth_key not in login_info:
            logger.warning(
                f"Auth key '{auth_key}' not found in available login info. Available keys: {list(login_info.keys())}"
            )
            continue
        auth_data = login_info[auth_key]
        if not isinstance(auth_data, dict):
            logger.warning(f"Auth data for key '{auth_key}' is not a dictionary: {type(auth_data)}")
            continue
        # Format the auth data for this key as "field: value" pairs
        auth_details = [f'{key}: {value}' for key, value in auth_data.items()]
        if auth_details:
            relevant_auths.append(f'{auth_key} with {", ".join(auth_details)}')
            matched_keys.append(auth_key)

    if relevant_auths:
        auth_text = f'\n\nThe following login credentials can be used to complete this task: {"; ".join(relevant_auths)}.'
        # Security: never write the credential values themselves to the logs —
        # record only which auth keys were included in the task description.
        logger.info(f'Formatted auth info for keys: {matched_keys}')
        return auth_text
    else:
        logger.warning(f'No matching auth keys found. Requested: {auth_keys}, Available: {list(login_info.keys())}')
        return ''
# Helper function to get git information
def get_git_info():
"""Retrieves git branch, commit hash, commit timestamp, and repository URL using subprocess."""
@@ -2602,6 +2843,7 @@ async def run_evaluation_pipeline(
convex_url: str,
secret_key: str,
eval_model: BaseChatModel,
auth_distribution: dict | None = None,
github_workflow_url: str | None = None,
max_parallel_runs: int = 3,
max_steps_per_task: int = 25,
@@ -2610,6 +2852,7 @@ async def run_evaluation_pipeline(
headless: bool = False,
use_vision: bool = True,
use_serp: bool = False,
browser: str = 'local',
enable_memory: bool = False,
memory_interval: int = 10,
max_actions_per_step: int = 10,
@@ -2656,6 +2899,7 @@ async def run_evaluation_pipeline(
convex_url=convex_url,
secret_key=secret_key,
eval_model=eval_model,
auth_distribution=auth_distribution,
github_workflow_url=github_workflow_url,
max_parallel_runs=max_parallel_runs,
max_steps_per_task=max_steps_per_task,
@@ -2664,6 +2908,7 @@ async def run_evaluation_pipeline(
headless=headless,
use_vision=use_vision,
use_serp=use_serp,
browser=browser,
enable_memory=enable_memory,
memory_interval=memory_interval,
max_actions_per_step=max_actions_per_step,
@@ -2796,6 +3041,12 @@ if __name__ == '__main__':
parser.add_argument('--eval-group', type=str, default='', help='Evaluation group to include in the run')
parser.add_argument('--developer-id', type=str, default=None, help='Name of the developer starting the run')
parser.add_argument('--use-serp', action='store_true', help='Use SERP search instead of Google search')
parser.add_argument(
'--browser',
type=str,
default='local',
help='Browser to use: local, anchor-browser, brightdata, browser-use (default: local)',
)
parser.add_argument('--enable-memory', action='store_true', help='Enable mem0 memory system for agents')
parser.add_argument('--memory-interval', type=int, default=10, help='Memory creation interval (default: 10 steps)')
parser.add_argument('--max-actions-per-step', type=int, default=10, help='Maximum number of actions per step (default: 10)')
@@ -2837,7 +3088,6 @@ if __name__ == '__main__':
)
parser.add_argument('--use-mind2web-judge', action='store_true', help='Use original judge')
parser.add_argument('--no-thinking', action='store_true', help='Disable thinking in agent system prompt')
parser.add_argument('--use-anchor', action='store_true', help='Use anchor to navigate to the page')
parser.add_argument('--github-workflow-url', type=str, default=None, help='GitHub workflow URL for tracking')
# Gmail 2FA support arguments
@@ -2934,6 +3184,7 @@ if __name__ == '__main__':
# --- Load Tasks (Either Single Task or from Server) ---
tasks = []
task_id = None # Initialize for proper scoping
auth_distribution = None # Initialize auth distribution
# Check if this is single task mode
if args.task_text:
@@ -2968,10 +3219,28 @@ if __name__ == '__main__':
logger.info(f'Successfully loaded {len(tasks)} tasks from the server.')
except (TypeError, ValueError) as e:
logger.error(
f'Error creating Task objects from fetched data. Ensure the data structure includes required fields (task_id, confirmed_task). Known optional fields: website, reference_length, level, cluster_id, login_cookie, login_type, category. Any additional fields will be accepted dynamically. Error: {type(e).__name__}: {e}'
f'Error creating Task objects from fetched data. Ensure the data structure includes required fields (task_id, confirmed_task). Known optional fields: website, reference_length, level, cluster_id, login_cookie, login_type, category, auth_keys. Any additional fields will be accepted dynamically. Error: {type(e).__name__}: {e}'
)
logger.error(f'First item in fetched data: {fetched_task_data[0] if fetched_task_data else "None"}')
exit(1)
# --- Fetch Auth Distribution Once (if any tasks need auth) ---
tasks_with_auth = [
task
for task in tasks
if hasattr(task, 'auth_keys') and task.auth_keys and isinstance(task.auth_keys, list) and len(task.auth_keys) > 0
]
if tasks_with_auth and CONVEX_URL and SECRET_KEY:
logger.info(f'Found {len(tasks_with_auth)} tasks requiring auth. Fetching auth distribution...')
auth_distribution = fetch_auth_distribution_from_server(CONVEX_URL, SECRET_KEY)
if auth_distribution:
logger.info(
f'Successfully fetched auth distribution with login info for: {list(auth_distribution.get("loginInfo", {}).keys())}'
)
else:
logger.warning('Failed to fetch auth distribution. Tasks requiring auth may fail.')
elif tasks_with_auth:
logger.warning(f'Found {len(tasks_with_auth)} tasks requiring auth but no server config available')
# -----------------------------
# --- Start Run on Server (with optional existing Run ID) ---
@@ -3048,6 +3317,22 @@ if __name__ == '__main__':
else:
logger.info('🔍 Using default Google search')
# Log browser mode being used
if args.browser == 'anchor-browser':
if ANCHOR_BROWSER_API_KEY:
logger.info('🌐 Using Anchor Browser (remote browser service)')
else:
logger.warning('⚠️ --browser anchor-browser provided but ANCHOR_BROWSER_API_KEY not set. Will use local browser!')
elif args.browser == 'brightdata':
if BRIGHTDATA_CDP_URL:
logger.info('🌐 Using Brightdata browser (remote browser service)')
else:
logger.warning('⚠️ --browser brightdata provided but BRIGHTDATA_CDP_URL not set. Will use local browser!')
elif args.browser == 'browser-use':
logger.warning('🌐 Browser-use not implemented yet. Will use local browser!')
else:
logger.info('🌐 Using local browser')
# Log memory configuration
if args.enable_memory:
logger.info(f'🧠 Memory enabled: mem0 system with interval={args.memory_interval} steps')
@@ -3137,6 +3422,7 @@ if __name__ == '__main__':
convex_url=convex_url,
secret_key=secret_key,
eval_model=eval_model,
auth_distribution=auth_distribution,
github_workflow_url=args.github_workflow_url,
max_parallel_runs=parallel_runs,
max_steps_per_task=args.max_steps,
@@ -3145,6 +3431,7 @@ if __name__ == '__main__':
headless=args.headless,
use_vision=not args.no_vision,
use_serp=args.use_serp,
browser=args.browser,
enable_memory=args.enable_memory,
memory_interval=args.memory_interval,
max_actions_per_step=args.max_actions_per_step,