browser-use/tests/test_mind2web.py
Gregor Žunič 89c63fdd63 Added custom actions registry and fixed extraction layer (#20)
* Validator

* Test mind2web

* Cleaned up logger

* Pytest logger

* Cleaned up logger

* Disable flag for human input

* Multiple clicks per button

* Multiple clicks per button

* More structured system prompt

* Fields with description

* System prompt example

* One logger

* Cleaner logging

* Log step in step function

* Fix critical clicking error - wrong argument used

* Improved thought process of agent

* Improve system prompt

* Remove human input message

* Custom action registration

* Pydantic model for custom actions

* Pydantic model for custom output

* Runs through, model outputs functions, but not called yet

* Work in progress - description for custom actions

* Description works, but schema not yet

* Model can call the right action - but is not executed

* Separate is_controller_action and is_custom_action

* Works! Model can call custom function

* Use registry for action, but result is not fed back to model

* Include result in messages

* Works with custom function - but typing is not correct

* Renamed registry

* First test cases

* Captcha tests

* Pydantic for tests

* Improve prompts for multi-step

* System prompt structure

* Handle errors like validation error

* Refactor error handling in agent

* Refactor error handling in agent

* Improved logging

* Update view

* Fix click parameter to index

* Simplify dynamic actions

* Use run instead of step

* Rename history

* Rename AgentService to Agent

* Rename ControllerService to Controller

* Pytest file

* Rename get state

* Rename BrowserService

* converted dom extraction recursion to a while loop

* Rename use_vision

* Rename use_vision

* reversed dom tree items and made browser less annoying

* Renaming and fixing type errors

* Renamed class names for agent

* updated requirements

* Update prompt

* Action registration works for user and controller

* Fix done call by returning ActionResult

* Fix if result is none

* Rename AgentOutput and ActionModel

* Improved prompt; passes 6/8 tests from test_agent_actions

* Calculate token cost

* Improve display

* Simplified logger

* Test function calling

* created super simple xpath extraction algo

* Tests logging

* tiny fixes to dom extraction

* Remove test

* Don't log number of clicks

* Pytest file

* merged per-element js checks

* Check if driver is still open

* super fast processing

* fixed agent planning and stuff

* Fix example

* Fix example

* Improve error

* Improved error correction

* New line for step

* small type error fixes

* Test for pydantic

* Fix line

* Removed sample

* fixed readme and examples

---------

Co-authored-by: magmueller <mamagnus00@gmail.com>
2024-11-15 21:42:02 +01:00
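
The headline change in this commit is the custom actions registry: user-supplied functions are registered next to the built-in controller actions, described to the model through generated Pydantic schemas, and their results are fed back into the message history. Below is a minimal sketch of that decorator-based registration pattern; the names (Registry, registry.action, ActionResult) echo the commit messages above, but the exact signatures are assumptions, not the actual browser-use API.

    from typing import Callable, Dict, Optional

    from pydantic import BaseModel, create_model


    class ActionResult(BaseModel):
        # Hypothetical result type: the commits only say a result is fed back to the model.
        extracted_content: Optional[str] = None
        error: Optional[str] = None


    class Registry:
        """Maps each action name to a function, a description for the system
        prompt, and a Pydantic parameter model derived from the signature."""

        def __init__(self) -> None:
            self.actions: Dict[str, dict] = {}

        def action(self, description: str):
            def decorator(func: Callable):
                # Build a typed parameter model from the function's annotations
                # so the LLM sees a schema for each custom action.
                fields = {
                    name: (annotation, ...)
                    for name, annotation in func.__annotations__.items()
                    if name != 'return'
                }
                param_model = create_model(f'{func.__name__}_params', **fields)
                self.actions[func.__name__] = {
                    'function': func,
                    'description': description,
                    'param_model': param_model,
                }
                return func

            return decorator


    registry = Registry()


    @registry.action(description='Look up an order in the internal CRM')
    def lookup_order(order_id: str) -> ActionResult:
        return ActionResult(extracted_content=f'Order {order_id}: shipped')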


"""
Test browser automation using Mind2Web dataset tasks with pytest framework.
"""
import json
import logging
import os
from typing import Any, Dict, List
import pytest
from langchain_openai import ChatOpenAI
from browser_use.agent.service import Agent
from browser_use.controller.service import Controller
from browser_use.utils import logger
# Constants
MAX_STEPS = 50
TEST_SUBSET_SIZE = 10
@pytest.fixture(scope='session')
def test_cases() -> List[Dict[str, Any]]:
"""Load test cases from Mind2Web dataset"""
file_path = os.path.join(os.path.dirname(__file__), 'mind2web_data/processed.json')
logger.info(f'Loading test cases from {file_path}')
with open(file_path, 'r') as f:
data = json.load(f)
subset = data[:TEST_SUBSET_SIZE]
logger.info(f'Loaded {len(subset)}/{len(data)} test cases')
return subset
@pytest.fixture(scope='session')
def llm():
"""Initialize the language model"""
return ChatOpenAI(model='gpt-4o')
@pytest.fixture(scope='function')
async def controller():
"""Initialize the controller"""
controller = Controller()
try:
yield controller
finally:
if controller.browser:
controller.browser.close(force=True)
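

# NOTE (editor's assumption, not in the original commit): test_random_samples
# below requests a `validator` fixture that this file never defines. A no-op
# placeholder is sketched here so pytest can resolve the fixture; actual
# validation is still the TODO at the end of that test.
@pytest.fixture(scope='session')
def validator():
    """Placeholder validator fixture; task validation is not implemented yet."""
    return None
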
# Run with: pytest -s -v tests/test_mind2web.py::test_random_samples
@pytest.mark.asyncio
async def test_random_samples(test_cases: List[Dict[str, Any]], llm, controller, validator):
    """Test a random sampling of tasks across different websites"""
    logger.info('=== Testing Random Samples ===')

    # Take random samples
    samples = random.sample(test_cases, 1)
    for i, case in enumerate(samples, 1):
        task = f"Go to {case['website']}.com and {case['confirmed_task']}"
        logger.info(f'--- Random Sample {i}/{len(samples)} ---')
        logger.info(f'Task: {task}\n')

        agent = Agent(task, llm, controller)
        await agent.run()

        logger.info('Validating random sample task...')
        # TODO: Validate the task

def test_dataset_integrity(test_cases):
    """Test the integrity of the test dataset"""
    logger.info('\n=== Testing Dataset Integrity ===')

    required_fields = ['website', 'confirmed_task', 'action_reprs']
    missing_fields = []
    logger.info(f'Checking {len(test_cases)} test cases for required fields')

    for i, case in enumerate(test_cases, 1):
        logger.debug(f'Checking case {i}/{len(test_cases)}')
        for field in required_fields:
            if field not in case:
                missing_fields.append(f'Case {i}: {field}')
                logger.warning(f"Missing field '{field}' in case {i}")

        # Type checks
        if not isinstance(case.get('confirmed_task'), str):
            logger.error(f"Case {i}: 'confirmed_task' must be string")
            assert False, 'Task must be string'
        if not isinstance(case.get('action_reprs'), list):
            logger.error(f"Case {i}: 'action_reprs' must be list")
            assert False, 'Actions must be list'
        if len(case.get('action_reprs', [])) == 0:
            logger.error(f"Case {i}: 'action_reprs' must not be empty")
            assert False, 'Must have at least one action'

    if missing_fields:
        logger.error('Dataset integrity check failed')
        assert False, f'Missing fields: {missing_fields}'
    else:
        logger.info('✅ Dataset integrity check passed')
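

# For reference, each record in processed.json is expected to look roughly
# like this (the field names come from the checks above; the values are
# purely illustrative, not taken from the dataset):
#
# {
#     "website": "exampleshop",
#     "confirmed_task": "find running shoes under $100",
#     "action_reprs": ["[link] Shoes -> CLICK", "[textbox] Search -> TYPE: running shoes"]
# }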

if __name__ == '__main__':
    pytest.main([__file__, '-v'])