Files
browser-use/tests/ci/test_structured_extraction.py
2026-02-01 20:26:50 -08:00

632 lines
20 KiB
Python

"""Tests for schema-enforced structured extraction."""
import asyncio
import json
import tempfile
from unittest.mock import AsyncMock
import pytest
from pydantic import ValidationError
from pytest_httpserver import HTTPServer
from browser_use.agent.views import ActionResult
from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.filesystem.file_system import FileSystem
from browser_use.llm.base import BaseChatModel
from browser_use.llm.views import ChatInvokeCompletion
from browser_use.tools.extraction.schema_utils import schema_dict_to_pydantic_model
from browser_use.tools.extraction.views import ExtractionResult
from browser_use.tools.service import Tools
# ---------------------------------------------------------------------------
# Unit tests: schema_dict_to_pydantic_model
# ---------------------------------------------------------------------------
class TestSchemaDictToPydanticModel:
"""Unit tests for the JSON-Schema → Pydantic model converter."""
def test_flat_object(self):
schema = {
'type': 'object',
'properties': {
'name': {'type': 'string'},
'age': {'type': 'integer'},
},
'required': ['name', 'age'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(name='Alice', age=30)
assert instance.name == 'Alice' # type: ignore[attr-defined]
assert instance.age == 30 # type: ignore[attr-defined]
def test_nested_object(self):
schema = {
'type': 'object',
'properties': {
'person': {
'type': 'object',
'properties': {
'first': {'type': 'string'},
'last': {'type': 'string'},
},
'required': ['first'],
},
},
'required': ['person'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(person={'first': 'Bob', 'last': 'Smith'})
assert instance.person.first == 'Bob' # type: ignore[attr-defined]
def test_array_of_objects(self):
schema = {
'type': 'object',
'properties': {
'items': {
'type': 'array',
'items': {
'type': 'object',
'properties': {
'id': {'type': 'integer'},
'label': {'type': 'string'},
},
'required': ['id', 'label'],
},
},
},
'required': ['items'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(items=[{'id': 1, 'label': 'a'}, {'id': 2, 'label': 'b'}])
assert len(instance.items) == 2 # type: ignore[attr-defined]
assert instance.items[0].id == 1 # type: ignore[attr-defined]
def test_array_of_primitives(self):
schema = {
'type': 'object',
'properties': {
'tags': {'type': 'array', 'items': {'type': 'string'}},
},
'required': ['tags'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(tags=['a', 'b', 'c'])
assert instance.tags == ['a', 'b', 'c'] # type: ignore[attr-defined]
def test_enum_field(self):
schema = {
'type': 'object',
'properties': {
'status': {'type': 'string', 'enum': ['active', 'inactive']},
},
'required': ['status'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(status='active')
assert instance.status == 'active' # type: ignore[attr-defined]
def test_optional_enum_defaults_to_none(self):
"""Non-required enum fields default to None, not an out-of-set empty string."""
schema = {
'type': 'object',
'properties': {
'name': {'type': 'string'},
'priority': {'type': 'string', 'enum': ['low', 'medium', 'high']},
},
'required': ['name'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(name='task1')
assert instance.priority is None # type: ignore[attr-defined]
# Serialized output must not contain an out-of-set value
dumped = instance.model_dump(mode='json')
assert dumped['priority'] is None
# When provided, value still works
instance2 = Model(name='task2', priority='high')
assert instance2.priority == 'high' # type: ignore[attr-defined]
def test_optional_fields_get_type_appropriate_defaults(self):
schema = {
'type': 'object',
'properties': {
'name': {'type': 'string'},
'nickname': {'type': 'string'},
'score': {'type': 'number'},
'rank': {'type': 'integer'},
'active': {'type': 'boolean'},
'tags': {'type': 'array', 'items': {'type': 'string'}},
},
'required': ['name'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(name='Alice')
assert instance.name == 'Alice' # type: ignore[attr-defined]
assert instance.nickname == '' # type: ignore[attr-defined]
assert instance.score == 0.0 # type: ignore[attr-defined]
assert instance.rank == 0 # type: ignore[attr-defined]
assert instance.active is False # type: ignore[attr-defined]
assert instance.tags == [] # type: ignore[attr-defined]
def test_optional_non_nullable_rejects_null(self):
"""Non-required fields that aren't nullable must reject explicit null."""
schema = {
'type': 'object',
'properties': {
'name': {'type': 'string'},
'nickname': {'type': 'string'},
},
'required': ['name'],
}
Model = schema_dict_to_pydantic_model(schema)
with pytest.raises(ValidationError):
Model(name='Alice', nickname=None)
def test_optional_with_explicit_default(self):
schema = {
'type': 'object',
'properties': {
'name': {'type': 'string'},
'color': {'type': 'string', 'default': 'blue'},
},
'required': ['name'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(name='Alice')
assert instance.color == 'blue' # type: ignore[attr-defined]
def test_optional_nested_object_defaults_to_none(self):
"""Non-required nested objects fall back to None since constructing a default is not feasible."""
schema = {
'type': 'object',
'properties': {
'name': {'type': 'string'},
'address': {
'type': 'object',
'properties': {'city': {'type': 'string'}},
'required': ['city'],
},
},
'required': ['name'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(name='Alice')
assert instance.address is None # type: ignore[attr-defined]
def test_model_name_from_title(self):
schema = {
'title': 'ProductInfo',
'type': 'object',
'properties': {'sku': {'type': 'string'}},
'required': ['sku'],
}
Model = schema_dict_to_pydantic_model(schema)
assert Model.__name__ == 'ProductInfo'
def test_model_validate_json_roundtrip(self):
schema = {
'type': 'object',
'properties': {
'x': {'type': 'number'},
'y': {'type': 'boolean'},
},
'required': ['x', 'y'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(x=3.14, y=True)
raw = instance.model_dump_json()
restored = Model.model_validate_json(raw)
assert restored.x == instance.x # type: ignore[attr-defined]
assert restored.y == instance.y # type: ignore[attr-defined]
def test_rejects_ref(self):
schema = {
'type': 'object',
'properties': {'item': {'$ref': '#/$defs/Item'}},
'$defs': {'Item': {'type': 'object', 'properties': {'name': {'type': 'string'}}}},
}
with pytest.raises(ValueError, match='Unsupported JSON Schema keyword'):
schema_dict_to_pydantic_model(schema)
def test_rejects_allOf(self):
schema = {
'type': 'object',
'properties': {'x': {'allOf': [{'type': 'string'}]}},
}
with pytest.raises(ValueError, match='Unsupported JSON Schema keyword'):
schema_dict_to_pydantic_model(schema)
def test_rejects_non_object_toplevel(self):
with pytest.raises(ValueError, match='type "object"'):
schema_dict_to_pydantic_model({'type': 'array', 'items': {'type': 'string'}})
def test_rejects_empty_properties(self):
with pytest.raises(ValueError, match='at least one property'):
schema_dict_to_pydantic_model({'type': 'object', 'properties': {}})
def test_extra_fields_forbidden(self):
schema = {
'type': 'object',
'properties': {'name': {'type': 'string'}},
'required': ['name'],
}
Model = schema_dict_to_pydantic_model(schema)
with pytest.raises(ValidationError):
Model(name='ok', bogus='nope')
def test_nullable_field(self):
schema = {
'type': 'object',
'properties': {
'value': {'type': 'string', 'nullable': True},
},
'required': ['value'],
}
Model = schema_dict_to_pydantic_model(schema)
instance = Model(value=None)
assert instance.value is None # type: ignore[attr-defined]
def test_field_descriptions_preserved(self):
schema = {
'type': 'object',
'properties': {
'price': {'type': 'number', 'description': 'The price in USD'},
},
'required': ['price'],
}
Model = schema_dict_to_pydantic_model(schema)
field_info = Model.model_fields['price']
assert field_info.description == 'The price in USD'
# ---------------------------------------------------------------------------
# Unit tests: ExtractionResult
# ---------------------------------------------------------------------------
class TestExtractionResult:
def test_construction(self):
er = ExtractionResult(
data={'name': 'Alice'},
schema_used={'type': 'object', 'properties': {'name': {'type': 'string'}}},
)
assert er.data == {'name': 'Alice'}
assert er.is_partial is False
assert er.source_url is None
def test_serialization_roundtrip(self):
er = ExtractionResult(
data={'items': [1, 2]},
schema_used={'type': 'object', 'properties': {'items': {'type': 'array'}}},
is_partial=True,
source_url='http://example.com',
content_stats={'original_html_chars': 5000},
)
raw = er.model_dump_json()
restored = ExtractionResult.model_validate_json(raw)
assert restored == er
# ---------------------------------------------------------------------------
# Integration tests: extract action via Tools
# ---------------------------------------------------------------------------
def _make_extraction_llm(structured_response: dict | None = None, freetext_response: str = 'free text result') -> BaseChatModel:
"""Create a mock LLM that handles both structured and freetext extraction calls."""
llm = AsyncMock(spec=BaseChatModel)
llm.model = 'mock-extraction-llm'
llm._verified_api_keys = True
llm.provider = 'mock'
llm.name = 'mock-extraction-llm'
llm.model_name = 'mock-extraction-llm'
async def mock_ainvoke(messages, output_format=None, **kwargs):
if output_format is not None and structured_response is not None:
# Structured path: parse the dict through the model
instance = output_format.model_validate(structured_response)
return ChatInvokeCompletion(completion=instance, usage=None)
# Freetext path
return ChatInvokeCompletion(completion=freetext_response, usage=None)
llm.ainvoke.side_effect = mock_ainvoke
return llm
@pytest.fixture(scope='module')
async def browser_session():
session = BrowserSession(browser_profile=BrowserProfile(headless=True, user_data_dir=None, keep_alive=True))
await session.start()
yield session
await session.kill()
await session.event_bus.stop(clear=True, timeout=5)
@pytest.fixture(scope='session')
def http_server():
server = HTTPServer()
server.start()
server.expect_request('/products').respond_with_data(
"""<html><body>
<h1>Products</h1>
<ul>
<li>Widget A - $9.99</li>
<li>Widget B - $19.99</li>
</ul>
</body></html>""",
content_type='text/html',
)
yield server
server.stop()
@pytest.fixture(scope='session')
def base_url(http_server):
return f'http://{http_server.host}:{http_server.port}'
class TestExtractStructured:
"""Integration tests for the extract action's structured extraction path."""
async def test_structured_extraction_returns_json(self, browser_session, base_url):
"""When output_schema is provided, extract returns structured JSON in <structured_result> tags."""
tools = Tools()
await tools.navigate(url=f'{base_url}/products', new_tab=False, browser_session=browser_session)
await asyncio.sleep(0.5)
output_schema = {
'type': 'object',
'properties': {
'products': {
'type': 'array',
'items': {
'type': 'object',
'properties': {
'name': {'type': 'string'},
'price': {'type': 'number'},
},
'required': ['name', 'price'],
},
},
},
'required': ['products'],
}
mock_data = {'products': [{'name': 'Widget A', 'price': 9.99}, {'name': 'Widget B', 'price': 19.99}]}
extraction_llm = _make_extraction_llm(structured_response=mock_data)
with tempfile.TemporaryDirectory() as tmp:
fs = FileSystem(tmp)
result = await tools.extract(
query='List all products with prices',
output_schema=output_schema,
browser_session=browser_session,
page_extraction_llm=extraction_llm,
file_system=fs,
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert '<structured_result>' in result.extracted_content
assert '</structured_result>' in result.extracted_content
# Parse the JSON out of the tags
start = result.extracted_content.index('<structured_result>') + len('<structured_result>')
end = result.extracted_content.index('</structured_result>')
parsed = json.loads(result.extracted_content[start:end].strip())
assert parsed == mock_data
# Metadata
assert result.metadata is not None
assert result.metadata['structured_extraction'] is True
meta = result.metadata['extraction_result']
assert meta['data'] == mock_data
assert meta['schema_used'] == output_schema
async def test_freetext_extraction_unchanged(self, browser_session, base_url):
"""When output_schema is None, extract returns free-text in <result> tags (backward compat)."""
tools = Tools()
await tools.navigate(url=f'{base_url}/products', new_tab=False, browser_session=browser_session)
await asyncio.sleep(0.5)
extraction_llm = _make_extraction_llm(freetext_response='Widget A costs $9.99, Widget B costs $19.99')
with tempfile.TemporaryDirectory() as tmp:
fs = FileSystem(tmp)
result = await tools.extract(
query='What products are listed?',
browser_session=browser_session,
page_extraction_llm=extraction_llm,
file_system=fs,
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert '<result>' in result.extracted_content
assert '</result>' in result.extracted_content
assert '<structured_result>' not in result.extracted_content
assert result.metadata is None
async def test_invalid_schema_falls_back_to_freetext(self, browser_session, base_url):
"""When output_schema contains unsupported keywords, fall back to free-text gracefully."""
tools = Tools()
await tools.navigate(url=f'{base_url}/products', new_tab=False, browser_session=browser_session)
await asyncio.sleep(0.5)
bad_schema = {
'type': 'object',
'properties': {'item': {'$ref': '#/$defs/Item'}},
'$defs': {'Item': {'type': 'object', 'properties': {'name': {'type': 'string'}}}},
}
extraction_llm = _make_extraction_llm(freetext_response='fallback text')
with tempfile.TemporaryDirectory() as tmp:
fs = FileSystem(tmp)
result = await tools.extract(
query='Get products',
output_schema=bad_schema,
browser_session=browser_session,
page_extraction_llm=extraction_llm,
file_system=fs,
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
# Should have used the free-text path
assert '<result>' in result.extracted_content
assert '<structured_result>' not in result.extracted_content
assert result.metadata is None
# ---------------------------------------------------------------------------
# Integration tests: extraction_schema injection via special parameter
# ---------------------------------------------------------------------------
PRODUCT_SCHEMA = {
'type': 'object',
'properties': {
'products': {
'type': 'array',
'items': {
'type': 'object',
'properties': {
'name': {'type': 'string'},
'price': {'type': 'number'},
},
'required': ['name', 'price'],
},
},
},
'required': ['products'],
}
MOCK_PRODUCTS = {'products': [{'name': 'Widget A', 'price': 9.99}, {'name': 'Widget B', 'price': 19.99}]}
class TestExtractionSchemaInjection:
"""Tests that extraction_schema injected as a special parameter triggers structured extraction."""
async def test_injected_extraction_schema_triggers_structured_path(self, browser_session, base_url):
"""extraction_schema passed via act() triggers structured extraction even without output_schema in params."""
tools = Tools()
await tools.navigate(url=f'{base_url}/products', new_tab=False, browser_session=browser_session)
await asyncio.sleep(0.5)
extraction_llm = _make_extraction_llm(structured_response=MOCK_PRODUCTS)
with tempfile.TemporaryDirectory() as tmp:
fs = FileSystem(tmp)
result = await tools.extract(
query='List all products with prices',
browser_session=browser_session,
page_extraction_llm=extraction_llm,
file_system=fs,
extraction_schema=PRODUCT_SCHEMA,
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert '<structured_result>' in result.extracted_content
# Parse and verify JSON
start = result.extracted_content.index('<structured_result>') + len('<structured_result>')
end = result.extracted_content.index('</structured_result>')
parsed = json.loads(result.extracted_content[start:end].strip())
assert parsed == MOCK_PRODUCTS
assert result.metadata is not None
assert result.metadata['structured_extraction'] is True
async def test_output_schema_takes_precedence_over_extraction_schema(self, browser_session, base_url):
"""When the LLM provides output_schema in params, it should take precedence over injected extraction_schema."""
tools = Tools()
await tools.navigate(url=f'{base_url}/products', new_tab=False, browser_session=browser_session)
await asyncio.sleep(0.5)
# Different schema than the injected one — just a name list
param_schema = {
'type': 'object',
'properties': {
'names': {'type': 'array', 'items': {'type': 'string'}},
},
'required': ['names'],
}
param_response = {'names': ['Widget A', 'Widget B']}
extraction_llm = _make_extraction_llm(structured_response=param_response)
with tempfile.TemporaryDirectory() as tmp:
fs = FileSystem(tmp)
result = await tools.extract(
query='List product names',
output_schema=param_schema,
browser_session=browser_session,
page_extraction_llm=extraction_llm,
file_system=fs,
extraction_schema=PRODUCT_SCHEMA, # should be ignored
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert '<structured_result>' in result.extracted_content
start = result.extracted_content.index('<structured_result>') + len('<structured_result>')
end = result.extracted_content.index('</structured_result>')
parsed = json.loads(result.extracted_content[start:end].strip())
# Should match param_schema response, NOT PRODUCT_SCHEMA
assert parsed == param_response
assert result.metadata is not None
assert result.metadata['extraction_result']['schema_used'] == param_schema
async def test_no_schema_uses_freetext_path(self, browser_session, base_url):
"""When neither output_schema nor extraction_schema is provided, free-text path is used (backward compat)."""
tools = Tools()
await tools.navigate(url=f'{base_url}/products', new_tab=False, browser_session=browser_session)
await asyncio.sleep(0.5)
extraction_llm = _make_extraction_llm(freetext_response='Widget A costs $9.99')
with tempfile.TemporaryDirectory() as tmp:
fs = FileSystem(tmp)
result = await tools.extract(
query='What products are listed?',
browser_session=browser_session,
page_extraction_llm=extraction_llm,
file_system=fs,
# No extraction_schema, no output_schema
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert '<result>' in result.extracted_content
assert '<structured_result>' not in result.extracted_content
assert result.metadata is None
async def test_extraction_schema_threads_through_act(self, browser_session, base_url):
"""extraction_schema passed to act() reaches extract() via the registry's special parameter injection."""
tools = Tools()
await tools.navigate(url=f'{base_url}/products', new_tab=False, browser_session=browser_session)
await asyncio.sleep(0.5)
extraction_llm = _make_extraction_llm(structured_response=MOCK_PRODUCTS)
with tempfile.TemporaryDirectory() as tmp:
fs = FileSystem(tmp)
# Build an ActionModel for the extract action
action_model = tools.registry.create_action_model()
action = action_model.model_validate({'extract': {'query': 'List products'}})
result = await tools.act(
action=action,
browser_session=browser_session,
page_extraction_llm=extraction_llm,
file_system=fs,
extraction_schema=PRODUCT_SCHEMA,
)
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert '<structured_result>' in result.extracted_content