mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
install all browser versions for evals and tests
This commit is contained in:
6
.github/workflows/eval.yaml
vendored
6
.github/workflows/eval.yaml
vendored
@@ -89,7 +89,11 @@ jobs:
|
||||
- name: Install Playwright browser dependencies
|
||||
run: |
|
||||
echo "Installing Playwright browsers..."
|
||||
playwright install --no-shell chromium
|
||||
# Comment out whichever of these installs are not needed, depending on whether the evals run with stealth=True
|
||||
playwright install --no-shell chromium --with-deps
|
||||
patchright install --no-shell chromium --with-deps
|
||||
playwright install --no-shell chrome --with-deps
|
||||
patchright install --no-shell chrome --with-deps
|
||||
echo "Playwright browsers installed successfully"
|
||||
|
||||
- name: Install Xvfb for headed mode
|
||||
|
||||
6
.github/workflows/test.yaml
vendored
6
.github/workflows/test.yaml
vendored
@@ -88,6 +88,8 @@ jobs:
|
||||
|
||||
- run: playwright install chromium --with-deps
|
||||
- run: playwright install chrome --with-deps
|
||||
- run: patchright install chrome --with-deps
|
||||
- run: patchright install chromium --with-deps
|
||||
|
||||
- run: pytest tests/ci/${{ matrix.test_filename }}.py
|
||||
|
||||
@@ -107,7 +109,7 @@ jobs:
|
||||
enable-cache: true
|
||||
activate-environment: true
|
||||
|
||||
- run: uv sync --dev
|
||||
- run: uv sync --dev --all-extras
|
||||
|
||||
- name: Detect installed Playwright version
|
||||
run: echo "PLAYWRIGHT_VERSION=$(uv pip list --format json | jq -r '.[] | select(.name == "playwright") | .version')" >> $GITHUB_ENV
|
||||
@@ -121,6 +123,8 @@ jobs:
|
||||
|
||||
- run: playwright install chromium --with-deps
|
||||
- run: playwright install chrome --with-deps
|
||||
- run: patchright install chrome --with-deps
|
||||
- run: patchright install chromium --with-deps
|
||||
|
||||
- name: Run agent tasks evaluation and capture score
|
||||
id: eval
|
||||
|
||||
@@ -206,8 +206,8 @@ class TestHeadlessScreenshots:
|
||||
results = await asyncio.gather(*screenshot_tasks, return_exceptions=True)
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# Verify timing
|
||||
assert total_time < 60, f'Screenshots took too long: {total_time:.1f}s (should be < 60s)'
|
||||
# Verify timing - maximum should be 200s (20s × 10)
|
||||
assert total_time < 200, f'Screenshots took too long: {total_time:.1f}s (should be < 200s)'
|
||||
print(f'All screenshot attempts completed in {total_time:.1f}s')
|
||||
|
||||
# Separate successful screenshots from failures
|
||||
@@ -219,45 +219,49 @@ class TestHeadlessScreenshots:
|
||||
print(f'Session {i} failed: {type(result).__name__}: {result}')
|
||||
else:
|
||||
screenshots.append(result)
|
||||
print(f'Session {i} screenshot completed successfully')
|
||||
|
||||
# We should have at least some successful screenshots
|
||||
assert len(screenshots) > 0, f'All screenshots failed! {len(failures)} failures'
|
||||
print(f'Successfully captured {len(screenshots)} out of {len(results)} screenshots')
|
||||
# ALL screenshots must succeed
|
||||
assert len(failures) == 0, (
|
||||
f'{len(failures)} screenshots failed: {[(i, type(e).__name__, str(e)) for i, e in failures]}'
|
||||
)
|
||||
assert len(screenshots) == 10, f'Expected 10 successful screenshots, got {len(screenshots)}'
|
||||
print('✅ All 10 screenshots captured successfully!')
|
||||
|
||||
# Verify all screenshots are valid
|
||||
print('Verifying all screenshots...')
|
||||
print('Verifying all 10 screenshots...')
|
||||
for i, screenshot in enumerate(screenshots):
|
||||
# Should not be None
|
||||
assert screenshot is not None, f'Session {i} returned None screenshot'
|
||||
assert isinstance(screenshot, str), f'Session {i} screenshot is not a string'
|
||||
assert len(screenshot) > 0, f'Session {i} screenshot is empty'
|
||||
assert screenshot is not None, f'Screenshot {i} returned None'
|
||||
assert isinstance(screenshot, str), f'Screenshot {i} is not a string'
|
||||
assert len(screenshot) > 0, f'Screenshot {i} is empty'
|
||||
|
||||
# Decode and validate
|
||||
try:
|
||||
screenshot_bytes = base64.b64decode(screenshot)
|
||||
except Exception as e:
|
||||
raise AssertionError(f'Session {i} screenshot is not valid base64: {e}')
|
||||
raise AssertionError(f'Screenshot {i} is not valid base64: {e}')
|
||||
|
||||
# Verify PNG signature
|
||||
assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Session {i} screenshot is not a valid PNG'
|
||||
assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Screenshot {i} is not a valid PNG'
|
||||
|
||||
# Full page screenshot should be reasonably large
|
||||
# Due to our 6,000px height limit, expect at least 30KB
|
||||
assert len(screenshot_bytes) > 5000, f'Session {i} screenshot too small: {len(screenshot_bytes)} bytes'
|
||||
# Due to our 6,000px height limit, expect at least 5KB
|
||||
assert len(screenshot_bytes) > 5000, f'Screenshot {i} too small: {len(screenshot_bytes)} bytes'
|
||||
|
||||
print(f'All {len(screenshots)} screenshots validated successfully!')
|
||||
print('✅ All 10 screenshots validated successfully!')
|
||||
|
||||
# Also test taking regular (viewport) screenshots
|
||||
print('Taking viewport screenshots from all sessions...')
|
||||
print('\nTaking viewport screenshots from all sessions...')
|
||||
start_time = time.time()
|
||||
viewport_results = await asyncio.gather(
|
||||
*[session.take_screenshot() for session in browser_sessions], return_exceptions=True
|
||||
)
|
||||
viewport_time = time.time() - start_time
|
||||
assert viewport_time < 60, f'Viewport screenshots took too long: {viewport_time:.1f}s (should be < 60s)'
|
||||
assert viewport_time < 200, f'Viewport screenshots took too long: {viewport_time:.1f}s (should be < 200s)'
|
||||
print(f'All viewport screenshot attempts completed in {viewport_time:.1f}s')
|
||||
|
||||
# Separate successful viewport screenshots from failures
|
||||
# Check for failures
|
||||
viewport_screenshots = []
|
||||
viewport_failures = []
|
||||
for i, result in enumerate(viewport_results):
|
||||
@@ -266,17 +270,26 @@ class TestHeadlessScreenshots:
|
||||
print(f'Session {i} viewport failed: {type(result).__name__}: {result}')
|
||||
else:
|
||||
viewport_screenshots.append(result)
|
||||
print(f'Session {i} viewport screenshot completed successfully')
|
||||
|
||||
assert len(viewport_screenshots) > 0, f'All viewport screenshots failed! {len(viewport_failures)} failures'
|
||||
print(f'Successfully captured {len(viewport_screenshots)} out of {len(viewport_results)} viewport screenshots')
|
||||
# ALL viewport screenshots must succeed
|
||||
assert len(viewport_failures) == 0, (
|
||||
f'{len(viewport_failures)} viewport screenshots failed: {[(i, type(e).__name__, str(e)) for i, e in viewport_failures]}'
|
||||
)
|
||||
assert len(viewport_screenshots) == 10, (
|
||||
f'Expected 10 successful viewport screenshots, got {len(viewport_screenshots)}'
|
||||
)
|
||||
print('✅ All 10 viewport screenshots captured successfully!')
|
||||
|
||||
# Verify viewport screenshots
|
||||
# Verify all 10 viewport screenshots
|
||||
print('Verifying all 10 viewport screenshots...')
|
||||
for i, screenshot in enumerate(viewport_screenshots):
|
||||
assert screenshot is not None, f'Viewport screenshot {i} is None'
|
||||
screenshot_bytes = base64.b64decode(screenshot)
|
||||
assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n')
|
||||
# Viewport screenshots should be smaller than full page
|
||||
assert len(screenshot_bytes) > 5000, f'Viewport screenshot {i} too small'
|
||||
assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Viewport screenshot {i} is not a valid PNG'
|
||||
# Viewport screenshots should be reasonably sized
|
||||
assert len(screenshot_bytes) > 5000, f'Viewport screenshot {i} too small: {len(screenshot_bytes)} bytes'
|
||||
print('✅ All 10 viewport screenshots validated successfully!')
|
||||
|
||||
finally:
|
||||
# Kill all sessions in parallel
|
||||
|
||||
Reference in New Issue
Block a user