install all browser versions for evals and tests

This commit is contained in:
Nick Sweeting
2025-07-08 06:05:20 -07:00
parent 80327bb63e
commit 4d8bdb3dbf
3 changed files with 46 additions and 25 deletions

View File

@@ -89,7 +89,11 @@ jobs:
- name: Install Playwright browser dependencies
run: |
echo "Installing Playwright browsers..."
playwright install --no-shell chromium
# Comment out whichever of the install commands below are not needed, depending on whether evals run with stealth=True
playwright install --no-shell chromium --with-deps
patchright install --no-shell chromium --with-deps
playwright install --no-shell chrome --with-deps
patchright install --no-shell chrome --with-deps
echo "Playwright browsers installed successfully"
- name: Install Xvfb for headed mode

View File

@@ -88,6 +88,8 @@ jobs:
- run: playwright install chromium --with-deps
- run: playwright install chrome --with-deps
- run: patchright install chrome --with-deps
- run: patchright install chromium --with-deps
- run: pytest tests/ci/${{ matrix.test_filename }}.py
@@ -107,7 +109,7 @@ jobs:
enable-cache: true
activate-environment: true
- run: uv sync --dev
- run: uv sync --dev --all-extras
- name: Detect installed Playwright version
run: echo "PLAYWRIGHT_VERSION=$(uv pip list --format json | jq -r '.[] | select(.name == "playwright") | .version')" >> $GITHUB_ENV
@@ -121,6 +123,8 @@ jobs:
- run: playwright install chromium --with-deps
- run: playwright install chrome --with-deps
- run: patchright install chrome --with-deps
- run: patchright install chromium --with-deps
- name: Run agent tasks evaluation and capture score
id: eval

View File

@@ -206,8 +206,8 @@ class TestHeadlessScreenshots:
results = await asyncio.gather(*screenshot_tasks, return_exceptions=True)
total_time = time.time() - start_time
# Verify timing
assert total_time < 60, f'Screenshots took too long: {total_time:.1f}s (should be < 60s)'
# Verify timing - maximum should be 200s (20s × 10)
assert total_time < 200, f'Screenshots took too long: {total_time:.1f}s (should be < 200s)'
print(f'All screenshot attempts completed in {total_time:.1f}s')
# Separate successful screenshots from failures
@@ -219,45 +219,49 @@ class TestHeadlessScreenshots:
print(f'Session {i} failed: {type(result).__name__}: {result}')
else:
screenshots.append(result)
print(f'Session {i} screenshot completed successfully')
# We should have at least some successful screenshots
assert len(screenshots) > 0, f'All screenshots failed! {len(failures)} failures'
print(f'Successfully captured {len(screenshots)} out of {len(results)} screenshots')
# ALL screenshots must succeed
assert len(failures) == 0, (
f'{len(failures)} screenshots failed: {[(i, type(e).__name__, str(e)) for i, e in failures]}'
)
assert len(screenshots) == 10, f'Expected 10 successful screenshots, got {len(screenshots)}'
print('✅ All 10 screenshots captured successfully!')
# Verify all screenshots are valid
print('Verifying all screenshots...')
print('Verifying all 10 screenshots...')
for i, screenshot in enumerate(screenshots):
# Should not be None
assert screenshot is not None, f'Session {i} returned None screenshot'
assert isinstance(screenshot, str), f'Session {i} screenshot is not a string'
assert len(screenshot) > 0, f'Session {i} screenshot is empty'
assert screenshot is not None, f'Screenshot {i} returned None'
assert isinstance(screenshot, str), f'Screenshot {i} is not a string'
assert len(screenshot) > 0, f'Screenshot {i} is empty'
# Decode and validate
try:
screenshot_bytes = base64.b64decode(screenshot)
except Exception as e:
raise AssertionError(f'Session {i} screenshot is not valid base64: {e}')
raise AssertionError(f'Screenshot {i} is not valid base64: {e}')
# Verify PNG signature
assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Session {i} screenshot is not a valid PNG'
assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Screenshot {i} is not a valid PNG'
# Full page screenshot should be reasonably large
# Due to our 6,000px height limit, expect at least 30KB
assert len(screenshot_bytes) > 5000, f'Session {i} screenshot too small: {len(screenshot_bytes)} bytes'
# Due to our 6,000px height limit, expect at least 5KB
assert len(screenshot_bytes) > 5000, f'Screenshot {i} too small: {len(screenshot_bytes)} bytes'
print(f'All {len(screenshots)} screenshots validated successfully!')
print('All 10 screenshots validated successfully!')
# Also test taking regular (viewport) screenshots
print('Taking viewport screenshots from all sessions...')
print('\nTaking viewport screenshots from all sessions...')
start_time = time.time()
viewport_results = await asyncio.gather(
*[session.take_screenshot() for session in browser_sessions], return_exceptions=True
)
viewport_time = time.time() - start_time
assert viewport_time < 60, f'Viewport screenshots took too long: {viewport_time:.1f}s (should be < 60s)'
assert viewport_time < 200, f'Viewport screenshots took too long: {viewport_time:.1f}s (should be < 200s)'
print(f'All viewport screenshot attempts completed in {viewport_time:.1f}s')
# Separate successful viewport screenshots from failures
# Check for failures
viewport_screenshots = []
viewport_failures = []
for i, result in enumerate(viewport_results):
@@ -266,17 +270,26 @@ class TestHeadlessScreenshots:
print(f'Session {i} viewport failed: {type(result).__name__}: {result}')
else:
viewport_screenshots.append(result)
print(f'Session {i} viewport screenshot completed successfully')
assert len(viewport_screenshots) > 0, f'All viewport screenshots failed! {len(viewport_failures)} failures'
print(f'Successfully captured {len(viewport_screenshots)} out of {len(viewport_results)} viewport screenshots')
# ALL viewport screenshots must succeed
assert len(viewport_failures) == 0, (
f'{len(viewport_failures)} viewport screenshots failed: {[(i, type(e).__name__, str(e)) for i, e in viewport_failures]}'
)
assert len(viewport_screenshots) == 10, (
f'Expected 10 successful viewport screenshots, got {len(viewport_screenshots)}'
)
print('✅ All 10 viewport screenshots captured successfully!')
# Verify viewport screenshots
# Verify all 10 viewport screenshots
print('Verifying all 10 viewport screenshots...')
for i, screenshot in enumerate(viewport_screenshots):
assert screenshot is not None, f'Viewport screenshot {i} is None'
screenshot_bytes = base64.b64decode(screenshot)
assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n')
# Viewport screenshots should be smaller than full page
assert len(screenshot_bytes) > 5000, f'Viewport screenshot {i} too small'
assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Viewport screenshot {i} is not a valid PNG'
# Viewport screenshots should be reasonably sized
assert len(screenshot_bytes) > 5000, f'Viewport screenshot {i} too small: {len(screenshot_bytes)} bytes'
print('✅ All 10 viewport screenshots validated successfully!')
finally:
# Kill all sessions in parallel