install all browser versions for evals and tests

2026-05-06 17:52:15 +02:00 · 2025-07-08 06:05:20 -07:00
parent 80327bb63e
commit 4d8bdb3dbf
3 changed files with 46 additions and 25 deletions
--- a/.github/workflows/eval.yaml
+++ b/.github/workflows/eval.yaml
@@ -89,7 +89,11 @@ jobs:
      - name: Install Playwright browser dependencies
        run: |
          echo "Installing Playwright browsers..."
-          playwright install --no-shell chromium
+          # Comment out whichever of the installs below are not needed, depending on whether evals run with stealth=True
+          playwright install --no-shell chromium --with-deps
+          patchright install --no-shell chromium --with-deps
+          playwright install --no-shell chrome --with-deps
+          patchright install --no-shell chrome --with-deps
          echo "Playwright browsers installed successfully"

      - name: Install Xvfb for headed mode
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -88,6 +88,8 @@ jobs:

      - run: playwright install chromium --with-deps
      - run: playwright install chrome --with-deps 
+      - run: patchright install chrome --with-deps
+      - run: patchright install chromium --with-deps

      - run: pytest tests/ci/${{ matrix.test_filename }}.py

@@ -107,7 +109,7 @@ jobs:
          enable-cache: true
          activate-environment: true

-      - run: uv sync --dev
+      - run: uv sync --dev --all-extras

      - name: Detect installed Playwright version
        run: echo "PLAYWRIGHT_VERSION=$(uv pip list --format json | jq -r '.[] | select(.name == "playwright") | .version')" >> $GITHUB_ENV
@@ -121,6 +123,8 @@ jobs:

      - run: playwright install chromium --with-deps
      - run: playwright install chrome --with-deps
+      - run: patchright install chrome --with-deps
+      - run: patchright install chromium --with-deps

      - name: Run agent tasks evaluation and capture score
        id: eval
--- a/tests/ci/test_browser_session_screenshots.py
+++ b/tests/ci/test_browser_session_screenshots.py
@@ -206,8 +206,8 @@ class TestHeadlessScreenshots:
 			results = await asyncio.gather(*screenshot_tasks, return_exceptions=True)
 			total_time = time.time() - start_time

-			# Verify timing
-			assert total_time < 60, f'Screenshots took too long: {total_time:.1f}s (should be < 60s)'
+			# Verify timing - maximum should be 200s (20s × 10)
+			assert total_time < 200, f'Screenshots took too long: {total_time:.1f}s (should be < 200s)'
 			print(f'All screenshot attempts completed in {total_time:.1f}s')

 			# Separate successful screenshots from failures
@@ -219,45 +219,49 @@ class TestHeadlessScreenshots:
 					print(f'Session {i} failed: {type(result).__name__}: {result}')
 				else:
 					screenshots.append(result)
+					print(f'Session {i} screenshot completed successfully')

-			# We should have at least some successful screenshots
-			assert len(screenshots) > 0, f'All screenshots failed! {len(failures)} failures'
-			print(f'Successfully captured {len(screenshots)} out of {len(results)} screenshots')
+			# ALL screenshots must succeed
+			assert len(failures) == 0, (
+				f'{len(failures)} screenshots failed: {[(i, type(e).__name__, str(e)) for i, e in failures]}'
+			)
+			assert len(screenshots) == 10, f'Expected 10 successful screenshots, got {len(screenshots)}'
+			print('✅ All 10 screenshots captured successfully!')

 			# Verify all screenshots are valid
-			print('Verifying all screenshots...')
+			print('Verifying all 10 screenshots...')
 			for i, screenshot in enumerate(screenshots):
 				# Should not be None
-				assert screenshot is not None, f'Session {i} returned None screenshot'
-				assert isinstance(screenshot, str), f'Session {i} screenshot is not a string'
-				assert len(screenshot) > 0, f'Session {i} screenshot is empty'
+				assert screenshot is not None, f'Screenshot {i} returned None'
+				assert isinstance(screenshot, str), f'Screenshot {i} is not a string'
+				assert len(screenshot) > 0, f'Screenshot {i} is empty'

 				# Decode and validate
 				try:
 					screenshot_bytes = base64.b64decode(screenshot)
 				except Exception as e:
-					raise AssertionError(f'Session {i} screenshot is not valid base64: {e}')
+					raise AssertionError(f'Screenshot {i} is not valid base64: {e}')

 				# Verify PNG signature
-				assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Session {i} screenshot is not a valid PNG'
+				assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Screenshot {i} is not a valid PNG'

 				# Full page screenshot should be reasonably large
-				# Due to our 6,000px height limit, expect at least 30KB
-				assert len(screenshot_bytes) > 5000, f'Session {i} screenshot too small: {len(screenshot_bytes)} bytes'
+				# Due to our 6,000px height limit, expect at least 5KB
+				assert len(screenshot_bytes) > 5000, f'Screenshot {i} too small: {len(screenshot_bytes)} bytes'

-			print(f'All {len(screenshots)} screenshots validated successfully!')
+			print('✅ All 10 screenshots validated successfully!')

 			# Also test taking regular (viewport) screenshots
-			print('Taking viewport screenshots from all sessions...')
+			print('\nTaking viewport screenshots from all sessions...')
 			start_time = time.time()
 			viewport_results = await asyncio.gather(
 				*[session.take_screenshot() for session in browser_sessions], return_exceptions=True
 			)
 			viewport_time = time.time() - start_time
-			assert viewport_time < 60, f'Viewport screenshots took too long: {viewport_time:.1f}s (should be < 60s)'
+			assert viewport_time < 200, f'Viewport screenshots took too long: {viewport_time:.1f}s (should be < 200s)'
 			print(f'All viewport screenshot attempts completed in {viewport_time:.1f}s')

-			# Separate successful viewport screenshots from failures
+			# Check for failures
 			viewport_screenshots = []
 			viewport_failures = []
 			for i, result in enumerate(viewport_results):
@@ -266,17 +270,26 @@ class TestHeadlessScreenshots:
 					print(f'Session {i} viewport failed: {type(result).__name__}: {result}')
 				else:
 					viewport_screenshots.append(result)
+					print(f'Session {i} viewport screenshot completed successfully')

-			assert len(viewport_screenshots) > 0, f'All viewport screenshots failed! {len(viewport_failures)} failures'
-			print(f'Successfully captured {len(viewport_screenshots)} out of {len(viewport_results)} viewport screenshots')
+			# ALL viewport screenshots must succeed
+			assert len(viewport_failures) == 0, (
+				f'{len(viewport_failures)} viewport screenshots failed: {[(i, type(e).__name__, str(e)) for i, e in viewport_failures]}'
+			)
+			assert len(viewport_screenshots) == 10, (
+				f'Expected 10 successful viewport screenshots, got {len(viewport_screenshots)}'
+			)
+			print('✅ All 10 viewport screenshots captured successfully!')

-			# Verify viewport screenshots
+			# Verify all 10 viewport screenshots
+			print('Verifying all 10 viewport screenshots...')
 			for i, screenshot in enumerate(viewport_screenshots):
 				assert screenshot is not None, f'Viewport screenshot {i} is None'
 				screenshot_bytes = base64.b64decode(screenshot)
-				assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n')
-				# Viewport screenshots should be smaller than full page
-				assert len(screenshot_bytes) > 5000, f'Viewport screenshot {i} too small'
+				assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Viewport screenshot {i} is not a valid PNG'
+				# Viewport screenshots should be reasonably sized
+				assert len(screenshot_bytes) > 5000, f'Viewport screenshot {i} too small: {len(screenshot_bytes)} bytes'
+			print('✅ All 10 viewport screenshots validated successfully!')

 		finally:
 			# Kill all sessions in parallel