diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml index 9f976b433..94817fce0 100644 --- a/.github/workflows/eval.yaml +++ b/.github/workflows/eval.yaml @@ -89,7 +89,11 @@ jobs: - name: Install Playwright browser dependencies run: | echo "Installing Playwright browsers..." - playwright install --no-shell chromium + # comment out some based on whether stealth=True is used for evals or not + playwright install --no-shell chromium --with-deps + patchright install --no-shell chromium --with-deps + playwright install --no-shell chrome --with-deps + patchright install --no-shell chrome --with-deps echo "Playwright browsers installed successfully" - name: Install Xvfb for headed mode diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 385c2c290..dad515518 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -88,6 +88,8 @@ jobs: - run: playwright install chromium --with-deps - run: playwright install chrome --with-deps + - run: patchright install chrome --with-deps + - run: patchright install chromium --with-deps - run: pytest tests/ci/${{ matrix.test_filename }}.py @@ -107,7 +109,7 @@ jobs: enable-cache: true activate-environment: true - - run: uv sync --dev + - run: uv sync --dev --all-extras - name: Detect installed Playwright version run: echo "PLAYWRIGHT_VERSION=$(uv pip list --format json | jq -r '.[] | select(.name == "playwright") | .version')" >> $GITHUB_ENV @@ -121,6 +123,8 @@ jobs: - run: playwright install chromium --with-deps - run: playwright install chrome --with-deps + - run: patchright install chrome --with-deps + - run: patchright install chromium --with-deps - name: Run agent tasks evaluation and capture score id: eval diff --git a/tests/ci/test_browser_session_screenshots.py b/tests/ci/test_browser_session_screenshots.py index d74cbbcee..9e25b2398 100644 --- a/tests/ci/test_browser_session_screenshots.py +++ b/tests/ci/test_browser_session_screenshots.py @@ -206,8 +206,8 @@ class TestHeadlessScreenshots: results = await asyncio.gather(*screenshot_tasks, return_exceptions=True) total_time = time.time() - start_time - # Verify timing - assert total_time < 60, f'Screenshots took too long: {total_time:.1f}s (should be < 60s)' + # Verify timing - maximum should be 200s (20s × 10) + assert total_time < 200, f'Screenshots took too long: {total_time:.1f}s (should be < 200s)' print(f'All screenshot attempts completed in {total_time:.1f}s') # Separate successful screenshots from failures @@ -219,45 +219,49 @@ class TestHeadlessScreenshots: print(f'Session {i} failed: {type(result).__name__}: {result}') else: screenshots.append(result) + print(f'Session {i} screenshot completed successfully') - # We should have at least some successful screenshots - assert len(screenshots) > 0, f'All screenshots failed! {len(failures)} failures' - print(f'Successfully captured {len(screenshots)} out of {len(results)} screenshots') + # ALL screenshots must succeed + assert len(failures) == 0, ( + f'{len(failures)} screenshots failed: {[(i, type(e).__name__, str(e)) for i, e in failures]}' + ) + assert len(screenshots) == 10, f'Expected 10 successful screenshots, got {len(screenshots)}' + print('✅ All 10 screenshots captured successfully!') # Verify all screenshots are valid - print('Verifying all screenshots...') + print('Verifying all 10 screenshots...') for i, screenshot in enumerate(screenshots): # Should not be None - assert screenshot is not None, f'Session {i} returned None screenshot' - assert isinstance(screenshot, str), f'Session {i} screenshot is not a string' - assert len(screenshot) > 0, f'Session {i} screenshot is empty' + assert screenshot is not None, f'Screenshot {i} returned None' + assert isinstance(screenshot, str), f'Screenshot {i} is not a string' + assert len(screenshot) > 0, f'Screenshot {i} is empty' # Decode and validate try: screenshot_bytes = base64.b64decode(screenshot) except Exception as e: - raise AssertionError(f'Session {i} screenshot is not valid base64: {e}') + raise AssertionError(f'Screenshot {i} is not valid base64: {e}') # Verify PNG signature - assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Session {i} screenshot is not a valid PNG' + assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Screenshot {i} is not a valid PNG' # Full page screenshot should be reasonably large - # Due to our 6,000px height limit, expect at least 30KB - assert len(screenshot_bytes) > 5000, f'Session {i} screenshot too small: {len(screenshot_bytes)} bytes' + # Due to our 6,000px height limit, expect at least 5KB + assert len(screenshot_bytes) > 5000, f'Screenshot {i} too small: {len(screenshot_bytes)} bytes' - print(f'All {len(screenshots)} screenshots validated successfully!') + print('✅ All 10 screenshots validated successfully!') # Also test taking regular (viewport) screenshots - print('Taking viewport screenshots from all sessions...') + print('\nTaking viewport screenshots from all sessions...') start_time = time.time() viewport_results = await asyncio.gather( *[session.take_screenshot() for session in browser_sessions], return_exceptions=True ) viewport_time = time.time() - start_time - assert viewport_time < 60, f'Viewport screenshots took too long: {viewport_time:.1f}s (should be < 60s)' + assert viewport_time < 200, f'Viewport screenshots took too long: {viewport_time:.1f}s (should be < 200s)' print(f'All viewport screenshot attempts completed in {viewport_time:.1f}s') - # Separate successful viewport screenshots from failures + # Check for failures viewport_screenshots = [] viewport_failures = [] for i, result in enumerate(viewport_results): @@ -266,17 +270,26 @@ class TestHeadlessScreenshots: print(f'Session {i} viewport failed: {type(result).__name__}: {result}') else: viewport_screenshots.append(result) + print(f'Session {i} viewport screenshot completed successfully') - assert len(viewport_screenshots) > 0, f'All viewport screenshots failed! {len(viewport_failures)} failures' - print(f'Successfully captured {len(viewport_screenshots)} out of {len(viewport_results)} viewport screenshots') + # ALL viewport screenshots must succeed + assert len(viewport_failures) == 0, ( + f'{len(viewport_failures)} viewport screenshots failed: {[(i, type(e).__name__, str(e)) for i, e in viewport_failures]}' + ) + assert len(viewport_screenshots) == 10, ( + f'Expected 10 successful viewport screenshots, got {len(viewport_screenshots)}' + ) + print('✅ All 10 viewport screenshots captured successfully!') - # Verify viewport screenshots + # Verify all 10 viewport screenshots + print('Verifying all 10 viewport screenshots...') for i, screenshot in enumerate(viewport_screenshots): assert screenshot is not None, f'Viewport screenshot {i} is None' screenshot_bytes = base64.b64decode(screenshot) - assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n') - # Viewport screenshots should be smaller than full page - assert len(screenshot_bytes) > 5000, f'Viewport screenshot {i} too small' + assert screenshot_bytes.startswith(b'\x89PNG\r\n\x1a\n'), f'Viewport screenshot {i} is not a valid PNG' + # Viewport screenshots should be reasonably sized + assert len(screenshot_bytes) > 5000, f'Viewport screenshot {i} too small: {len(screenshot_bytes)} bytes' + print('✅ All 10 viewport screenshots validated successfully!') finally: # Kill all sessions in parallel