browser-use/tests/scripts/debug_iframe_scrolling.py

"""
Debug test for iframe scrolling issue where DOM tree only shows top elements after scrolling.

This test verifies that after scrolling inside an iframe, the selector_map correctly
contains lower input elements like City, State, Zip Code, etc.
"""

import asyncio
import sys
from pathlib import Path

# Add parent directory to path to import browser_use modules
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from browser_use.agent.service import Agent
from browser_use.agent.views import ActionModel, AgentOutput
from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.browser.events import BrowserStateRequestEvent
from browser_use.controller.service import Controller
from browser_use.controller.views import GoToUrlAction, ScrollAction

# Import the mock LLM helper from conftest
from tests.ci.conftest import create_mock_llm


async def debug_iframe_scrolling():
	"""Debug iframe scrolling and DOM visibility issue."""

	print("Starting iframe scrolling debug test...")

	# Create the sequence of actions for the mock LLM
	# We need to format these as the LLM would return them
	actions = [
		# First action: Navigate to the test URL
		"""
		{
			"thinking": "Navigating to the iframe test page",
			"evaluation_previous_goal": null,
			"memory": "Starting test",
			"next_goal": "Navigate to the iframe test page",
			"action": [
				{
					"go_to_url": {
						"url": "https://browser-use.github.io/stress-tests/challenges/iframe-inception-level1.html",
						"new_tab": false
					}
				}
			]
		}
		""",
		# Second action: Input text in the first name field (to verify we can interact)
		"""
		{
			"thinking": "Inputting text in the first name field to test interaction",
			"evaluation_previous_goal": "Successfully navigated to the page",
			"memory": "Page loaded with nested iframes",
			"next_goal": "Type text in the first name field",
			"action": [
				{
					"input_text": {
						"index": 1,
						"text": "TestName"
					}
				}
			]
		}
		""",
		# Third action: Scroll the iframe (element_index=2 should be the iframe)
		"""
		{
			"thinking": "Scrolling inside the iframe to reveal lower form elements",
			"evaluation_previous_goal": "Successfully typed in first name field",
			"memory": "Typed TestName in first field",
			"next_goal": "Scroll inside the innermost iframe to see more form fields",
			"action": [
				{
					"scroll": {
						"down": true,
						"num_pages": 1.0,
						"index": 2
					}
				}
			]
		}
		""",
		# Fourth action: Done
		"""
		{
			"thinking": "Completed scrolling, ready to inspect DOM",
			"evaluation_previous_goal": "Successfully scrolled inside iframe",
			"memory": "Scrolled to reveal lower form fields",
			"next_goal": "Task completed",
			"action": [
				{
					"done": {
						"text": "Scrolling completed",
						"success": true
					}
				}
			]
		}
		"""
	]

	# Create mock LLM with our action sequence
	mock_llm = create_mock_llm(actions=actions)

	# Create browser session with headless=False so we can see what's happening
	browser_session = BrowserSession(
		browser_profile=BrowserProfile(
			headless=False,  # Set to False to see the browser
			user_data_dir=None,  # Use temporary directory
			keep_alive=True,
			enable_default_extensions=True,
			cross_origin_iframes=True,  # Enable cross-origin iframe support
		)
	)

	try:
		# Start the browser session
		await browser_session.start()
		print("Browser session started")

		# Create an agent with the mock LLM
		agent = Agent(
			task="Navigate to the iframe test page and scroll inside the iframe",
			llm=mock_llm,
			browser_session=browser_session,
		)

		# Helper function to capture and analyze DOM state
		async def capture_dom_state(label: str) -> dict:
			"""Capture DOM state and return analysis"""
			print(f"\n📸 Capturing DOM state: {label}")
			state_event = browser_session.event_bus.dispatch(
				BrowserStateRequestEvent(
					include_dom=True,
					include_screenshot=False,
					cache_clickable_elements_hashes=True,
					include_recent_events=False
				)
			)
			browser_state = await state_event.event_result()

			if browser_state and browser_state.dom_state and browser_state.dom_state.selector_map:
				selector_map = browser_state.dom_state.selector_map
				element_count = len(selector_map)

				# Check for specific elements
				found_elements = {}
				expected_checks = [
					("First Name", ["firstName", "first name"]),
					("Last Name", ["lastName", "last name"]),
					("Email", ["email"]),
					("City", ["city"]),
					("State", ["state"]),
					("Zip", ["zip", "zipCode"]),
				]

				for name, keywords in expected_checks:
					for index, element in selector_map.items():
						element_str = str(element).lower()
						if any(kw.lower() in element_str for kw in keywords):
							found_elements[name] = True
							break

				return {
					"label": label,
					"total_elements": element_count,
					"found_elements": found_elements,
					"selector_map": selector_map
				}
			return {"label": label, "error": "No DOM state available"}

		# Capture initial state before any actions
		print("\n" + "="*80)
		print("PHASE 1: INITIAL PAGE LOAD")
		print("="*80)

		# Navigate to the page first
		from browser_use.controller.views import GoToUrlAction
		from browser_use.controller.service import Controller
		controller = Controller()

		# Create the action model for navigation
		goto_action = ActionModel.model_validate_json(actions[0])
		await controller.act(goto_action, browser_session)
		await asyncio.sleep(2)  # Wait for page to fully load

		initial_state = await capture_dom_state("INITIAL (after page load)")

		# Now run the rest of the actions via the agent
		print("\n" + "="*80)
		print("PHASE 2: EXECUTING ACTIONS")
		print("="*80)

		# Create new agent with remaining actions
		remaining_actions = actions[1:]  # Skip the navigation we already did
		mock_llm_remaining = create_mock_llm(actions=remaining_actions)
		agent = Agent(
			task="Input text and scroll inside the iframe",
			llm=mock_llm_remaining,
			browser_session=browser_session,
		)

		# Hook into agent actions to capture state after each one
		states = []
		original_act = controller.act
		async def wrapped_act(action, session):
			result = await original_act(action, session)
			# Capture state after each action
			action_type = "unknown"
			if hasattr(action, 'input_text') and action.input_text:
				action_type = "input_text"
				await asyncio.sleep(1)  # Give time for DOM to update
				state = await capture_dom_state("AFTER INPUT_TEXT")
				states.append(state)
			elif hasattr(action, 'scroll') and action.scroll:
				action_type = "scroll"
				await asyncio.sleep(2)  # Give more time after scroll
				state = await capture_dom_state("AFTER SCROLL")
				states.append(state)
			return result

		controller.act = wrapped_act

		# Run the agent with remaining actions
		result = await agent.run()
		print(f"\nAgent completed with result: {result}")

		# Analyze all captured states
		print("\n" + "="*80)
		print("PHASE 3: ANALYSIS OF DOM STATES")
		print("="*80)

		all_states = [initial_state] + states

		for state in all_states:
			if "error" in state:
				print(f"\n❌ {state['label']}: {state['error']}")
			else:
				print(f"\n📊 {state['label']}:")
				print(f"  Total elements: {state['total_elements']}")
				print(f"  Found elements:")
				for elem_name, found in state['found_elements'].items():
					status = "✓" if found else "✗"
					print(f"    {status} {elem_name}")

		# Compare states
		print("\n" + "="*80)
		print("COMPARISON SUMMARY")
		print("="*80)

		if len(all_states) >= 3:
			initial = all_states[0]
			after_input = all_states[1] if len(all_states) > 1 else None
			after_scroll = all_states[2] if len(all_states) > 2 else None

			print(f"\nElement count changes:")
			print(f"  Initial: {initial.get('total_elements', 0)} elements")
			if after_input:
				print(f"  After input_text: {after_input.get('total_elements', 0)} elements")
			if after_scroll:
				print(f"  After scroll: {after_scroll.get('total_elements', 0)} elements")

			# Check if lower form fields appear after scroll
			if after_scroll and "found_elements" in after_scroll:
				lower_fields = ["City", "State", "Zip"]
				missing_fields = [f for f in lower_fields if not after_scroll["found_elements"].get(f, False)]

				if missing_fields:
					print(f"\n⚠️  BUG CONFIRMED: Lower form fields missing after scroll:")
					for field in missing_fields:
						print(f"    ✗ {field}")
					print("\nThis confirms that scrolling inside iframes does not update the DOM tree properly.")
				else:
					print("\n✅ SUCCESS: All lower form fields are visible after scrolling!")

		# Show first few elements from final state for debugging
		if states and "selector_map" in states[-1]:
			print("\n" + "="*80)
			print("DEBUG: First 5 elements in final selector_map")
			print("="*80)
			final_map = states[-1]["selector_map"]
			for i, (index, element) in enumerate(list(final_map.items())[:5]):
				elem_preview = str(element)[:150]
				print(f"\n  [{index}]: {elem_preview}...")

		# Keep browser open for manual inspection if needed
		print("\n" + "="*80)
		print("Test complete. Browser will remain open for 10 seconds for inspection...")
		print("="*80)
		await asyncio.sleep(10)

	finally:
		# Clean up
		print("\nCleaning up...")
		await browser_session.kill()
		await browser_session.event_bus.stop(clear=True, timeout=5)
		print("Browser session closed")


if __name__ == "__main__":
	# Run the debug test
	asyncio.run(debug_iframe_scrolling())