browser-use/browser_use/dom/service.py

import asyncio
import logging
import time
from typing import TYPE_CHECKING, Any

from cdp_use.cdp.accessibility.commands import GetFullAXTreeReturns
from cdp_use.cdp.accessibility.types import AXNode
from cdp_use.cdp.dom.types import Node
from cdp_use.cdp.target import TargetID

from browser_use.dom.enhanced_snapshot import (
	REQUIRED_COMPUTED_STYLES,
	build_snapshot_lookup,
)
from browser_use.dom.serializer.clickable_elements import ClickableElementDetector
from browser_use.dom.serializer.serializer import DOMTreeSerializer
from browser_use.dom.views import (
	DOMRect,
	EnhancedAXNode,
	EnhancedAXProperty,
	EnhancedDOMTreeNode,
	NodeType,
	SerializedDOMState,
	TargetAllTrees,
)
from browser_use.observability import observe_debug
from browser_use.utils import create_task_with_error_handling

if TYPE_CHECKING:
	from browser_use.browser.session import BrowserSession

# Note: iframe limits are now configurable via BrowserProfile.max_iframes and BrowserProfile.max_iframe_depth


class DomService:
	"""
	Service for getting the DOM tree and other DOM-related information.

	Either browser or page must be provided.

	TODO: currently we start a new websocket connection PER STEP, we should definitely keep this persistent
	"""

	logger: logging.Logger

	def __init__(
		self,
		browser_session: 'BrowserSession',
		logger: logging.Logger | None = None,
		cross_origin_iframes: bool = False,
		paint_order_filtering: bool = True,
		max_iframes: int = 100,
		max_iframe_depth: int = 5,
		viewport_threshold: int | None = 1000,
	):
		self.browser_session = browser_session
		self.logger = logger or browser_session.logger
		self.cross_origin_iframes = cross_origin_iframes
		self.paint_order_filtering = paint_order_filtering
		self.max_iframes = max_iframes
		self.max_iframe_depth = max_iframe_depth
		self.viewport_threshold = viewport_threshold

	async def __aenter__(self):
		return self

	async def __aexit__(self, exc_type, exc_value, traceback):
		pass  # no need to cleanup anything, browser_session auto handles cleaning up session cache

	def _count_hidden_elements_in_iframes(self, node: EnhancedDOMTreeNode) -> None:
		"""Collect hidden interactive elements in iframes for LLM hints.

		For each iframe, collects details of hidden interactive elements including
		tag, text/name, and scroll distance in pages so the agent knows how far to scroll.
		"""

		def is_hidden_by_threshold(element: EnhancedDOMTreeNode) -> bool:
			"""Check if element is hidden by viewport threshold (not CSS)."""
			if element.is_visible or not element.snapshot_node or not element.snapshot_node.bounds:
				return False

			computed_styles = element.snapshot_node.computed_styles or {}
			display = computed_styles.get('display', '').lower()
			visibility = computed_styles.get('visibility', '').lower()
			opacity = computed_styles.get('opacity', '1')

			css_hidden = display == 'none' or visibility == 'hidden'
			try:
				css_hidden = css_hidden or float(opacity) <= 0
			except (ValueError, TypeError):
				pass

			return not css_hidden

		def collect_hidden_elements(subtree_root: EnhancedDOMTreeNode, viewport_height: float) -> list[dict[str, Any]]:
			"""Collect hidden interactive elements from subtree."""
			hidden: list[dict[str, Any]] = []

			if subtree_root.node_type == NodeType.ELEMENT_NODE:
				is_interactive = ClickableElementDetector.is_interactive(subtree_root)

				if is_interactive and is_hidden_by_threshold(subtree_root):
					# Get element text/name
					text = ''
					if subtree_root.ax_node and subtree_root.ax_node.name:
						text = subtree_root.ax_node.name[:40]
					elif subtree_root.attributes:
						text = (
							subtree_root.attributes.get('placeholder', '')
							or subtree_root.attributes.get('title', '')
							or subtree_root.attributes.get('aria-label', '')
						)[:40]

					# Get y position and convert to pages
					y_pos = 0.0
					if subtree_root.snapshot_node and subtree_root.snapshot_node.bounds:
						y_pos = subtree_root.snapshot_node.bounds.y
					pages_down = round(y_pos / viewport_height, 1) if viewport_height > 0 else 0

					hidden.append(
						{
							'tag': subtree_root.tag_name or '?',
							'text': text or '(no label)',
							'pages': pages_down,
						}
					)

			for child in subtree_root.children_nodes or []:
				hidden.extend(collect_hidden_elements(child, viewport_height))

			for shadow_root in subtree_root.shadow_roots or []:
				hidden.extend(collect_hidden_elements(shadow_root, viewport_height))

			return hidden

		def has_any_hidden_content(subtree_root: EnhancedDOMTreeNode) -> bool:
			"""Check if there's any hidden content (interactive or not) in subtree."""
			if is_hidden_by_threshold(subtree_root):
				return True

			for child in subtree_root.children_nodes or []:
				if has_any_hidden_content(child):
					return True

			for shadow_root in subtree_root.shadow_roots or []:
				if has_any_hidden_content(shadow_root):
					return True

			return False

		def process_node(current_node: EnhancedDOMTreeNode) -> None:
			"""Process node and descendants, collecting hidden elements for iframes."""
			if (
				current_node.node_type == NodeType.ELEMENT_NODE
				and current_node.tag_name
				and current_node.tag_name.upper() in ('IFRAME', 'FRAME')
				and current_node.content_document
			):
				# Get viewport height from iframe's client rect
				viewport_height = 0.0
				if current_node.snapshot_node and current_node.snapshot_node.clientRects:
					viewport_height = current_node.snapshot_node.clientRects.height

				hidden = collect_hidden_elements(current_node.content_document, viewport_height)
				# Sort by pages and limit to avoid bloating context
				hidden.sort(key=lambda x: x['pages'])
				current_node.hidden_elements_info = hidden[:10]  # Limit to 10

				# Check for hidden non-interactive content when no interactive elements found
				if not hidden and has_any_hidden_content(current_node.content_document):
					current_node.has_hidden_content = True

			for child in current_node.children_nodes or []:
				process_node(child)

			if current_node.content_document:
				process_node(current_node.content_document)

			for shadow_root in current_node.shadow_roots or []:
				process_node(shadow_root)

		process_node(node)

	def _build_enhanced_ax_node(self, ax_node: AXNode) -> EnhancedAXNode:
		properties: list[EnhancedAXProperty] | None = None
		if 'properties' in ax_node and ax_node['properties']:
			properties = []
			for property in ax_node['properties']:
				try:
					# test whether property name can go into the enum (sometimes Chrome returns some random properties)
					properties.append(
						EnhancedAXProperty(
							name=property['name'],
							value=property.get('value', {}).get('value', None),
							# related_nodes=[],  # TODO: add related nodes
						)
					)
				except ValueError:
					pass

		enhanced_ax_node = EnhancedAXNode(
			ax_node_id=ax_node['nodeId'],
			ignored=ax_node['ignored'],
			role=ax_node.get('role', {}).get('value', None),
			name=ax_node.get('name', {}).get('value', None),
			description=ax_node.get('description', {}).get('value', None),
			properties=properties,
			child_ids=ax_node.get('childIds', []) if ax_node.get('childIds') else None,
		)
		return enhanced_ax_node

	async def _get_viewport_ratio(self, target_id: TargetID) -> float:
		"""Get viewport dimensions, device pixel ratio, and scroll position using CDP."""
		cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=target_id, focus=False)

		try:
			# Get the layout metrics which includes the visual viewport
			metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)

			visual_viewport = metrics.get('visualViewport', {})

			# IMPORTANT: Use CSS viewport instead of device pixel viewport
			# This fixes the coordinate mismatch on high-DPI displays
			css_visual_viewport = metrics.get('cssVisualViewport', {})
			css_layout_viewport = metrics.get('cssLayoutViewport', {})

			# Use CSS pixels (what JavaScript sees) instead of device pixels
			width = css_visual_viewport.get('clientWidth', css_layout_viewport.get('clientWidth', 1920.0))

			# Calculate device pixel ratio
			device_width = visual_viewport.get('clientWidth', width)
			css_width = css_visual_viewport.get('clientWidth', width)
			device_pixel_ratio = device_width / css_width if css_width > 0 else 1.0

			return float(device_pixel_ratio)
		except Exception as e:
			self.logger.debug(f'Viewport size detection failed: {e}')
			# Fallback to default viewport size
			return 1.0

	@classmethod
	def is_element_visible_according_to_all_parents(
		cls, node: EnhancedDOMTreeNode, html_frames: list[EnhancedDOMTreeNode], viewport_threshold: int | None = 1000
	) -> bool:
		"""Check if the element is visible according to all its parent HTML frames.

		Args:
			node: The DOM node to check visibility for
			html_frames: List of parent HTML frame nodes
			viewport_threshold: Pixel threshold beyond viewport to consider visible.
				Default 1000px. Set to None to disable threshold checking entirely.
		"""

		if not node.snapshot_node:
			return False

		computed_styles = node.snapshot_node.computed_styles or {}

		display = computed_styles.get('display', '').lower()
		visibility = computed_styles.get('visibility', '').lower()
		opacity = computed_styles.get('opacity', '1')

		if display == 'none' or visibility == 'hidden':
			return False

		try:
			if float(opacity) <= 0:
				return False
		except (ValueError, TypeError):
			pass

		# Start with the element's local bounds (in its own frame's coordinate system)
		current_bounds = node.snapshot_node.bounds

		if not current_bounds:
			return False  # If there are no bounds, the element is not visible

		# If threshold is None, skip all viewport-based filtering (only check CSS visibility)
		if viewport_threshold is None:
			return True

		"""
		Reverse iterate through the html frames (that can be either iframe or document -> if it's a document frame compare if the current bounds interest with it (taking scroll into account) otherwise move the current bounds by the iframe offset)
		"""
		for frame in reversed(html_frames):
			if (
				frame.node_type == NodeType.ELEMENT_NODE
				and (frame.node_name.upper() == 'IFRAME' or frame.node_name.upper() == 'FRAME')
				and frame.snapshot_node
				and frame.snapshot_node.bounds
			):
				iframe_bounds = frame.snapshot_node.bounds

				# negate the values added in `_construct_enhanced_node`
				current_bounds.x += iframe_bounds.x
				current_bounds.y += iframe_bounds.y

			if (
				frame.node_type == NodeType.ELEMENT_NODE
				and frame.node_name == 'HTML'
				and frame.snapshot_node
				and frame.snapshot_node.scrollRects
				and frame.snapshot_node.clientRects
			):
				# For iframe content, we need to check visibility within the iframe's viewport
				# The scrollRects represent the current scroll position
				# The clientRects represent the viewport size
				# Elements are visible if they fall within the viewport after accounting for scroll

				# The viewport of the frame (what's actually visible)
				viewport_left = 0  # Viewport always starts at 0 in frame coordinates
				viewport_top = 0
				viewport_right = frame.snapshot_node.clientRects.width
				viewport_bottom = frame.snapshot_node.clientRects.height

				# Adjust element bounds by the scroll offset to get position relative to viewport
				# When scrolled down, scrollRects.y is positive, so we subtract it from element's y
				adjusted_x = current_bounds.x - frame.snapshot_node.scrollRects.x
				adjusted_y = current_bounds.y - frame.snapshot_node.scrollRects.y

				frame_intersects = (
					adjusted_x < viewport_right
					and adjusted_x + current_bounds.width > viewport_left
					and adjusted_y < viewport_bottom + viewport_threshold
					and adjusted_y + current_bounds.height > viewport_top - viewport_threshold
				)

				if not frame_intersects:
					return False

				# Keep the original coordinate adjustment to maintain consistency
				# This adjustment is needed for proper coordinate transformation
				current_bounds.x -= frame.snapshot_node.scrollRects.x
				current_bounds.y -= frame.snapshot_node.scrollRects.y

		# If we reach here, element is visible in main viewport and all containing iframes
		return True

	async def _get_ax_tree_for_all_frames(self, target_id: TargetID) -> GetFullAXTreeReturns:
		"""Recursively collect all frames and merge their accessibility trees into a single array."""

		cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=target_id, focus=False)
		frame_tree = await cdp_session.cdp_client.send.Page.getFrameTree(session_id=cdp_session.session_id)

		def collect_all_frame_ids(frame_tree_node) -> list[str]:
			"""Recursively collect all frame IDs from the frame tree."""
			frame_ids = [frame_tree_node['frame']['id']]

			if 'childFrames' in frame_tree_node and frame_tree_node['childFrames']:
				for child_frame in frame_tree_node['childFrames']:
					frame_ids.extend(collect_all_frame_ids(child_frame))

			return frame_ids

		# Collect all frame IDs recursively
		all_frame_ids = collect_all_frame_ids(frame_tree['frameTree'])

		# Get accessibility tree for each frame
		ax_tree_requests = []
		for frame_id in all_frame_ids:
			ax_tree_request = cdp_session.cdp_client.send.Accessibility.getFullAXTree(
				params={'frameId': frame_id}, session_id=cdp_session.session_id
			)
			ax_tree_requests.append(ax_tree_request)

		# Wait for all requests to complete
		ax_trees = await asyncio.gather(*ax_tree_requests)

		# Merge all AX nodes into a single array
		merged_nodes: list[AXNode] = []
		for ax_tree in ax_trees:
			merged_nodes.extend(ax_tree['nodes'])

		return {'nodes': merged_nodes}

	async def _get_all_trees(self, target_id: TargetID) -> TargetAllTrees:
		cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=target_id, focus=False)

		# Wait for the page to be ready first
		try:
			ready_state = await cdp_session.cdp_client.send.Runtime.evaluate(
				params={'expression': 'document.readyState'}, session_id=cdp_session.session_id
			)
		except Exception as e:
			pass  # Page might not be ready yet
		# DEBUG: Log before capturing snapshot
		self.logger.debug(f'🔍 DEBUG: Capturing DOM snapshot for target {target_id}')

		# Get actual scroll positions for all iframes before capturing snapshot
		start_iframe_scroll = time.time()
		iframe_scroll_positions = {}
		try:
			scroll_result = await cdp_session.cdp_client.send.Runtime.evaluate(
				params={
					'expression': """
					(() => {
						const scrollData = {};
						const iframes = document.querySelectorAll('iframe');
						iframes.forEach((iframe, index) => {
							try {
								const doc = iframe.contentDocument || iframe.contentWindow.document;
								if (doc) {
									scrollData[index] = {
										scrollTop: doc.documentElement.scrollTop || doc.body.scrollTop || 0,
										scrollLeft: doc.documentElement.scrollLeft || doc.body.scrollLeft || 0
									};
								}
							} catch (e) {
								// Cross-origin iframe, can't access
							}
						});
						return scrollData;
					})()
					""",
					'returnByValue': True,
				},
				session_id=cdp_session.session_id,
			)
			if scroll_result and 'result' in scroll_result and 'value' in scroll_result['result']:
				iframe_scroll_positions = scroll_result['result']['value']
				for idx, scroll_data in iframe_scroll_positions.items():
					self.logger.debug(
						f'🔍 DEBUG: Iframe {idx} actual scroll position - scrollTop={scroll_data.get("scrollTop", 0)}, scrollLeft={scroll_data.get("scrollLeft", 0)}'
					)
		except Exception as e:
			self.logger.debug(f'Failed to get iframe scroll positions: {e}')
		iframe_scroll_ms = (time.time() - start_iframe_scroll) * 1000

		# Detect elements with JavaScript click event listeners (without mutating DOM)
		# On heavy pages (>10k elements) the querySelectorAll('*') + getEventListeners()
		# loop plus per-element DOM.describeNode CDP calls can take 10s+.
		# The JS expression below bails out early if the page is too heavy.
		# Elements are still detected via the accessibility tree and ClickableElementDetector.
		start_js_listener_detection = time.time()
		js_click_listener_backend_ids: set[int] = set()
		try:
			# Step 1: Run JS to find elements with click listeners and return them by reference
			js_listener_result = await cdp_session.cdp_client.send.Runtime.evaluate(
				params={
					'expression': """
					(() => {
						// getEventListeners is only available in DevTools context via includeCommandLineAPI
						if (typeof getEventListeners !== 'function') {
							return null;
						}

						const allElements = document.querySelectorAll('*');

						// Skip on heavy pages — listener detection is too expensive
						if (allElements.length > 10000) {
							return null;
						}

						const elementsWithListeners = [];

						for (const el of allElements) {
							try {
								const listeners = getEventListeners(el);
								// Check for click-related event listeners
								if (listeners.click || listeners.mousedown || listeners.mouseup || listeners.pointerdown || listeners.pointerup) {
									elementsWithListeners.push(el);
								}
							} catch (e) {
								// Ignore errors for individual elements (e.g., cross-origin)
							}
						}

						return elementsWithListeners;
					})()
					""",
					'includeCommandLineAPI': True,  # enables getEventListeners()
					'returnByValue': False,  # Return object references, not values
				},
				session_id=cdp_session.session_id,
			)

			result_object_id = js_listener_result.get('result', {}).get('objectId')
			if result_object_id:
				# Step 2: Get array properties to access each element
				array_props = await cdp_session.cdp_client.send.Runtime.getProperties(
					params={
						'objectId': result_object_id,
						'ownProperties': True,
					},
					session_id=cdp_session.session_id,
				)

				# Step 3: For each element, get its backend node ID via DOM.describeNode
				element_object_ids: list[str] = []
				for prop in array_props.get('result', []):
					# Array indices are numeric property names
					prop_name = prop.get('name', '') if isinstance(prop, dict) else ''
					if isinstance(prop_name, str) and prop_name.isdigit():
						prop_value = prop.get('value', {}) if isinstance(prop, dict) else {}
						if isinstance(prop_value, dict):
							object_id = prop_value.get('objectId')
							if object_id and isinstance(object_id, str):
								element_object_ids.append(object_id)

				# Batch resolve backend node IDs (run in parallel)
				async def get_backend_node_id(object_id: str) -> int | None:
					try:
						node_info = await cdp_session.cdp_client.send.DOM.describeNode(
							params={'objectId': object_id},
							session_id=cdp_session.session_id,
						)
						return node_info.get('node', {}).get('backendNodeId')
					except Exception:
						return None

				# Resolve all element object IDs to backend node IDs in parallel
				backend_ids = await asyncio.gather(*[get_backend_node_id(oid) for oid in element_object_ids])
				js_click_listener_backend_ids = {bid for bid in backend_ids if bid is not None}

				# Release the array object to avoid memory leaks
				try:
					await cdp_session.cdp_client.send.Runtime.releaseObject(
						params={'objectId': result_object_id},
						session_id=cdp_session.session_id,
					)
				except Exception:
					pass  # Best effort cleanup

				self.logger.debug(f'Detected {len(js_click_listener_backend_ids)} elements with JS click listeners')
		except Exception as e:
			self.logger.debug(f'Failed to detect JS event listeners: {e}')
		js_listener_detection_ms = (time.time() - start_js_listener_detection) * 1000

		# Define CDP request factories to avoid duplication
		def create_snapshot_request():
			return cdp_session.cdp_client.send.DOMSnapshot.captureSnapshot(
				params={
					'computedStyles': REQUIRED_COMPUTED_STYLES,
					'includePaintOrder': True,
					'includeDOMRects': True,
					'includeBlendedBackgroundColors': False,
					'includeTextColorOpacities': False,
				},
				session_id=cdp_session.session_id,
			)

		def create_dom_tree_request():
			return cdp_session.cdp_client.send.DOM.getDocument(
				params={'depth': -1, 'pierce': True}, session_id=cdp_session.session_id
			)

		start_cdp_calls = time.time()

		# Create initial tasks
		tasks = {
			'snapshot': create_task_with_error_handling(create_snapshot_request(), name='get_snapshot'),
			'dom_tree': create_task_with_error_handling(create_dom_tree_request(), name='get_dom_tree'),
			'ax_tree': create_task_with_error_handling(self._get_ax_tree_for_all_frames(target_id), name='get_ax_tree'),
			'device_pixel_ratio': create_task_with_error_handling(self._get_viewport_ratio(target_id), name='get_viewport_ratio'),
		}

		# Wait for all tasks with timeout
		done, pending = await asyncio.wait(tasks.values(), timeout=10.0)

		# Retry any failed or timed out tasks
		if pending:
			for task in pending:
				task.cancel()

			# Retry mapping for pending tasks
			retry_map = {
				tasks['snapshot']: lambda: create_task_with_error_handling(create_snapshot_request(), name='get_snapshot_retry'),
				tasks['dom_tree']: lambda: create_task_with_error_handling(create_dom_tree_request(), name='get_dom_tree_retry'),
				tasks['ax_tree']: lambda: create_task_with_error_handling(
					self._get_ax_tree_for_all_frames(target_id), name='get_ax_tree_retry'
				),
				tasks['device_pixel_ratio']: lambda: create_task_with_error_handling(
					self._get_viewport_ratio(target_id), name='get_viewport_ratio_retry'
				),
			}

			# Create new tasks only for the ones that didn't complete
			for key, task in tasks.items():
				if task in pending and task in retry_map:
					tasks[key] = retry_map[task]()

			# Wait again with shorter timeout
			done2, pending2 = await asyncio.wait([t for t in tasks.values() if not t.done()], timeout=2.0)

			if pending2:
				for task in pending2:
					task.cancel()

		# Extract results, tracking which ones failed
		results = {}
		failed = []
		for key, task in tasks.items():
			if task.done() and not task.cancelled():
				try:
					results[key] = task.result()
				except Exception as e:
					self.logger.warning(f'CDP request {key} failed with exception: {e}')
					failed.append(key)
			else:
				self.logger.warning(f'CDP request {key} timed out')
				failed.append(key)

		# If any required tasks failed, raise an exception
		if failed:
			raise TimeoutError(f'CDP requests failed or timed out: {", ".join(failed)}')

		snapshot = results['snapshot']
		dom_tree = results['dom_tree']
		ax_tree = results['ax_tree']
		device_pixel_ratio = results['device_pixel_ratio']
		end_cdp_calls = time.time()
		cdp_calls_ms = (end_cdp_calls - start_cdp_calls) * 1000

		# Calculate total time for _get_all_trees and overhead
		start_snapshot_processing = time.time()

		# DEBUG: Log snapshot info and limit documents to prevent explosion
		if snapshot and 'documents' in snapshot:
			original_doc_count = len(snapshot['documents'])
			# Limit to max_iframes documents to prevent iframe explosion
			if original_doc_count > self.max_iframes:
				self.logger.warning(
					f'⚠️ Limiting processing of {original_doc_count} iframes on page to only first {self.max_iframes} to prevent crashes!'
				)
				snapshot['documents'] = snapshot['documents'][: self.max_iframes]

			total_nodes = sum(len(doc.get('nodes', [])) for doc in snapshot['documents'])
			self.logger.debug(f'🔍 DEBUG: Snapshot contains {len(snapshot["documents"])} frames with {total_nodes} total nodes')
			# Log iframe-specific info
			for doc_idx, doc in enumerate(snapshot['documents']):
				if doc_idx > 0:  # Not the main document
					self.logger.debug(
						f'🔍 DEBUG: Iframe #{doc_idx} {doc.get("frameId", "no-frame-id")} {doc.get("url", "no-url")} has {len(doc.get("nodes", []))} nodes'
					)

		snapshot_processing_ms = (time.time() - start_snapshot_processing) * 1000

		# Return with detailed timing breakdown
		return TargetAllTrees(
			snapshot=snapshot,
			dom_tree=dom_tree,
			ax_tree=ax_tree,
			device_pixel_ratio=device_pixel_ratio,
			cdp_timing={
				'iframe_scroll_detection_ms': iframe_scroll_ms,
				'js_listener_detection_ms': js_listener_detection_ms,
				'cdp_parallel_calls_ms': cdp_calls_ms,
				'snapshot_processing_ms': snapshot_processing_ms,
			},
			js_click_listener_backend_ids=js_click_listener_backend_ids if js_click_listener_backend_ids else None,
		)

	@observe_debug(ignore_input=True, ignore_output=True, name='get_dom_tree')
	async def get_dom_tree(
		self,
		target_id: TargetID,
		all_frames: dict | None = None,
		initial_html_frames: list[EnhancedDOMTreeNode] | None = None,
		initial_total_frame_offset: DOMRect | None = None,
		iframe_depth: int = 0,
	) -> tuple[EnhancedDOMTreeNode, dict[str, float]]:
		"""Get the DOM tree for a specific target.

		Args:
			target_id: Target ID of the page to get the DOM tree for.
			all_frames: Pre-fetched frame hierarchy to avoid redundant CDP calls (optional, lazy fetch if None)
			initial_html_frames: List of HTML frame nodes encountered so far
			initial_total_frame_offset: Accumulated coordinate offset
			iframe_depth: Current depth of iframe nesting to prevent infinite recursion

		Returns:
			Tuple of (enhanced_dom_tree_node, timing_info)
		"""
		timing_info: dict[str, float] = {}
		timing_start_total = time.time()

		# Get all trees from CDP (snapshot, DOM, AX, viewport ratio)
		start_get_trees = time.time()
		trees = await self._get_all_trees(target_id)
		get_trees_ms = (time.time() - start_get_trees) * 1000
		timing_info.update(trees.cdp_timing)
		timing_info['get_all_trees_total_ms'] = get_trees_ms

		dom_tree = trees.dom_tree
		ax_tree = trees.ax_tree
		snapshot = trees.snapshot
		device_pixel_ratio = trees.device_pixel_ratio
		js_click_listener_backend_ids = trees.js_click_listener_backend_ids or set()

		# Build AX tree lookup
		start_ax = time.time()
		ax_tree_lookup: dict[int, AXNode] = {
			ax_node['backendDOMNodeId']: ax_node for ax_node in ax_tree['nodes'] if 'backendDOMNodeId' in ax_node
		}
		timing_info['build_ax_lookup_ms'] = (time.time() - start_ax) * 1000

		enhanced_dom_tree_node_lookup: dict[int, EnhancedDOMTreeNode] = {}
		""" NodeId (NOT backend node id) -> enhanced dom tree node"""  # way to get the parent/content node

		# Parse snapshot data with everything calculated upfront
		start_snapshot = time.time()
		snapshot_lookup = build_snapshot_lookup(snapshot, device_pixel_ratio)
		timing_info['build_snapshot_lookup_ms'] = (time.time() - start_snapshot) * 1000

		async def _construct_enhanced_node(
			node: Node,
			html_frames: list[EnhancedDOMTreeNode] | None,
			total_frame_offset: DOMRect | None,
			all_frames: dict | None,
		) -> EnhancedDOMTreeNode:
			"""
			Recursively construct enhanced DOM tree nodes.

			Args:
				node: The DOM node to construct
				html_frames: List of HTML frame nodes encountered so far
				total_frame_offset: Accumulated coordinate translation from parent iframes (includes scroll corrections)
				all_frames: Pre-fetched frame hierarchy to avoid redundant CDP calls
			"""

			# Initialize lists if not provided
			if html_frames is None:
				html_frames = []

			# to get rid of the pointer references
			if total_frame_offset is None:
				total_frame_offset = DOMRect(x=0.0, y=0.0, width=0.0, height=0.0)
			else:
				total_frame_offset = DOMRect(
					total_frame_offset.x, total_frame_offset.y, total_frame_offset.width, total_frame_offset.height
				)

			# memoize the mf (I don't know if some nodes are duplicated)
			if node['nodeId'] in enhanced_dom_tree_node_lookup:
				return enhanced_dom_tree_node_lookup[node['nodeId']]

			ax_node = ax_tree_lookup.get(node['backendNodeId'])
			if ax_node:
				enhanced_ax_node = self._build_enhanced_ax_node(ax_node)
			else:
				enhanced_ax_node = None

			# To make attributes more readable
			attributes: dict[str, str] | None = None
			if 'attributes' in node and node['attributes']:
				attributes = {}
				for i in range(0, len(node['attributes']), 2):
					attributes[node['attributes'][i]] = node['attributes'][i + 1]

			shadow_root_type = None
			if 'shadowRootType' in node and node['shadowRootType']:
				try:
					shadow_root_type = node['shadowRootType']
				except ValueError:
					pass

			# Get snapshot data and calculate absolute position
			snapshot_data = snapshot_lookup.get(node['backendNodeId'], None)

			# DIAGNOSTIC: Log when interactive elements don't have snapshot data
			if not snapshot_data and node['nodeName'].upper() in ['INPUT', 'BUTTON', 'SELECT', 'TEXTAREA', 'A']:
				parent_has_shadow = False
				parent_info = ''
				if 'parentId' in node and node['parentId'] in enhanced_dom_tree_node_lookup:
					parent = enhanced_dom_tree_node_lookup[node['parentId']]
					if parent.shadow_root_type:
						parent_has_shadow = True
						parent_info = f'parent={parent.tag_name}(shadow={parent.shadow_root_type})'
				attr_str = ''
				if 'attributes' in node and node['attributes']:
					attrs_dict = {node['attributes'][i]: node['attributes'][i + 1] for i in range(0, len(node['attributes']), 2)}
					attr_str = f'name={attrs_dict.get("name", "N/A")} id={attrs_dict.get("id", "N/A")}'
				self.logger.debug(
					f'🔍 NO SNAPSHOT DATA for <{node["nodeName"]}> backendNodeId={node["backendNodeId"]} '
					f'{attr_str} {parent_info} (snapshot_lookup has {len(snapshot_lookup)} entries)'
				)

			absolute_position = None
			if snapshot_data and snapshot_data.bounds:
				absolute_position = DOMRect(
					x=snapshot_data.bounds.x + total_frame_offset.x,
					y=snapshot_data.bounds.y + total_frame_offset.y,
					width=snapshot_data.bounds.width,
					height=snapshot_data.bounds.height,
				)

			try:
				session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
				session_id = session.session_id
			except ValueError:
				# Target may have detached during DOM construction
				session_id = None

			dom_tree_node = EnhancedDOMTreeNode(
				node_id=node['nodeId'],
				backend_node_id=node['backendNodeId'],
				node_type=NodeType(node['nodeType']),
				node_name=node['nodeName'],
				node_value=node['nodeValue'],
				attributes=attributes or {},
				is_scrollable=node.get('isScrollable', None),
				frame_id=node.get('frameId', None),
				session_id=session_id,
				target_id=target_id,
				content_document=None,
				shadow_root_type=shadow_root_type,
				shadow_roots=None,
				parent_node=None,
				children_nodes=None,
				ax_node=enhanced_ax_node,
				snapshot_node=snapshot_data,
				is_visible=None,
				has_js_click_listener=node['backendNodeId'] in js_click_listener_backend_ids,
				absolute_position=absolute_position,
			)

			enhanced_dom_tree_node_lookup[node['nodeId']] = dom_tree_node

			if 'parentId' in node and node['parentId']:
				dom_tree_node.parent_node = enhanced_dom_tree_node_lookup[
					node['parentId']
				]  # parents should always be in the lookup

			# Check if this is an HTML frame node and add it to the list
			updated_html_frames = html_frames.copy()
			if node['nodeType'] == NodeType.ELEMENT_NODE.value and node['nodeName'] == 'HTML' and node.get('frameId') is not None:
				updated_html_frames.append(dom_tree_node)

				# and adjust the total frame offset by scroll
				if snapshot_data and snapshot_data.scrollRects:
					total_frame_offset.x -= snapshot_data.scrollRects.x
					total_frame_offset.y -= snapshot_data.scrollRects.y
					# DEBUG: Log iframe scroll information
					self.logger.debug(
						f'🔍 DEBUG: HTML frame scroll - scrollY={snapshot_data.scrollRects.y}, scrollX={snapshot_data.scrollRects.x}, frameId={node.get("frameId")}, nodeId={node["nodeId"]}'
					)

			# Calculate new iframe offset for content documents, accounting for iframe scroll
			if (
				(node['nodeName'].upper() == 'IFRAME' or node['nodeName'].upper() == 'FRAME')
				and snapshot_data
				and snapshot_data.bounds
			):
				if snapshot_data.bounds:
					updated_html_frames.append(dom_tree_node)

					total_frame_offset.x += snapshot_data.bounds.x
					total_frame_offset.y += snapshot_data.bounds.y

			if 'contentDocument' in node and node['contentDocument']:
				dom_tree_node.content_document = await _construct_enhanced_node(
					node['contentDocument'], updated_html_frames, total_frame_offset, all_frames
				)
				dom_tree_node.content_document.parent_node = dom_tree_node
				# forcefully set the parent node to the content document node (helps traverse the tree)

			if 'shadowRoots' in node and node['shadowRoots']:
				dom_tree_node.shadow_roots = []
				for shadow_root in node['shadowRoots']:
					shadow_root_node = await _construct_enhanced_node(
						shadow_root, updated_html_frames, total_frame_offset, all_frames
					)
					# forcefully set the parent node to the shadow root node (helps traverse the tree)
					shadow_root_node.parent_node = dom_tree_node
					dom_tree_node.shadow_roots.append(shadow_root_node)

			if 'children' in node and node['children']:
				dom_tree_node.children_nodes = []
				# Build set of shadow root node IDs to filter them out from children
				shadow_root_node_ids = set()
				if 'shadowRoots' in node and node['shadowRoots']:
					for shadow_root in node['shadowRoots']:
						shadow_root_node_ids.add(shadow_root['nodeId'])

				for child in node['children']:
					# Skip shadow roots - they should only be in shadow_roots list
					if child['nodeId'] in shadow_root_node_ids:
						continue
					dom_tree_node.children_nodes.append(
						await _construct_enhanced_node(child, updated_html_frames, total_frame_offset, all_frames)
					)

			# Set visibility using the collected HTML frames and viewport threshold
			dom_tree_node.is_visible = self.is_element_visible_according_to_all_parents(
				dom_tree_node, updated_html_frames, self.viewport_threshold
			)

			# DEBUG: Log visibility info for form elements in iframes
			if dom_tree_node.tag_name and dom_tree_node.tag_name.upper() in ['INPUT', 'SELECT', 'TEXTAREA', 'LABEL']:
				attrs = dom_tree_node.attributes or {}
				elem_id = attrs.get('id', '')
				elem_name = attrs.get('name', '')
				if (
					'city' in elem_id.lower()
					or 'city' in elem_name.lower()
					or 'state' in elem_id.lower()
					or 'state' in elem_name.lower()
					or 'zip' in elem_id.lower()
					or 'zip' in elem_name.lower()
				):
					self.logger.debug(
						f"🔍 DEBUG: Form element {dom_tree_node.tag_name} id='{elem_id}' name='{elem_name}' - visible={dom_tree_node.is_visible}, bounds={dom_tree_node.snapshot_node.bounds if dom_tree_node.snapshot_node else 'NO_SNAPSHOT'}"
					)

			# handle cross origin iframe (just recursively call the main function with the proper target if it exists in iframes)
			# only do this if the iframe is visible (otherwise it's not worth it)

			if (
				# TODO: hacky way to disable cross origin iframes for now
				self.cross_origin_iframes and node['nodeName'].upper() == 'IFRAME' and node.get('contentDocument', None) is None
			):  # None meaning there is no content
				# Check iframe depth to prevent infinite recursion
				if iframe_depth >= self.max_iframe_depth:
					self.logger.debug(
						f'Skipping iframe at depth {iframe_depth} to prevent infinite recursion (max depth: {self.max_iframe_depth})'
					)
				else:
					# Check if iframe is visible and large enough (>= 50px in both dimensions)
					should_process_iframe = False

					# First check if the iframe element itself is visible
					if dom_tree_node.is_visible:
						# Check iframe dimensions
						if dom_tree_node.snapshot_node and dom_tree_node.snapshot_node.bounds:
							bounds = dom_tree_node.snapshot_node.bounds
							width = bounds.width
							height = bounds.height

							# Only process if iframe is at least 50px in both dimensions
							if width >= 50 and height >= 50:
								should_process_iframe = True
								self.logger.debug(f'Processing cross-origin iframe: visible=True, width={width}, height={height}')
							else:
								self.logger.debug(
									f'Skipping small cross-origin iframe: width={width}, height={height} (needs >= 50px)'
								)
						else:
							self.logger.debug('Skipping cross-origin iframe: no bounds available')
					else:
						self.logger.debug('Skipping invisible cross-origin iframe')

					if should_process_iframe:
						# Lazy fetch all_frames only when actually needed (for cross-origin iframes)
						if all_frames is None:
							all_frames, _ = await self.browser_session.get_all_frames()

						# Use pre-fetched all_frames to find the iframe's target (no redundant CDP call)
						frame_id = node.get('frameId', None)

						# Fallback: if frameId is missing or not in all_frames, try URL matching via
						# the src attribute. This handles dynamically-injected iframes (e.g. HubSpot
						# popups, chat widgets) where Chrome hasn't yet registered the frameId in the
						# frame tree at DOM-snapshot time.
						if (not frame_id or frame_id not in all_frames) and attributes:
							src = attributes.get('src', '')
							if src:
								src_base = src.split('?')[0].rstrip('/')
								for fid, finfo in all_frames.items():
									frame_url = finfo.get('url', '').split('?')[0].rstrip('/')
									if frame_url and frame_url == src_base:
										frame_id = fid
										self.logger.debug(f'Matched cross-origin iframe by src URL: {src!r} -> frameId={fid}')
										break

						iframe_document_target = None
						if frame_id:
							frame_info = all_frames.get(frame_id)
							if frame_info and frame_info.get('frameTargetId'):
								iframe_target_id = frame_info['frameTargetId']
								# Use frameTargetId directly from all_frames — get_all_frames() already
								# validated connectivity. Do NOT gate on session_manager.get_target():
								# there is a race where _target_sessions is set (inside the lock in
								# _handle_target_attached) before _targets is populated (outside the
								# lock), so get_target() can transiently return None for a live target.
								iframe_target = self.browser_session.session_manager.get_target(iframe_target_id)
								iframe_document_target = {
									'targetId': iframe_target_id,
									'url': iframe_target.url if iframe_target else frame_info.get('url', ''),
									'title': iframe_target.title if iframe_target else frame_info.get('title', ''),
									'type': iframe_target.target_type if iframe_target else 'iframe',
								}

						# if target actually exists in one of the frames, just recursively build the dom tree for it
						if iframe_document_target:
							self.logger.debug(
								f'Getting content document for iframe {node.get("frameId", None)} at depth {iframe_depth + 1}'
							)
							try:
								content_document, _ = await self.get_dom_tree(
									target_id=iframe_document_target['targetId'],
									all_frames=all_frames,
									# Current config: if the cross origin iframe is AT ALL visible, include everything inside it
									initial_total_frame_offset=total_frame_offset,
									iframe_depth=iframe_depth + 1,
								)
								dom_tree_node.content_document = content_document
								dom_tree_node.content_document.parent_node = dom_tree_node
							except Exception as e:
								self.logger.debug(f'Failed to get DOM tree for cross-origin iframe {frame_id}: {e}')

			return dom_tree_node

		# Build enhanced DOM tree recursively
		# Note: all_frames stays None and will be lazily fetched inside _construct_enhanced_node
		# only if/when a cross-origin iframe is encountered
		start_construct = time.time()
		enhanced_dom_tree_node = await _construct_enhanced_node(
			dom_tree['root'], initial_html_frames, initial_total_frame_offset, all_frames
		)
		timing_info['construct_enhanced_tree_ms'] = (time.time() - start_construct) * 1000

		# Count hidden elements per iframe for LLM hints
		self._count_hidden_elements_in_iframes(enhanced_dom_tree_node)

		# Calculate total time for get_dom_tree
		total_get_dom_tree_ms = (time.time() - timing_start_total) * 1000
		timing_info['get_dom_tree_total_ms'] = total_get_dom_tree_ms

		# Calculate overhead in get_dom_tree (time not accounted for by sub-operations)
		tracked_sub_operations_ms = (
			timing_info.get('get_all_trees_total_ms', 0)
			+ timing_info.get('build_ax_lookup_ms', 0)
			+ timing_info.get('build_snapshot_lookup_ms', 0)
			+ timing_info.get('construct_enhanced_tree_ms', 0)
		)
		get_dom_tree_overhead_ms = total_get_dom_tree_ms - tracked_sub_operations_ms
		if get_dom_tree_overhead_ms > 0.1:
			timing_info['get_dom_tree_overhead_ms'] = get_dom_tree_overhead_ms

		return enhanced_dom_tree_node, timing_info

	@observe_debug(ignore_input=True, ignore_output=True, name='get_serialized_dom_tree')
	async def get_serialized_dom_tree(
		self, previous_cached_state: SerializedDOMState | None = None
	) -> tuple[SerializedDOMState, EnhancedDOMTreeNode, dict[str, float]]:
		"""Get the serialized DOM tree representation for LLM consumption.

		Returns:
			Tuple of (serialized_dom_state, enhanced_dom_tree_root, timing_info)
		"""
		timing_info: dict[str, float] = {}
		start_total = time.time()

		# Use current target (None means use current)
		assert self.browser_session.agent_focus_target_id is not None

		session_id = self.browser_session.id

		# Build DOM tree (includes CDP calls for snapshot, DOM, AX tree)
		# Note: all_frames is fetched lazily inside get_dom_tree only if cross-origin iframes need it
		enhanced_dom_tree, dom_tree_timing = await self.get_dom_tree(
			target_id=self.browser_session.agent_focus_target_id,
			all_frames=None,  # Lazy - will fetch if needed
		)

		# Add sub-timings from DOM tree construction
		timing_info.update(dom_tree_timing)

		# Serialize DOM tree for LLM
		start_serialize = time.time()

		serialized_dom_state, serializer_timing = DOMTreeSerializer(
			enhanced_dom_tree, previous_cached_state, paint_order_filtering=self.paint_order_filtering, session_id=session_id
		).serialize_accessible_elements()
		total_serialization_ms = (time.time() - start_serialize) * 1000

		# Add serializer sub-timings (convert to ms)
		for key, value in serializer_timing.items():
			timing_info[f'{key}_ms'] = value * 1000

		# Calculate untracked time in serialization
		tracked_serialization_ms = sum(value * 1000 for value in serializer_timing.values())
		serialization_overhead_ms = total_serialization_ms - tracked_serialization_ms
		if serialization_overhead_ms > 0.1:  # Only log if significant
			timing_info['serialization_overhead_ms'] = serialization_overhead_ms

		# Calculate total time for get_serialized_dom_tree
		total_get_serialized_dom_tree_ms = (time.time() - start_total) * 1000
		timing_info['get_serialized_dom_tree_total_ms'] = total_get_serialized_dom_tree_ms

		# Calculate overhead in get_serialized_dom_tree (time not accounted for)
		tracked_major_operations_ms = timing_info.get('get_dom_tree_total_ms', 0) + total_serialization_ms
		get_serialized_overhead_ms = total_get_serialized_dom_tree_ms - tracked_major_operations_ms
		if get_serialized_overhead_ms > 0.1:
			timing_info['get_serialized_dom_tree_overhead_ms'] = get_serialized_overhead_ms

		return serialized_dom_state, enhanced_dom_tree, timing_info

	@staticmethod
	def detect_pagination_buttons(selector_map: dict[int, EnhancedDOMTreeNode]) -> list[dict[str, str | int | bool]]:
		"""Detect pagination buttons from the selector map.

		Args:
			selector_map: Map of element indices to EnhancedDOMTreeNode

		Returns:
			List of pagination button information dicts with:
			- button_type: 'next', 'prev', 'first', 'last', 'page_number'
			- backend_node_id: Backend node ID for clicking
			- text: Button text/label
			- selector: XPath selector
			- is_disabled: Whether the button appears disabled
		"""
		pagination_buttons: list[dict[str, str | int | bool]] = []

		# Common pagination patterns to look for
		# `«` and `»` are ambiguous across sites, so treat them only as prev/next
		# fallback symbols and let word-based first/last signals win
		next_patterns = ['next', '>', '»', '→', 'siguiente', 'suivant', 'weiter', 'volgende']
		prev_patterns = ['prev', 'previous', '<', '«', '←', 'anterior', 'précédent', 'zurück', 'vorige']
		first_patterns = ['first', '⇤', 'primera', 'première', 'erste', 'eerste']
		last_patterns = ['last', '⇥', 'última', 'dernier', 'letzte', 'laatste']

		for index, node in selector_map.items():
			# Skip non-clickable elements
			if not node.snapshot_node or not node.snapshot_node.is_clickable:
				continue

			# Get element text and attributes
			text = node.get_all_children_text().lower().strip()
			aria_label = node.attributes.get('aria-label', '').lower()
			title = node.attributes.get('title', '').lower()
			class_name = node.attributes.get('class', '').lower()
			role = node.attributes.get('role', '').lower()

			# Combine all text sources for pattern matching
			all_text = f'{text} {aria_label} {title} {class_name}'.strip()

			# Check if it's disabled
			is_disabled = (
				node.attributes.get('disabled') == 'true'
				or node.attributes.get('aria-disabled') == 'true'
				or 'disabled' in class_name
			)

			button_type: str | None = None

			# Match specific first/last semantics before generic prev/next fallbacks.
			if any(pattern in all_text for pattern in first_patterns):
				button_type = 'first'
			# Check for last button
			elif any(pattern in all_text for pattern in last_patterns):
				button_type = 'last'
			# Check for next button
			elif any(pattern in all_text for pattern in next_patterns):
				button_type = 'next'
			# Check for previous button
			elif any(pattern in all_text for pattern in prev_patterns):
				button_type = 'prev'
			# Check for numeric page buttons (single or double digit)
			elif text.isdigit() and len(text) <= 2 and role in ['button', 'link', '']:
				button_type = 'page_number'

			if button_type:
				pagination_buttons.append(
					{
						'button_type': button_type,
						'backend_node_id': index,
						'text': node.get_all_children_text().strip() or aria_label or title,
						'selector': node.xpath,
						'is_disabled': is_disabled,
					}
				)

		return pagination_buttons