Merge branch 'main' into oracle_oci_integration

2026-05-06 17:52:15 +02:00 · 2025-09-18 00:20:39 +05:30
parent b3dc3151dc c777fd9402
commit bb1e9bc4ec
8 changed files with 216 additions and 77 deletions
--- a/browser_use/agent/prompts.py
+++ b/browser_use/agent/prompts.py
@@ -2,6 +2,7 @@ import importlib.resources
 from datetime import datetime
 from typing import TYPE_CHECKING, Literal, Optional

+from browser_use.dom.views import NodeType, SimplifiedNode
 from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL, SystemMessage, UserMessage
 from browser_use.observability import observe_debug
 from browser_use.utils import is_new_tab_page
@@ -112,8 +113,93 @@ class AgentMessagePrompt:
 		self.sample_images = sample_images or []
 		assert self.browser_state

+	def _extract_page_statistics(self) -> dict[str, int]:
+		"""Count links, iframes, images, scroll containers, shadow hosts, interactive nodes and total nodes in the simplified DOM tree (all zeros when there is no DOM state or root)"""
+		stats = {
+			'links': 0,
+			'iframes': 0,
+			'shadow_open': 0,
+			'shadow_closed': 0,
+			'scroll_containers': 0,
+			'images': 0,
+			'interactive_elements': 0,
+			'total_elements': 0,
+		}
+
+		if not self.browser_state.dom_state or not self.browser_state.dom_state._root:
+			return stats
+
+		def traverse_node(node: SimplifiedNode) -> None:
+			"""Recursively walk the simplified tree, updating the enclosing stats dict in place; a node without original_node is skipped together with its subtree"""
+			if not node or not node.original_node:
+				return
+
+			original = node.original_node
+			stats['total_elements'] += 1
+
+			# Per-tag, scroll, interactive and shadow counters apply to element nodes only; total_elements above counts every traversed node
+			if original.node_type == NodeType.ELEMENT_NODE:
+				tag = original.tag_name.lower() if original.tag_name else ''
+
+				if tag == 'a':
+					stats['links'] += 1
+				elif tag in ('iframe', 'frame'):
+					stats['iframes'] += 1
+				elif tag == 'img':
+					stats['images'] += 1
+
+				# Scrollable elements are counted independently of their tag
+				if original.is_actually_scrollable:
+					stats['scroll_containers'] += 1
+
+				# Counted as interactive when interactive_index is set (not None)
+				if node.interactive_index is not None:
+					stats['interactive_elements'] += 1
+
+				# Check if this element hosts shadow DOM
+				if node.is_shadow_host:
+					# A host with at least one closed shadow-root child counts as closed; otherwise it counts as open
+					has_closed_shadow = any(
+						child.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE
+						and child.original_node.shadow_root_type
+						and child.original_node.shadow_root_type.lower() == 'closed'
+						for child in node.children
+					)
+					if has_closed_shadow:
+						stats['shadow_closed'] += 1
+					else:
+						stats['shadow_open'] += 1
+
+			elif original.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
+				# Shadow DOM fragment - these are the actual shadow roots
+				# But don't double-count since we count them at the host level above
+				pass
+
+			# Recurse into children regardless of this node's type
+			for child in node.children:
+				traverse_node(child)
+
+		traverse_node(self.browser_state.dom_state._root)
+		return stats
+
 	@observe_debug(ignore_input=True, ignore_output=True, name='_get_browser_state_description')
 	def _get_browser_state_description(self) -> str:
+		# Extract page statistics first
+		page_stats = self._extract_page_statistics()
+
+		# Format statistics for LLM
+		stats_text = '<page_stats>'
+		if page_stats['total_elements'] < 10:
+			stats_text += 'Page appears empty (SPA not loaded?) - '
+		stats_text += f'{page_stats["links"]} links, {page_stats["interactive_elements"]} interactive, '
+		stats_text += f'{page_stats["iframes"]} iframes, {page_stats["scroll_containers"]} scroll containers'
+		if page_stats['shadow_open'] > 0 or page_stats['shadow_closed'] > 0:
+			stats_text += f', {page_stats["shadow_open"]} shadow(open), {page_stats["shadow_closed"]} shadow(closed)'
+		if page_stats['images'] > 0:
+			stats_text += f', {page_stats["images"]} images'
+		stats_text += f', {page_stats["total_elements"]} total elements'
+		stats_text += '</page_stats>\n\n'
+
 		elements_text = self.browser_state.dom_state.llm_representation(include_attributes=self.include_attributes)

 		if len(elements_text) > self.max_clickable_elements_length:
@@ -122,9 +208,8 @@ class AgentMessagePrompt:
 		else:
 			truncated_text = ''

-		has_content_above = (self.browser_state.pixels_above or 0) > 0
-		has_content_below = (self.browser_state.pixels_below or 0) > 0
-
+		has_content_above = False
+		has_content_below = False
 		# Enhanced page information for the model
 		page_info_text = ''
 		if self.browser_state.page_info:
@@ -132,10 +217,11 @@ class AgentMessagePrompt:
 			# Compute page statistics dynamically
 			pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0
 			pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
+			has_content_above = pages_above > 0
+			has_content_below = pages_below > 0
 			total_pages = pi.page_height / pi.viewport_height if pi.viewport_height > 0 else 0
 			current_page_position = pi.scroll_y / max(pi.page_height - pi.viewport_height, 1)
 			page_info_text = '<page_info>'
-			page_info_text += f'Viewport size: {pi.viewport_width}x{pi.viewport_height}px, Total page size: {pi.page_width}x{pi.page_height}px, '
 			page_info_text += f'{pages_above:.1f} pages above, '
 			page_info_text += f'{pages_below:.1f} pages below, '
 			page_info_text += f'{total_pages:.1f} total pages'
@@ -146,18 +232,14 @@ class AgentMessagePrompt:
 				if self.browser_state.page_info:
 					pi = self.browser_state.page_info
 					pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0
-					elements_text = f'... {self.browser_state.pixels_above} pixels above ({pages_above:.1f} pages) - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}'
-				else:
-					elements_text = f'... {self.browser_state.pixels_above} pixels above - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}'
+					elements_text = f'... {pages_above:.1f} pages above - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}'
 			else:
 				elements_text = f'[Start of page]\n{elements_text}'
 			if has_content_below:
 				if self.browser_state.page_info:
 					pi = self.browser_state.page_info
 					pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
-					elements_text = f'{elements_text}\n... {self.browser_state.pixels_below} pixels below ({pages_below:.1f} pages) - scroll to see more or extract structured data if you are looking for specific information ...'
-				else:
-					elements_text = f'{elements_text}\n... {self.browser_state.pixels_below} pixels below - scroll to see more or extract structured data if you are looking for specific information ...'
+					elements_text = f'{elements_text}\n... {pages_below:.1f} pages below - scroll to see more or extract structured data if you are looking for specific information ...'
 			else:
 				elements_text = f'{elements_text}\n[End of page]'
 		else:
@@ -190,7 +272,7 @@ class AgentMessagePrompt:
 		if self.include_recent_events and self.browser_state.recent_events:
 			recent_events_text = f'Recent browser events: {self.browser_state.recent_events}\n'

-		browser_state = f"""{current_tab_text}
+		browser_state = f"""{stats_text}{current_tab_text}
 Available tabs:
 {tabs_text}
 {page_info_text}
@@ -205,9 +287,6 @@ Available tabs:
 		else:
 			step_info_description = ''

-		time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
-		step_info_description += f'Current date and time: {time_str}'
-
 		time_str = datetime.now().strftime('%Y-%m-%d')
 		step_info_description += f'Current date: {time_str}'

--- a/browser_use/browser/watchdogs/default_action_watchdog.py
+++ b/browser_use/browser/watchdogs/default_action_watchdog.py
@@ -71,7 +71,7 @@ class DefaultActionWatchdog(BaseWatchdog):
 				msg = f'Downloaded file to {download_path}'
 				self.logger.info(f'💾 {msg}')
 			else:
-				msg = f'Clicked button with index {index_for_logging}: {element_node.get_all_children_text(max_depth=2)}'
+				msg = f'Clicked button {element_node.node_name}: {element_node.get_all_children_text(max_depth=2)}'
 				self.logger.debug(f'🖱️ {msg}')
 			self.logger.debug(f'Element xpath: {element_node.xpath}')

@@ -1912,7 +1912,7 @@ class DefaultActionWatchdog(BaseWatchdog):
 			self.logger.error(msg)
 			raise BrowserError(message=msg, long_term_memory=msg)
 		except Exception as e:
-			msg = f'Failed to get dropdown options for element with index {index_for_logging}'
+			msg = 'Failed to get dropdown options'
 			error_msg = f'{msg}: {str(e)}'
 			self.logger.error(error_msg)
 			raise BrowserError(
--- a/browser_use/dom/serializer/serializer.py
+++ b/browser_use/dom/serializer/serializer.py
@@ -137,13 +137,16 @@ class DOMTreeSerializer:
 			return None

 		if node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
-			# Super simple pass-through for shadow DOM elements
+			# ENHANCED shadow DOM processing - always include shadow content
 			simplified = SimplifiedNode(original_node=node, children=[])
 			for child in node.children_and_shadow_roots:
 				simplified_child = self._create_simplified_tree(child, depth + 1)
 				if simplified_child:
 					simplified.children.append(simplified_child)
-			return simplified
+
+			# Always return shadow DOM fragments, even if children seem empty
+			# Shadow DOM often contains the actual interactive content in SPAs
+			return simplified if simplified.children else SimplifiedNode(original_node=node, children=[])

 		elif node.node_type == NodeType.ELEMENT_NODE:
 			# Skip non-content elements
@@ -161,19 +164,26 @@ class DOMTreeSerializer:

 			is_visible = node.is_visible
 			is_scrollable = node.is_actually_scrollable
+			has_shadow_content = bool(node.children_and_shadow_roots)

-			# Include if interactive (regardless of visibility), or scrollable, or has children to process
+			# ENHANCED SHADOW DOM DETECTION: Include shadow hosts even if not visible
+			is_shadow_host = any(child.node_type == NodeType.DOCUMENT_FRAGMENT_NODE for child in node.children_and_shadow_roots)

-			if is_visible or is_scrollable or bool(node.children_and_shadow_roots):
-				simplified = SimplifiedNode(original_node=node, children=[])
-				# simplified._analysis = analysis  # Store analysis for grouping
+			# Include if visible, scrollable, has children/shadow roots, or is a shadow host
+			if is_visible or is_scrollable or has_shadow_content or is_shadow_host:
+				simplified = SimplifiedNode(original_node=node, children=[], is_shadow_host=is_shadow_host)

-				# Process children
+				# Process all children, including shadow roots
 				for child in node.children_and_shadow_roots:
 					simplified_child = self._create_simplified_tree(child, depth + 1)
 					if simplified_child:
 						simplified.children.append(simplified_child)

+				# SHADOW DOM SPECIAL CASE: Keep shadow hosts that retained children even if not visible
+				# Many SPA frameworks (React, Vue) render content in shadow DOM
+				if is_shadow_host and simplified.children:
+					return simplified
+
 				# Return if meaningful or has meaningful children
 				if is_visible or is_scrollable or simplified.children:
 					return simplified
@@ -449,23 +459,34 @@ class DOMTreeSerializer:
 				# Build attributes string
 				attributes_html_str = DOMTreeSerializer._build_attributes_string(node.original_node, include_attributes, '')

-				# Build the line
+				# Build the line with shadow host indicator
+				shadow_prefix = ''
+				if node.is_shadow_host:
+					# Check if any shadow children are closed
+					has_closed_shadow = any(
+						child.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE
+						and child.original_node.shadow_root_type
+						and child.original_node.shadow_root_type.lower() == 'closed'
+						for child in node.children
+					)
+					shadow_prefix = '|SHADOW(closed)|' if has_closed_shadow else '|SHADOW(open)|'
+
 				if should_show_scroll and node.interactive_index is None:
 					# Scrollable container but not clickable
-					line = f'{depth_str}|SCROLL|<{node.original_node.tag_name}'
+					line = f'{depth_str}{shadow_prefix}|SCROLL|<{node.original_node.tag_name}'
 				elif node.interactive_index is not None:
 					# Clickable (and possibly scrollable)
 					new_prefix = '*' if node.is_new else ''
 					scroll_prefix = '|SCROLL+' if should_show_scroll else '['
-					line = f'{depth_str}{new_prefix}{scroll_prefix}{node.interactive_index}]<{node.original_node.tag_name}'
+					line = f'{depth_str}{shadow_prefix}{new_prefix}{scroll_prefix}{node.interactive_index}]<{node.original_node.tag_name}'
 				elif node.original_node.tag_name.upper() == 'IFRAME':
 					# Iframe element (not interactive)
-					line = f'{depth_str}|IFRAME|<{node.original_node.tag_name}'
+					line = f'{depth_str}{shadow_prefix}|IFRAME|<{node.original_node.tag_name}'
 				elif node.original_node.tag_name.upper() == 'FRAME':
 					# Frame element (not interactive)
-					line = f'{depth_str}|FRAME|<{node.original_node.tag_name}'
+					line = f'{depth_str}{shadow_prefix}|FRAME|<{node.original_node.tag_name}'
 				else:
-					line = f'{depth_str}<{node.original_node.tag_name}'
+					line = f'{depth_str}{shadow_prefix}<{node.original_node.tag_name}'

 				if attributes_html_str:
 					line += f' {attributes_html_str}'
@@ -480,6 +501,25 @@ class DOMTreeSerializer:

 				formatted_text.append(line)

+		elif node.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
+			# Shadow DOM representation - show clearly to LLM
+			if node.original_node.shadow_root_type and node.original_node.shadow_root_type.lower() == 'closed':
+				formatted_text.append(f'{depth_str}▼ Shadow Content (Closed)')
+			else:
+				formatted_text.append(f'{depth_str}▼ Shadow Content (Open)')
+
+			next_depth += 1
+
+			# Process shadow DOM children
+			for child in node.children:
+				child_text = DOMTreeSerializer.serialize_tree(child, include_attributes, next_depth)
+				if child_text:
+					formatted_text.append(child_text)
+
+			# Close shadow DOM indicator
+			if node.children:  # Only show close if we had content
+				formatted_text.append(f'{depth_str}▲ Shadow Content End')
+
 		elif node.original_node.node_type == NodeType.TEXT_NODE:
 			# Include visible text
 			is_visible = node.original_node.snapshot_node and node.original_node.is_visible
@@ -492,11 +532,12 @@ class DOMTreeSerializer:
 				clean_text = node.original_node.node_value.strip()
 				formatted_text.append(f'{depth_str}{clean_text}')

-		# Process children
-		for child in node.children:
-			child_text = DOMTreeSerializer.serialize_tree(child, include_attributes, next_depth)
-			if child_text:
-				formatted_text.append(child_text)
+		# Process children (shadow-root fragments already serialized theirs in their own branch above)
+		if node.original_node.node_type != NodeType.DOCUMENT_FRAGMENT_NODE:
+			for child in node.children:
+				child_text = DOMTreeSerializer.serialize_tree(child, include_attributes, next_depth)
+				if child_text:
+					formatted_text.append(child_text)

 		return '\n'.join(formatted_text)

--- a/browser_use/dom/views.py
+++ b/browser_use/dom/views.py
@@ -19,6 +19,8 @@ DEFAULT_INCLUDE_ATTRIBUTES = [
 	'title',
 	'type',
 	'checked',
+	# 'class',
+	'id',
 	'name',
 	'role',
 	'value',
@@ -51,6 +53,51 @@ DEFAULT_INCLUDE_ATTRIBUTES = [
 	'ax_name',
 ]

+STATIC_ATTRIBUTES = {
+	'class',
+	'id',
+	'name',
+	'type',
+	'placeholder',
+	'aria-label',
+	'title',
+	# 'aria-expanded',
+	'role',
+	'data-testid',
+	'data-test',
+	'data-cy',
+	'data-selenium',
+	'for',
+	'required',
+	'disabled',
+	'readonly',
+	'checked',
+	'selected',
+	'multiple',
+	'href',
+	'target',
+	'rel',
+	'aria-describedby',
+	'aria-labelledby',
+	'aria-controls',
+	'aria-owns',
+	'aria-live',
+	'aria-atomic',
+	'aria-busy',
+	'aria-disabled',
+	'aria-hidden',
+	'aria-pressed',
+	'aria-checked',
+	'aria-selected',
+	'tabindex',
+	'alt',
+	'src',
+	'lang',
+	'itemscope',
+	'itemtype',
+	'itemprop',
+}
+

@dataclass
 class CurrentPageTargets:
@@ -93,6 +140,7 @@ class SimplifiedNode:

 	ignored_by_paint_order: bool = False  # More info in dom/serializer/paint_order.py
 	excluded_by_parent: bool = False  # New field for bbox filtering
+	is_shadow_host: bool = False  # New field for shadow DOM hosts

 	def _clean_original_node_json(self, node_json: dict) -> dict:
 		"""Recursively remove children_nodes and shadow_roots from original_node JSON."""
@@ -683,8 +731,9 @@ class EnhancedDOMTreeNode:
 		parent_branch_path = self._get_parent_branch_path()
 		parent_branch_path_string = '/'.join(parent_branch_path)

-		# Get attributes hash
-		attributes_string = ''.join(f'{key}={value}' for key, value in self.attributes.items())
+		attributes_string = ''.join(
+			f'{k}={v}' for k, v in sorted((k, v) for k, v in self.attributes.items() if k in STATIC_ATTRIBUTES)
+		)

 		# Combine both for final hash
 		combined_string = f'{parent_branch_path_string}|{attributes_string}'
--- a/browser_use/sync/service.py
+++ b/browser_use/sync/service.py
@@ -113,14 +113,14 @@ class CloudSync:
 						f'Failed to send sync event: POST {response.request.url} {response.status_code} - {response.text}'
 					)
 		except httpx.TimeoutException:
-			logger.warning(f'Event send timed out after 10 seconds: {event}')
+			logger.debug(f'Event send timed out after 10 seconds: {event}')
 		except httpx.ConnectError as e:
 			# logger.warning(f'⚠️ Failed to connect to cloud service at {self.base_url}: {e}')
 			pass
 		except httpx.HTTPError as e:
-			logger.warning(f'HTTP error sending event {event}: {type(e).__name__}: {e}')
+			logger.debug(f'HTTP error sending event {event}: {type(e).__name__}: {e}')
 		except Exception as e:
-			logger.warning(f'Unexpected error sending event {event}: {type(e).__name__}: {e}')
+			logger.debug(f'Unexpected error sending event {event}: {type(e).__name__}: {e}')

 	async def _background_auth(self, agent_session_id: str) -> None:
 		"""Run authentication in background or show cloud URL if already authenticated"""
--- a/browser_use/tools/service.py
+++ b/browser_use/tools/service.py
@@ -293,7 +293,7 @@ class Tools(Generic[Context]):
 				await event
 				# Wait for handler to complete and get any exception or metadata
 				click_metadata = await event.event_result(raise_if_any=True, raise_if_none=False)
-				memory = f'Clicked element with index {params.index}'
+				memory = 'Clicked element'

 				if params.while_holding_ctrl:
 					memory += ' and opened in new tab'
--- a/examples/features/rerun_history.py
+++ b/examples/features/rerun_history.py
@@ -24,43 +24,17 @@ from browser_use.llm.openai.chat import ChatOpenAI

 async def main():
 	# Example task to demonstrate history saving and rerunning
-	task = 'Go to GitHub and find the browser-use repository'
 	history_file = Path('agent_history.json')
+	task = 'Go to https://browser-use.github.io/stress-tests/challenges/ember-form.html and fill the form with example data.'
 	llm = ChatOpenAI(model='gpt-4.1-mini')

-	# Step 1: Run agent and save history
-	print('🚀 Running agent and saving history...')
-
-	agent = Agent(
-		task=task,
-		llm=llm,
-	)
-
-	# Run the agent
-	history = await agent.run(max_steps=5)
-
-	# Save the history for later rerun
+	agent = Agent(task=task, llm=llm, max_actions_per_step=1)
+	await agent.run(max_steps=5)
 	agent.save_history(history_file)

-	print(f'✅ History saved to {history_file}')
-	print(f'📊 Completed {len(history.history)} steps')
+	rerun_agent = Agent(task='', llm=llm)

-	# Step 2: Load and rerun the history
-	print('\n🔄 Loading and rerunning history...')
-
-	# Create new agent for rerunning (task can be empty since we're replaying)
-	rerun_agent = Agent(
-		task='',
-		llm=llm,
-	)
-
-	# Load and rerun the saved history
-	results = await rerun_agent.load_and_rerun(
-		history_file=history_file,
-		max_retries=3,  # Retry failed actions up to 3 times
-		skip_failures=True,  # Continue even if some actions fail
-		delay_between_actions=1.0,  # Wait 1 second between actions
-	)
+	await rerun_agent.load_and_rerun(history_file)


 if __name__ == '__main__':
--- a/tests/ci/test_browser_event_ClickElementEvent.py
+++ b/tests/ci/test_browser_event_ClickElementEvent.py
@@ -188,9 +188,7 @@ class TestClickElementEvent:
 		result_text = result.extracted_content or result.long_term_memory
 		# Core logic validation: Verify click was successful
 		assert result_text is not None
-		assert f'Clicked element with index {button_index}' in result_text, (
-			f'Expected click confirmation in result content, got: {result_text}'
-		)
+		assert 'Clicked element' in result_text, f'Expected click confirmation in result content, got: {result_text}'
 		# Note: The click action doesn't include button text in the result, only the index

 		# Verify the click actually had an effect on the page using CDP
@@ -262,9 +260,7 @@ class TestClickElementEvent:
 		assert isinstance(result, ActionResult)
 		result_text = result.extracted_content or result.long_term_memory
 		assert result_text is not None
-		assert f'Clicked element with index {link_index}' in result_text, (
-			f'Expected click confirmation in result content, got: {result_text}'
-		)
+		assert 'Clicked element' in result_text, f'Expected click confirmation in result content, got: {result_text}'

 		# Verify that a new tab was opened
 		tabs = await browser_session.get_tabs()