browser-use/browser_use/code_use/utils.py

"""Utility functions for code-use agent."""

import re


def truncate_message_content(content: str, max_length: int = 10000) -> str:
	"""Cap message content at max_length characters before it is stored in history.

	Content that already fits is returned unchanged. Otherwise the first
	max_length characters are kept and a marker stating how many characters
	were dropped is appended.
	"""
	overflow = len(content) - max_length
	if overflow <= 0:
		return content

	kept = content[:max_length]
	return f'{kept}\n\n[... truncated {overflow} characters for history]'


def detect_token_limit_issue(
	completion: str,
	completion_tokens: int | None,
	max_tokens: int | None,
	stop_reason: str | None,
) -> tuple[bool, str | None]:
	"""
	Detect if the LLM response hit token limits or is repetitive garbage.

	Returns: (is_problematic, error_message)
	"""
	# Check 1: Stop reason indicates max_tokens
	if stop_reason == 'max_tokens':
		return True, f'Response terminated due to max_tokens limit (stop_reason: {stop_reason})'

	# Check 2: Used 90%+ of max_tokens (if we have both values)
	if completion_tokens is not None and max_tokens is not None and max_tokens > 0:
		usage_ratio = completion_tokens / max_tokens
		if usage_ratio >= 0.9:
			return True, f'Response used {usage_ratio:.1%} of max_tokens ({completion_tokens}/{max_tokens})'

	# Check 3: Last 6 characters repeat 40+ times (repetitive garbage)
	if len(completion) >= 6:
		last_6 = completion[-6:]
		repetition_count = completion.count(last_6)
		if repetition_count >= 40:
			return True, f'Repetitive output detected: last 6 chars "{last_6}" appears {repetition_count} times'

	return False, None


def extract_url_from_task(task: str) -> str | None:
	"""Extract URL from task string using naive pattern matching."""
	# Remove email addresses from task before looking for URLs
	task_without_emails = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', task)

	# Look for common URL patterns
	patterns = [
		r'https?://[^\s<>"\']+',  # Full URLs with http/https
		r'(?:www\.)?[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?',  # Domain names with subdomains and optional paths
	]

	found_urls = []
	for pattern in patterns:
		matches = re.finditer(pattern, task_without_emails)
		for match in matches:
			url = match.group(0)

			# Remove trailing punctuation that's not part of URLs
			url = re.sub(r'[.,;:!?()\[\]]+$', '', url)
			# Add https:// if missing
			if not url.startswith(('http://', 'https://')):
				url = 'https://' + url
			found_urls.append(url)

	unique_urls = list(set(found_urls))
	# If multiple URLs found, skip auto-navigation to avoid ambiguity
	if len(unique_urls) > 1:
		return None

	# If exactly one URL found, return it
	if len(unique_urls) == 1:
		return unique_urls[0]

	return None


def extract_code_blocks(text: str) -> dict[str, str]:
	"""Extract all fenced code blocks from a markdown response.

	Supports:
	- ```python, ```js, ```javascript, ```bash, ```sh, ```shell, ```markdown, ```md
	- Named blocks: ```js variable_name → stored under the key 'variable_name'
	- Nested blocks: Use 4+ backticks for outer block when inner content has 3 backticks

	Keys of the returned dict:
	- named blocks: the variable name given after the language tag
	- unnamed python blocks: 'python_0', 'python_1', ... in order of appearance;
	  'python' additionally holds the content of 'python_0' for backward compat
	- unnamed js / bash / markdown blocks: 'js' / 'bash' / 'markdown' (the last one wins)
	- when there is no unnamed python block and no block named 'python', all
	  unlabelled ``` blocks are joined with a blank line and stored as 'python'

	Blocks with an unrecognised language tag, or with no content, are ignored.

	Note: Python blocks are NOT combined. Each python block is kept separately
	to allow sequential execution with JS/bash blocks in between.

	Args:
		text: Markdown text (typically an LLM response).

	Returns:
		Dict mapping block_name -> content, with trailing whitespace stripped from
		the content and leading whitespace preserved (indentation matters).
	"""
	# Language tag (lower-cased) -> normalized block type
	lang_aliases = {
		'python': 'python',
		'js': 'js',
		'javascript': 'js',
		'bash': 'bash',
		'sh': 'bash',
		'shell': 'bash',
		'markdown': 'markdown',
		'md': 'markdown',
	}

	# Opening fence of 3+ backticks, language tag, optional variable name, then the
	# body up to a closing fence of the same length (backreference \1).
	# The separator before the variable name is [ \t]+ rather than \s+ so that it
	# cannot cross the newline and swallow a single-word first line of the body as
	# the block name. Trailing spaces/tabs after the tag are tolerated.
	pattern = r'(`{3,})(\w+)(?:[ \t]+(\w+))?[ \t]*\n(.*?)\1(?:\n|$)'

	blocks: dict[str, str] = {}
	python_block_counter = 0

	for match in re.finditer(pattern, text, re.DOTALL):
		lang_normalized = lang_aliases.get(match.group(2).lower())
		if lang_normalized is None:
			# Unknown language, skip
			continue

		# Only strip trailing whitespace, preserve leading for indentation
		content = match.group(4).rstrip()
		if not content:
			continue

		var_name = match.group(3)
		if var_name:
			# Named block - use the variable name
			blocks[var_name] = content
		elif lang_normalized == 'python':
			# Unnamed Python blocks - give each a unique key to preserve order
			blocks[f'python_{python_block_counter}'] = content
			python_block_counter += 1
		else:
			# Other unnamed blocks (js, bash, markdown) - keep last one only
			blocks[lang_normalized] = content

	# Expose the first unnamed python block as 'python' for backward compat
	if python_block_counter > 0:
		blocks['python'] = blocks['python_0']

	# Fallback: if no python block but there's generic ``` block, treat as python
	if python_block_counter == 0 and 'python' not in blocks:
		# Drop the labelled blocks first. Otherwise the closing fence of one labelled
		# block pairs up with the opening fence of the next one and the prose
		# between them is mistaken for an unlabelled code block.
		unlabelled_text = re.sub(pattern, '\n', text, flags=re.DOTALL)
		generic_matches = re.findall(r'```\n(.*?)```', unlabelled_text, re.DOTALL)
		combined = '\n\n'.join(m.strip() for m in generic_matches if m.strip())
		if combined:
			blocks['python'] = combined

	return blocks