mirror of
https://github.com/browser-use/browser-use
synced 2026-04-22 17:45:09 +02:00
151 lines
5.1 KiB
Python
151 lines
5.1 KiB
Python
"""Utility functions for code-use agent."""
|
|
|
|
import re
|
|
|
|
|
|
def truncate_message_content(content: str, max_length: int = 10000) -> str:
    """Cap ``content`` at ``max_length`` characters for storage in history.

    Content that already fits is returned unchanged. Otherwise the first
    ``max_length`` characters are kept and a marker stating how many
    characters were dropped is appended.
    """
    overflow = len(content) - max_length
    if overflow <= 0:
        # Nothing to cut: the content fits within the limit.
        return content

    kept = content[:max_length]
    return f'{kept}\n\n[... truncated {overflow} characters for history]'
|
|
|
|
|
|
def detect_token_limit_issue(
|
|
completion: str,
|
|
completion_tokens: int | None,
|
|
max_tokens: int | None,
|
|
stop_reason: str | None,
|
|
) -> tuple[bool, str | None]:
|
|
"""
|
|
Detect if the LLM response hit token limits or is repetitive garbage.
|
|
|
|
Returns: (is_problematic, error_message)
|
|
"""
|
|
# Check 1: Stop reason indicates max_tokens
|
|
if stop_reason == 'max_tokens':
|
|
return True, f'Response terminated due to max_tokens limit (stop_reason: {stop_reason})'
|
|
|
|
# Check 2: Used 90%+ of max_tokens (if we have both values)
|
|
if completion_tokens is not None and max_tokens is not None and max_tokens > 0:
|
|
usage_ratio = completion_tokens / max_tokens
|
|
if usage_ratio >= 0.9:
|
|
return True, f'Response used {usage_ratio:.1%} of max_tokens ({completion_tokens}/{max_tokens})'
|
|
|
|
# Check 3: Last 6 characters repeat 40+ times (repetitive garbage)
|
|
if len(completion) >= 6:
|
|
last_6 = completion[-6:]
|
|
repetition_count = completion.count(last_6)
|
|
if repetition_count >= 40:
|
|
return True, f'Repetitive output detected: last 6 chars "{last_6}" appears {repetition_count} times'
|
|
|
|
return False, None
|
|
|
|
|
|
def extract_url_from_task(task: str) -> str | None:
|
|
"""Extract URL from task string using naive pattern matching."""
|
|
# Remove email addresses from task before looking for URLs
|
|
task_without_emails = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', task)
|
|
|
|
# Look for common URL patterns
|
|
patterns = [
|
|
r'https?://[^\s<>"\']+', # Full URLs with http/https
|
|
r'(?:www\.)?[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?', # Domain names with subdomains and optional paths
|
|
]
|
|
|
|
found_urls = []
|
|
for pattern in patterns:
|
|
matches = re.finditer(pattern, task_without_emails)
|
|
for match in matches:
|
|
url = match.group(0)
|
|
|
|
# Remove trailing punctuation that's not part of URLs
|
|
url = re.sub(r'[.,;:!?()\[\]]+$', '', url)
|
|
# Add https:// if missing
|
|
if not url.startswith(('http://', 'https://')):
|
|
url = 'https://' + url
|
|
found_urls.append(url)
|
|
|
|
unique_urls = list(set(found_urls))
|
|
# If multiple URLs found, skip auto-navigation to avoid ambiguity
|
|
if len(unique_urls) > 1:
|
|
return None
|
|
|
|
# If exactly one URL found, return it
|
|
if len(unique_urls) == 1:
|
|
return unique_urls[0]
|
|
|
|
return None
|
|
|
|
|
|
def extract_code_blocks(text: str) -> dict[str, str]:
|
|
"""Extract all code blocks from markdown response.
|
|
|
|
Supports:
|
|
- ```python, ```js, ```javascript, ```bash, ```markdown, ```md
|
|
- Named blocks: ```js variable_name → saved as 'variable_name' in namespace
|
|
- Nested blocks: Use 4+ backticks for outer block when inner content has 3 backticks
|
|
|
|
Returns dict mapping block_name -> content
|
|
|
|
Note: Python blocks are NO LONGER COMBINED. Each python block executes separately
|
|
to allow sequential execution with JS/bash blocks in between.
|
|
"""
|
|
# Pattern to match code blocks with language identifier and optional variable name
|
|
# Matches: ```lang\n or ```lang varname\n or ````+lang\n (4+ backticks for nested blocks)
|
|
# Uses non-greedy matching and backreferences to match opening/closing backticks
|
|
pattern = r'(`{3,})(\w+)(?:\s+(\w+))?\n(.*?)\1(?:\n|$)'
|
|
matches = re.findall(pattern, text, re.DOTALL)
|
|
|
|
blocks: dict[str, str] = {}
|
|
python_block_counter = 0
|
|
|
|
for backticks, lang, var_name, content in matches:
|
|
lang = lang.lower()
|
|
|
|
# Normalize language names
|
|
if lang in ('javascript', 'js'):
|
|
lang_normalized = 'js'
|
|
elif lang in ('markdown', 'md'):
|
|
lang_normalized = 'markdown'
|
|
elif lang in ('sh', 'shell'):
|
|
lang_normalized = 'bash'
|
|
elif lang == 'python':
|
|
lang_normalized = 'python'
|
|
else:
|
|
# Unknown language, skip
|
|
continue
|
|
|
|
# Only process supported types
|
|
if lang_normalized in ('python', 'js', 'bash', 'markdown'):
|
|
content = content.rstrip() # Only strip trailing whitespace, preserve leading for indentation
|
|
if content:
|
|
# Determine the key to use
|
|
if var_name:
|
|
# Named block - use the variable name
|
|
block_key = var_name
|
|
blocks[block_key] = content
|
|
elif lang_normalized == 'python':
|
|
# Unnamed Python blocks - give each a unique key to preserve order
|
|
block_key = f'python_{python_block_counter}'
|
|
blocks[block_key] = content
|
|
python_block_counter += 1
|
|
else:
|
|
# Other unnamed blocks (js, bash, markdown) - keep last one only
|
|
blocks[lang_normalized] = content
|
|
|
|
# If we have multiple python blocks, mark the first one as 'python' for backward compat
|
|
if python_block_counter > 0:
|
|
blocks['python'] = blocks['python_0']
|
|
|
|
# Fallback: if no python block but there's generic ``` block, treat as python
|
|
if python_block_counter == 0 and 'python' not in blocks:
|
|
generic_pattern = r'```\n(.*?)```'
|
|
generic_matches = re.findall(generic_pattern, text, re.DOTALL)
|
|
if generic_matches:
|
|
combined = '\n\n'.join(m.strip() for m in generic_matches if m.strip())
|
|
if combined:
|
|
blocks['python'] = combined
|
|
|
|
return blocks
|