From 65f87b7fcae6c329cb1ccb3fd7154ec5990703a1 Mon Sep 17 00:00:00 2001
From: Laith Weinberger <70768382+laithrw@users.noreply.github.com>
Date: Sat, 11 Apr 2026 18:16:24 -0400
Subject: [PATCH] fix sensitive_data redaction order to prevent substring leaks

---
 browser_use/agent/message_manager/service.py | 27 ++++++--------------
 browser_use/agent/views.py                   | 21 +++------------
 browser_use/utils.py                         | 24 +++++++++++++++++
 3 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py
index a2be2883c..6c7cae11a 100644
--- a/browser_use/agent/message_manager/service.py
+++ b/browser_use/agent/message_manager/service.py
@@ -25,7 +25,12 @@ from browser_use.llm.messages import (
 	UserMessage,
 )
 from browser_use.observability import observe_debug
-from browser_use.utils import match_url_with_domain_pattern, time_execution_sync
+from browser_use.utils import (
+	collect_sensitive_data_values,
+	match_url_with_domain_pattern,
+	redact_sensitive_string,
+	time_execution_sync,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -573,30 +578,14 @@ class MessageManager:
 			if not self.sensitive_data:
 				return value
 
-			# Collect all sensitive values, immediately converting old format to new format
-			sensitive_values: dict[str, str] = {}
-
-			# Process all sensitive data entries
-			for key_or_domain, content in self.sensitive_data.items():
-				if isinstance(content, dict):
-					# Already in new format: {domain: {key: value}}
-					for key, val in content.items():
-						if val:  # Skip empty values
-							sensitive_values[key] = val
-				elif content:  # Old format: {key: value} - convert to new format internally
-					# We treat this as if it was {'http*://*': {key_or_domain: content}}
-					sensitive_values[key_or_domain] = content
+			sensitive_values = collect_sensitive_data_values(self.sensitive_data)
 
 			# If there are no valid sensitive data entries, just return the original value
 			if not sensitive_values:
 				logger.warning('No valid entries found in sensitive_data dictionary')
 				return value
 
-			# Replace all valid sensitive data values with their placeholder tags
-			for key, val in sensitive_values.items():
-				value = value.replace(val, f'{key}')
-
-			return value
+			return redact_sensitive_string(value, sensitive_values)
 
 		if isinstance(message.content, str):
 			message.content = replace_sensitive(message.content)
diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py
index a7209378f..dbec9a534 100644
--- a/browser_use/agent/views.py
+++ b/browser_use/agent/views.py
@@ -27,6 +27,7 @@ from browser_use.filesystem.file_system import FileSystemState
 from browser_use.llm.base import BaseChatModel
 from browser_use.tokens.views import UsageSummary
 from browser_use.tools.registry.views import ActionModel
+from browser_use.utils import collect_sensitive_data_values, redact_sensitive_string
 
 logger = logging.getLogger(__name__)
 
@@ -512,29 +513,13 @@ class AgentHistory(BaseModel):
 		if not sensitive_data:
 			return value
 
-		# Collect all sensitive values, immediately converting old format to new format
-		sensitive_values: dict[str, str] = {}
-
-		# Process all sensitive data entries
-		for key_or_domain, content in sensitive_data.items():
-			if isinstance(content, dict):
-				# Already in new format: {domain: {key: value}}
-				for key, val in content.items():
-					if val:  # Skip empty values
-						sensitive_values[key] = val
-			elif content:  # Old format: {key: value} - convert to new format internally
-				# We treat this as if it was {'http*://*': {key_or_domain: content}}
-				sensitive_values[key_or_domain] = content
+		sensitive_values = collect_sensitive_data_values(sensitive_data)
 
 		# If there are no valid sensitive data entries, just return the original value
 		if not sensitive_values:
 			return value
 
-		# Replace all valid sensitive data values with their placeholder tags
-		for key, val in sensitive_values.items():
-			value = value.replace(val, f'{key}')
-
-		return value
+		return redact_sensitive_string(value, sensitive_values)
 
 	def _filter_sensitive_data_from_dict(
 		self, data: dict[str, Any], sensitive_data: dict[str, str | dict[str, str]] | None
diff --git a/browser_use/utils.py b/browser_use/utils.py
index 5661c9f34..a949aa77d 100644
--- a/browser_use/utils.py
+++ b/browser_use/utils.py
@@ -31,6 +31,30 @@ _openai_bad_request_error: type | None = None
 _groq_bad_request_error: type | None = None
 
 
+def collect_sensitive_data_values(sensitive_data: dict[str, str | dict[str, str]] | None) -> dict[str, str]:
+	"""Flatten legacy and domain-scoped sensitive data into placeholder -> value mappings."""
+	if not sensitive_data:
+		return {}
+
+	sensitive_values: dict[str, str] = {}
+	for key_or_domain, content in sensitive_data.items():
+		if isinstance(content, dict):
+			for key, val in content.items():
+				if val:
+					sensitive_values[key] = val
+		elif content:
+			sensitive_values[key_or_domain] = content
+
+	return sensitive_values
+
+
+def redact_sensitive_string(value: str, sensitive_values: dict[str, str]) -> str:
+	"""Replace sensitive values with placeholders, longest matches first to avoid partial leaks."""
+	for key, secret in sorted(sensitive_values.items(), key=lambda item: len(item[1]), reverse=True):
+		value = value.replace(secret, f'{key}')
+	return value
+
+
 def _get_openai_bad_request_error() -> type | None:
 	"""Lazy loader for OpenAI BadRequestError."""
 	global _openai_bad_request_error