From bdc60097aa9e77c50c5175dd9182af177ad51fa5 Mon Sep 17 00:00:00 2001 From: yrk <12787191+yrk15994109427@user.noreply.gitee.com> Date: Tue, 13 May 2025 17:31:28 +0800 Subject: [PATCH 01/18] Add ModelScope example and documentation --- .env.example | 1 + README.md | 1 + docs/customize/supported-models.mdx | 30 +++++++++++++++++++++ docs/development/local-setup.mdx | 1 + examples/models/modelscope_example.py | 39 +++++++++++++++++++++++++++ 5 files changed, 72 insertions(+) create mode 100644 examples/models/modelscope_example.py diff --git a/.env.example b/.env.example index 1128ae2e0..add5f7eda 100644 --- a/.env.example +++ b/.env.example @@ -6,6 +6,7 @@ GEMINI_API_KEY= DEEPSEEK_API_KEY= GROK_API_KEY= NOVITA_API_KEY= +MODELSCOPE_API_KEY= # Set to false to disable anonymized telemetry ANONYMIZED_TELEMETRY=true diff --git a/README.md b/README.md index ce72d3c61..04e160065 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ GEMINI_API_KEY= DEEPSEEK_API_KEY= GROK_API_KEY= NOVITA_API_KEY= +MODELSCOPE_API_KEY= ``` For other settings, models, and more, check out the [documentation šŸ“•](https://docs.browser-use.com). diff --git a/docs/customize/supported-models.mdx b/docs/customize/supported-models.mdx index 67cc92e23..70c258d19 100644 --- a/docs/customize/supported-models.mdx +++ b/docs/customize/supported-models.mdx @@ -283,6 +283,36 @@ Required environment variables: GROK_API_KEY= ``` +### ModelScope +[ModelScope](https://modelscope.cn/docs/model-service/API-Inference/intro) is an LLM API provider that offers a wide range of models. Note: choose a model that supports function calling. 
+ +```python +from langchain_openai import ChatOpenAI +from browser_use import Agent +from pydantic import SecretStr +from dotenv import load_dotenv +import os + +load_dotenv() +api_key = os.getenv("MODELSCOPE_API_KEY") + +# Initialize the model +llm = ChatOpenAI(base_url='https://api-inference.modelscope.cn/v1/', model='Qwen/Qwen2.5-VL-72B-Instruct', api_key=SecretStr(api_key)) + +# Create agent with the model +agent = Agent( + task="Your task here", + llm=llm, + use_vision=False +) +``` + +Required environment variables: + +```bash .env +MODELSCOPE_API_KEY= +``` + ## Coming soon (We are working on it) - Groq diff --git a/docs/development/local-setup.mdx b/docs/development/local-setup.mdx index b8566d6c5..84e65a459 100644 --- a/docs/development/local-setup.mdx +++ b/docs/development/local-setup.mdx @@ -53,6 +53,7 @@ GEMINI_API_KEY= DEEPSEEK_API_KEY= GROK_API_KEY= NOVITA_API_KEY= +MODELSCOPE_API_KEY= ``` diff --git a/examples/models/modelscope_example.py b/examples/models/modelscope_example.py new file mode 100644 index 000000000..77fb13bd0 --- /dev/null +++ b/examples/models/modelscope_example.py @@ -0,0 +1,39 @@ +""" +Simple try of the agent. + +@dev You need to add MODELSCOPE_API_KEY to your environment variables. 
+""" + +import asyncio +import os + +from dotenv import load_dotenv +from langchain_openai import ChatOpenAI +from pydantic import SecretStr + +from browser_use import Agent + +# dotenv +load_dotenv() + +api_key = os.getenv('MODELSCOPE_API_KEY', '') +if not api_key: + raise ValueError('MODELSCOPE_API_KEY is not set') + + +async def run_search(): + agent = Agent( + task=('go to amazon.com, search for laptop'), + llm=ChatOpenAI( + base_url='https://api-inference.modelscope.cn/v1/', + model='Qwen/QwQ-32B-Preview', + api_key=SecretStr(api_key), + ), + use_vision=False, + ) + + await agent.run() + + +if __name__ == '__main__': + asyncio.run(run_search()) \ No newline at end of file From 88b54eb73d6ece11e669e0f901de8ab1eceb6712 Mon Sep 17 00:00:00 2001 From: yrk <12787191+yrk15994109427@user.noreply.gitee.com> Date: Wed, 14 May 2025 11:11:48 +0800 Subject: [PATCH 02/18] Update documentation --- docs/customize/supported-models.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/customize/supported-models.mdx b/docs/customize/supported-models.mdx index 70c258d19..c5780e6eb 100644 --- a/docs/customize/supported-models.mdx +++ b/docs/customize/supported-models.mdx @@ -284,8 +284,8 @@ GROK_API_KEY= ``` ### ModelScope -[ModelScope](https://modelscope.cn/docs/model-service/API-Inference/intro) is an LLM API provider that offers a wide range of models. Note: choose a model that supports function calling. - +[ModelScope](https://modelscope.cn/docs/model-service/API-Inference/intro) is an LLM API provider that offers a wide range of models. Note: choose a model that supports function calling. And you need to bind your Alibaba Cloud account before using the API. +For more details about ModelScope API, please refer to this link. 
```python from langchain_openai import ChatOpenAI from browser_use import Agent From 366fc705a9ee1478b84fc6ed5e58e78c8be300d9 Mon Sep 17 00:00:00 2001 From: yrk111222 <2493404415@qq.com> Date: Wed, 14 May 2025 11:16:00 +0800 Subject: [PATCH 03/18] Update supported-models.mdx --- docs/customize/supported-models.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/customize/supported-models.mdx b/docs/customize/supported-models.mdx index c5780e6eb..ec6b69c48 100644 --- a/docs/customize/supported-models.mdx +++ b/docs/customize/supported-models.mdx @@ -303,7 +303,7 @@ llm = ChatOpenAI(base_url='https://api-inference.modelscope.cn/v1/', model='Qwen agent = Agent( task="Your task here", llm=llm, - use_vision=False + use_vision=True ) ``` From 3b392581d849ee85858dbedbe6b9244c76f29191 Mon Sep 17 00:00:00 2001 From: yrk <12787191+yrk15994109427@user.noreply.gitee.com> Date: Wed, 14 May 2025 11:22:33 +0800 Subject: [PATCH 04/18] modify some details --- examples/models/modelscope_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/modelscope_example.py b/examples/models/modelscope_example.py index 77fb13bd0..6af7011a7 100644 --- a/examples/models/modelscope_example.py +++ b/examples/models/modelscope_example.py @@ -36,4 +36,4 @@ async def run_search(): if __name__ == '__main__': - asyncio.run(run_search()) \ No newline at end of file + asyncio.run(run_search()) From 8406528c0855ae7f121cc24661720cf9530367f3 Mon Sep 17 00:00:00 2001 From: quanglm Date: Tue, 2 Sep 2025 12:34:17 +0700 Subject: [PATCH 05/18] fix: MCP Server extract_content always returning 'No content extracted' --- browser_use/mcp/server.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/browser_use/mcp/server.py b/browser_use/mcp/server.py index 4debd23aa..46649c94d 100644 --- a/browser_use/mcp/server.py +++ b/browser_use/mcp/server.py @@ -722,10 +722,15 @@ class BrowserUseServer: ExtractAction = create_model( 
'ExtractAction', __base__=ActionModel, - extract_structured_data=(dict[str, Any], {'query': query, 'extract_links': extract_links}), + extract_structured_data=dict[str, Any], ) - action = ExtractAction() + # Use model_validate because Pyright do not understand the dynamic model + action = ExtractAction.model_validate( + { + 'extract_structured_data': {'query': query, 'extract_links': extract_links}, + } + ) action_result = await self.tools.act( action=action, browser_session=self.browser_session, From 491df67c3e628204841873c841717e547b38a2af Mon Sep 17 00:00:00 2001 From: lienminhquang <40737537+lienminhquang@users.noreply.github.com> Date: Wed, 3 Sep 2025 09:30:30 +0700 Subject: [PATCH 06/18] fix: typo Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com> --- browser_use/mcp/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/mcp/server.py b/browser_use/mcp/server.py index 6ecc4fb82..249311e9e 100644 --- a/browser_use/mcp/server.py +++ b/browser_use/mcp/server.py @@ -725,7 +725,7 @@ class BrowserUseServer: extract_structured_data=dict[str, Any], ) - # Use model_validate because Pyright do not understand the dynamic model + # Use model_validate because Pyright does not understand the dynamic model action = ExtractAction.model_validate( { 'extract_structured_data': {'query': query, 'extract_links': extract_links}, From 7cb955c21443f1bb8120aad6ae6b8e2776800c6e Mon Sep 17 00:00:00 2001 From: yrk <2493404415@qq.com> Date: Mon, 8 Sep 2025 14:41:41 +0800 Subject: [PATCH 07/18] Modify sample code --- examples/models/modelscope_example.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/models/modelscope_example.py b/examples/models/modelscope_example.py index 6af7011a7..97d3a8e13 100644 --- a/examples/models/modelscope_example.py +++ b/examples/models/modelscope_example.py @@ -8,8 +8,7 @@ import asyncio import os from dotenv import load_dotenv -from langchain_openai 
import ChatOpenAI -from pydantic import SecretStr +from browser_use import Agent, ChatOpenAI from browser_use import Agent @@ -23,11 +22,12 @@ if not api_key: async def run_search(): agent = Agent( - task=('go to amazon.com, search for laptop'), + # task=('go to amazon.com, search for laptop'), + task=('go to google, search for modelscope'), llm=ChatOpenAI( base_url='https://api-inference.modelscope.cn/v1/', - model='Qwen/QwQ-32B-Preview', - api_key=SecretStr(api_key), + model='Qwen/Qwen2.5-VL-72B-Instruct', + api_key=api_key ), use_vision=False, ) From 1476d5b1d80883414ea448007b33134bb2ee6f14 Mon Sep 17 00:00:00 2001 From: yrk <2493404415@qq.com> Date: Mon, 8 Sep 2025 15:09:20 +0800 Subject: [PATCH 08/18] add ModelScope in .mdx docs --- docs/customize/agent/supported-models.mdx | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/docs/customize/agent/supported-models.mdx b/docs/customize/agent/supported-models.mdx index 499a14976..5b5883189 100644 --- a/docs/customize/agent/supported-models.mdx +++ b/docs/customize/agent/supported-models.mdx @@ -243,6 +243,33 @@ Required environment variables: ALIBABA_CLOUD= ``` +## ModelScope [example](https://github.com/browser-use/browser-use/blob/main/examples/models/modelscope_example.py) + +```python +from browser_use import Agent, ChatOpenAI +from dotenv import load_dotenv +import os + +load_dotenv() + +# Get API key from https://www.modelscope.cn/docs/model-service/API-Inference/intro +api_key = os.getenv('MODELSCOPE_API_KEY') +base_url = 'https://api-inference.modelscope.cn/v1/' + +llm = ChatOpenAI(model='Qwen/Qwen2.5-VL-72B-Instruct', api_key=api_key, base_url=base_url) + +agent = Agent( + task="Your task here", + llm=llm, + use_vision=True +) +``` + +Required environment variables: + +```bash .env +MODELSCOPE_API_KEY= +``` ## Other models (DeepSeek, Novita, X...) 
From 4e1386fe3c5ba4c53efd38b19dfa2467a1bc7499 Mon Sep 17 00:00:00 2001 From: yrk <2493404415@qq.com> Date: Mon, 8 Sep 2025 15:16:42 +0800 Subject: [PATCH 09/18] Lint --- examples/models/modelscope_example.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/examples/models/modelscope_example.py b/examples/models/modelscope_example.py index 97d3a8e13..2be79cea4 100644 --- a/examples/models/modelscope_example.py +++ b/examples/models/modelscope_example.py @@ -8,9 +8,8 @@ import asyncio import os from dotenv import load_dotenv -from browser_use import Agent, ChatOpenAI -from browser_use import Agent +from browser_use import Agent, ChatOpenAI # dotenv load_dotenv() @@ -24,11 +23,7 @@ async def run_search(): agent = Agent( # task=('go to amazon.com, search for laptop'), task=('go to google, search for modelscope'), - llm=ChatOpenAI( - base_url='https://api-inference.modelscope.cn/v1/', - model='Qwen/Qwen2.5-VL-72B-Instruct', - api_key=api_key - ), + llm=ChatOpenAI(base_url='https://api-inference.modelscope.cn/v1/', model='Qwen/Qwen2.5-VL-72B-Instruct', api_key=api_key), use_vision=False, ) From 54dae0f15ac14b298d1e1ab7d20b27e218f504e5 Mon Sep 17 00:00:00 2001 From: abelsummation Date: Sat, 13 Sep 2025 19:32:46 +0400 Subject: [PATCH 10/18] Add parallel multi-agent example - Demonstrates parallel browser agent execution with isolated browser instances - Each agent gets its own browser session with custom temp directory - Proper browser session management and cleanup - Configurable task at top of file for easy testing - Shows how to run multiple agents simultaneously on different subtasks --- examples/custom-functions/parallel_agents.py | 312 +++++++++++++++++++ 1 file changed, 312 insertions(+) create mode 100644 examples/custom-functions/parallel_agents.py diff --git a/examples/custom-functions/parallel_agents.py b/examples/custom-functions/parallel_agents.py new file mode 100644 index 000000000..6a5958ea3 --- /dev/null +++ 
b/examples/custom-functions/parallel_agents.py @@ -0,0 +1,312 @@ +""" +Simple parallel multi-agent example. + +This launches multiple agents in parallel to work on different tasks simultaneously. +No complex orchestrator - just direct parallel execution. + +@file purpose: Demonstrates parallel multi-agent execution using asyncio +""" + +import asyncio +import os +import sys +from typing import List + +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from dotenv import load_dotenv +load_dotenv() + +from browser_use import Agent +from browser_use.llm.google import ChatGoogle + +# ============================================================================ +# šŸ”§ SIMPLE CONFIGURATION - CHANGE THIS TO YOUR DESIRED TASK +# ============================================================================ + +MAIN_TASK = "find age of ronaldo and messi" + +# Simple test - let's start with just one person to see what happens +# MAIN_TASK = "find age of elon musk" + +# ============================================================================ + + +async def create_subtasks(main_task: str, llm) -> list[str]: + """ + Use LLM to break down main task into logical subtasks + + Real examples of how this works: + + Input: "what is the revenue of nvidia, microsoft, tesla" + Output: [ + "Find Nvidia's current revenue and financial data", + "Find Microsoft's current revenue and financial data", + "Find Tesla's current revenue and financial data" + ] + + Input: "what are ages of musk, altman, bezos, gates" + Output: [ + "Find Elon Musk's age and birth date", + "Find Sam Altman's age and birth date", + "Find Jeff Bezos's age and birth date", + "Find Bill Gates's age and birth date" + ] + + Input: "what is the population of tokyo, new york, london, paris" + Output: [ + "Find Tokyo's current population", + "Find New York's current population", + "Find London's current population", + "Find Paris's current population" + ] + + Input: "name top 10 
yc companies by revenue" + Output: [ + "Research Y Combinator's top companies by revenue", + "Find revenue data for top YC companies", + "Compile list of top 10 YC companies by revenue" + ] + """ + + prompt = f""" + Break down this main task into individual, separate subtasks where each subtask focuses on ONLY ONE specific person, company, or item: + + Main task: {main_task} + + RULES: + - Each subtask must focus on ONLY ONE person/company/item + - Do NOT combine multiple people/companies/items in one subtask + - Each subtask should be completely independent + - If the main task mentions multiple items, create one subtask per item + + Return only the subtasks, one per line, without numbering or bullets. + Each line should focus on exactly ONE person/company/item. + """ + + try: + # Use the correct method for ChatGoogle + response = await llm.ainvoke(prompt) + + # Debug: Print the response type and content + print(f"DEBUG: Response type: {type(response)}") + print(f"DEBUG: Response content: {response}") + + # Handle different response types - ChatGoogle returns string content + if hasattr(response, 'content'): + content = response.content + elif isinstance(response, str): + content = response + elif hasattr(response, 'text'): + content = response.text + else: + # Convert to string if it's some other type + content = str(response) + + # Split by newlines and clean up + subtasks = [task.strip() for task in content.strip().split('\n') if task.strip()] + + # Remove any numbering or bullets that the LLM might add + cleaned_subtasks = [] + for task in subtasks: + # Remove common prefixes like "1. ", "- ", "* ", etc. 
+ cleaned = task.lstrip('0123456789.-* ') + if cleaned: + cleaned_subtasks.append(cleaned) + + return cleaned_subtasks if cleaned_subtasks else simple_split_task(main_task) + except Exception as e: + print(f"Error creating subtasks: {e}") + # Fallback to simple split + return simple_split_task(main_task) + + +def simple_split_task(main_task: str) -> list[str]: + """Simple fallback: split task by common separators""" + task_lower = main_task.lower() + + # Try to split by common separators + if " and " in task_lower: + parts = main_task.split(" and ") + return [part.strip() for part in parts if part.strip()] + elif ", " in main_task: + parts = main_task.split(", ") + return [part.strip() for part in parts if part.strip()] + elif "," in main_task: + parts = main_task.split(",") + return [part.strip() for part in parts if part.strip()] + + # If no separators found, return the original task + return [main_task] + + +async def run_single_agent(task: str, llm, agent_id: int) -> tuple[int, str]: + """Run a single agent and return its result""" + print(f"šŸš€ Agent {agent_id} starting: {task}") + print(f" šŸ“ This agent will focus ONLY on: {task}") + print(f" 🌐 Creating isolated browser instance for agent {agent_id}") + + try: + # Create agent with its own browser session (separate browser instance) + from browser_use.browser import BrowserSession + from browser_use.browser.profile import BrowserProfile + import tempfile + + # Create a unique temp directory for this agent's browser data + temp_dir = tempfile.mkdtemp(prefix=f"browser_agent_{agent_id}_") + + # Create browser profile with custom user data directory and single tab focus + profile = BrowserProfile() + profile.user_data_dir = temp_dir + profile.headless = False # Set to True if you want headless mode + profile.keep_alive = False # Don't keep browser alive after task + + # Add custom args to prevent new tabs and popups + profile.args = [ + '--disable-popup-blocking', + '--disable-extensions', + 
'--disable-plugins', + '--disable-images', # Faster loading + '--no-first-run', + '--disable-default-apps', + '--disable-background-timer-throttling', + '--disable-backgrounding-occluded-windows', + '--disable-renderer-backgrounding', + ] + + # Create a new browser session for each agent with the custom profile + browser_session = BrowserSession(browser_profile=profile) + + # Debug: Check initial tab count + try: + await browser_session.start() + initial_tabs = await browser_session._cdp_get_all_pages() + print(f" šŸ“Š Agent {agent_id} initial tab count: {len(initial_tabs)}") + except Exception as e: + print(f" āš ļø Could not check initial tabs for agent {agent_id}: {e}") + + # Create agent with the dedicated browser session and disable auto URL detection + agent = Agent(task=task, llm=llm, browser_session=browser_session, preload=False) + + # Run the agent with timeout to prevent hanging + try: + result = await asyncio.wait_for(agent.run(), timeout=300) # 5 minute timeout + except asyncio.TimeoutError: + print(f"ā° Agent {agent_id} timed out after 5 minutes") + result = "Task timed out" + + # Debug: Check final tab count + try: + final_tabs = await browser_session._cdp_get_all_pages() + print(f" šŸ“Š Agent {agent_id} final tab count: {len(final_tabs)}") + for i, tab in enumerate(final_tabs): + print(f" Tab {i+1}: {tab.get('url', 'unknown')[:50]}...") + except Exception as e: + print(f" āš ļø Could not check final tabs for agent {agent_id}: {e}") + + # Extract clean result from the agent history + clean_result = extract_clean_result(result) + + # Close the browser session for this agent + try: + await browser_session.kill() + except Exception as e: + print(f"āš ļø Warning: Error closing browser for agent {agent_id}: {e}") + + print(f"āœ… Agent {agent_id} completed and browser closed: {task}") + + return agent_id, clean_result + + except Exception as e: + error_msg = f"Agent {agent_id} failed: {str(e)}" + print(f"āŒ {error_msg}") + return agent_id, error_msg + 
+ +def extract_clean_result(agent_result) -> str: + """Extract clean result from agent history""" + try: + # Get the last result from the agent history + if hasattr(agent_result, 'all_results') and agent_result.all_results: + last_result = agent_result.all_results[-1] + if hasattr(last_result, 'extracted_content') and last_result.extracted_content: + return last_result.extracted_content + + # Fallback to string representation + return str(agent_result) + except Exception: + return "Result extraction failed" + + +async def run_parallel_agents(): + """Run multiple agents in parallel on different tasks""" + + # Use Gemini 1.5 Flash + llm = ChatGoogle(model="gemini-1.5-flash") + + # Main task to break down - use the simple configuration + main_task = MAIN_TASK + + print(f"šŸŽÆ Main task: {main_task}") + print("🧠 Creating subtasks using LLM...") + + # Create subtasks using LLM + subtasks = await create_subtasks(main_task, llm) + + print(f"šŸ“‹ Created {len(subtasks)} subtasks:") + for i, task in enumerate(subtasks, 1): + print(f" {i}. 
{task}") + + print(f"\nšŸ”„ Starting {len(subtasks)} agents in parallel...") + print(f"šŸ” Each agent will get its own browser instance with exactly ONE tab") + print(f"šŸ“Š Expected: {len(subtasks)} browser instances, {len(subtasks)} tabs total") + + # Create tasks for parallel execution + agent_tasks = [ + run_single_agent(task, llm, i+1) + for i, task in enumerate(subtasks) + ] + + # Run all agents in parallel using asyncio.gather + results = await asyncio.gather(*agent_tasks) + + # Print results + print("\n" + "="*60) + print("šŸ“Š PARALLEL EXECUTION RESULTS") + print("="*60) + + for agent_id, result in results: + print(f"\nšŸ¤– Agent {agent_id} result:") + print(f"Task: {subtasks[agent_id-1]}") + print(f"Result: {result}") + print("-" * 50) + + print(f"\nšŸŽ‰ All {len(subtasks)} parallel agents completed!") + + +def main(): + """Main function to run parallel agents""" + # Check if Google API key is available + api_key = os.getenv('GOOGLE_API_KEY') + if not api_key: + print('āŒ Error: GOOGLE_API_KEY environment variable not set') + print('Please set your Google API key to use parallel agents') + print('You can set it with: export GOOGLE_API_KEY="your-key-here"') + sys.exit(1) + + # Check if API key looks valid (Google API keys are typically 39 characters) + if len(api_key) < 20: + print(f'āš ļø Warning: GOOGLE_API_KEY seems too short ({len(api_key)} characters)') + print('Google API keys are typically 39 characters long') + print('Continuing anyway, but this might cause authentication issues...') + + print('šŸš€ Starting parallel multi-agent example...') + print(f'šŸ“ Task: {MAIN_TASK}') + print('This will dynamically create agents based on task complexity') + print('-' * 60) + + asyncio.run(run_parallel_agents()) + + +if __name__ == "__main__": + main() \ No newline at end of file From 7173d196d2b86d75142e94ba4bbc0fd0433ab8a1 Mon Sep 17 00:00:00 2001 From: naaa760 Date: Sun, 14 Sep 2025 15:15:07 +0530 Subject: [PATCH 11/18] fix: replace hyphens with 
underscores in EventBus name generation Fixes random EventBus error when adding new tasks by ensuring agent ID suffix is always a valid Python identifier. --- browser_use/agent/service.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 2a46efc12..051a016e8 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -604,7 +604,9 @@ class Agent(Generic[Context, AgentStructuredOutput]): self._message_manager.add_new_task(new_task) # Mark as follow-up task and recreate eventbus (gets shut down after each run) self.state.follow_up_task = True - self.eventbus = EventBus(name=f'Agent_{str(self.id)[-self.state.n_steps :]}') + + agent_id_suffix = str(self.id)[-self.state.n_steps :].replace('-', '_') + self.eventbus = EventBus(name=f'Agent_{agent_id_suffix}') # Re-register cloud sync handler if it exists (if not disabled) if hasattr(self, 'cloud_sync') and self.cloud_sync and self.enable_cloud_sync: From bfbe1a8e8361d3a58d29f03328f78fb69f8190bb Mon Sep 17 00:00:00 2001 From: naaa760 Date: Sun, 14 Sep 2025 15:35:24 +0530 Subject: [PATCH 12/18] EventBus names now generate valid Python identifiers, preventing the AssertionError completely --- browser_use/agent/service.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 051a016e8..ba41b3287 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -604,8 +604,9 @@ class Agent(Generic[Context, AgentStructuredOutput]): self._message_manager.add_new_task(new_task) # Mark as follow-up task and recreate eventbus (gets shut down after each run) self.state.follow_up_task = True - - agent_id_suffix = str(self.id)[-self.state.n_steps :].replace('-', '_') + agent_id_suffix = str(self.id)[-4:].replace('-', '_') + if agent_id_suffix and agent_id_suffix[0].isdigit(): + agent_id_suffix = 'a' + agent_id_suffix self.eventbus = 
EventBus(name=f'Agent_{agent_id_suffix}') # Re-register cloud sync handler if it exists (if not disabled) From 166ea02173de455b97392f14777a831b2a7544a1 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 21 Sep 2025 03:31:55 +0000 Subject: [PATCH 13/18] Fix: Check for None in override_system_message Co-authored-by: mailmertunsal --- browser_use/agent/prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py index 8c7aa7348..1b0ad76f8 100644 --- a/browser_use/agent/prompts.py +++ b/browser_use/agent/prompts.py @@ -28,7 +28,7 @@ class SystemPrompt: self.use_thinking = use_thinking self.flash_mode = flash_mode prompt = '' - if override_system_message: + if override_system_message is not None: prompt = override_system_message else: self._load_prompt_template() From 03608b62a25240fa812aed6e68a3b1be9bece19a Mon Sep 17 00:00:00 2001 From: Chris Schnabl Date: Sun, 21 Sep 2025 21:38:42 -0700 Subject: [PATCH 14/18] Remove unused paramter --- browser_use/agent/service.py | 4 +--- browser_use/browser/events.py | 1 - browser_use/browser/session.py | 2 -- browser_use/mcp/server.py | 2 +- tests/ci/test_browser_event_ClickElementEvent.py | 14 +++++++------- .../test_browser_event_GetDropdownOptionsEvent.py | 14 +++++++------- ...ser_event_GetDropdownOptionsEvent_aria_menus.py | 6 +++--- tests/ci/test_browser_event_NavigateToUrlEvent.py | 2 +- tests/ci/test_browser_session_element_cache.py | 10 +++++----- tests/ci/test_browser_watchdog_screenshots.py | 6 +++--- tests/ci/test_tools.py | 4 ++-- tests/scripts/debug_iframe_scrolling.py | 2 +- 12 files changed, 31 insertions(+), 36 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index d96b67d7e..fe00573bb 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -679,7 +679,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Always take screenshots for all steps self.logger.debug('šŸ“ø 
Requesting browser state with include_screenshot=True') browser_state_summary = await self.browser_session.get_browser_state_summary( - cache_clickable_elements_hashes=True, include_screenshot=True, # always capture even if use_vision=False so that cloud sync is useful (it's fast now anyway) include_recent_events=self.include_recent_events, ) @@ -1660,7 +1659,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): # This prevents stale element detection but doesn't refresh before execution if action.get_index() is not None and i != 0: new_browser_state_summary = await self.browser_session.get_browser_state_summary( - cache_clickable_elements_hashes=False, include_screenshot=False, ) new_selector_map = new_browser_state_summary.dom_state.selector_map @@ -1887,7 +1885,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): """Execute a single step from history with element validation""" assert self.browser_session is not None, 'BrowserSession is not set up' state = await self.browser_session.get_browser_state_summary( - cache_clickable_elements_hashes=False, include_screenshot=False + include_screenshot=False ) if not state or not history_item.model_output: raise ValueError('Invalid state or model output') diff --git a/browser_use/browser/events.py b/browser_use/browser/events.py index 15f388bed..a89d9412c 100644 --- a/browser_use/browser/events.py +++ b/browser_use/browser/events.py @@ -190,7 +190,6 @@ class BrowserStateRequestEvent(BaseEvent[BrowserStateSummary]): include_dom: bool = True include_screenshot: bool = True - cache_clickable_elements_hashes: bool = True include_recent_events: bool = False event_timeout: float | None = _get_timeout('TIMEOUT_BrowserStateRequestEvent', 30.0) # seconds diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 83e046b58..6713947fa 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -1045,7 +1045,6 @@ class BrowserSession(BaseModel): 
@observe_debug(ignore_input=True, ignore_output=True, name='get_browser_state_summary') async def get_browser_state_summary( self, - cache_clickable_elements_hashes: bool = True, include_screenshot: bool = True, cached: bool = False, include_recent_events: bool = False, @@ -1072,7 +1071,6 @@ class BrowserSession(BaseModel): BrowserStateRequestEvent( include_dom=True, include_screenshot=include_screenshot, - cache_clickable_elements_hashes=cache_clickable_elements_hashes, include_recent_events=include_recent_events, ) ), diff --git a/browser_use/mcp/server.py b/browser_use/mcp/server.py index d9cec1e54..55dda4579 100644 --- a/browser_use/mcp/server.py +++ b/browser_use/mcp/server.py @@ -768,7 +768,7 @@ class BrowserUseServer: if not self.browser_session: return 'Error: No browser session active' - state = await self.browser_session.get_browser_state_summary(cache_clickable_elements_hashes=False) + state = await self.browser_session.get_browser_state_summary() result = { 'url': state.url, diff --git a/tests/ci/test_browser_event_ClickElementEvent.py b/tests/ci/test_browser_event_ClickElementEvent.py index 6a8b62684..6c2fa03cf 100644 --- a/tests/ci/test_browser_event_ClickElementEvent.py +++ b/tests/ci/test_browser_event_ClickElementEvent.py @@ -143,7 +143,7 @@ class TestClickElementEvent: await asyncio.sleep(0.5) # Give page time to load # Initialize the DOM state to populate the selector map - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map selector_map = await browser_session.get_selector_map() @@ -406,7 +406,7 @@ class TestClickElementEvent: await asyncio.sleep(0.5) # Get the clickable elements - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() selector_map = await browser_session.get_selector_map() # Find the inline element @@ -488,7 +488,7 @@ class 
TestClickElementEvent: await asyncio.sleep(0.5) # Get the clickable elements - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() selector_map = await browser_session.get_selector_map() # Find the block element inside inline @@ -576,7 +576,7 @@ class TestClickElementEvent: await asyncio.sleep(0.5) # Get the clickable elements - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() selector_map = await browser_session.get_selector_map() # Find the target element @@ -636,7 +636,7 @@ class TestClickElementEvent: await asyncio.sleep(0.5) # Get the clickable elements - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() selector_map = await browser_session.get_selector_map() # Find the file input @@ -699,7 +699,7 @@ class TestClickElementEvent: await asyncio.sleep(0.5) # Get the clickable elements - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() selector_map = await browser_session.get_selector_map() # Find the select element @@ -1098,7 +1098,7 @@ class TestClickElementEvent: await asyncio.sleep(0.5) # Initialize the DOM state to populate the selector map - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map selector_map = await browser_session.get_selector_map() diff --git a/tests/ci/test_browser_event_GetDropdownOptionsEvent.py b/tests/ci/test_browser_event_GetDropdownOptionsEvent.py index d582bee96..3d3193e25 100644 --- a/tests/ci/test_browser_event_GetDropdownOptionsEvent.py +++ b/tests/ci/test_browser_event_GetDropdownOptionsEvent.py @@ -286,7 +286,7 @@ class TestGetDropdownOptionsEvent: await 
tools.act(GoToUrlActionModel(**goto_action), browser_session) # Initialize the DOM state to populate the selector map - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map and find the select element selector_map = await browser_session.get_selector_map() @@ -344,7 +344,7 @@ class TestGetDropdownOptionsEvent: await tools.act(GoToUrlActionModel(**goto_action), browser_session) # Initialize the DOM state - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map and find the ARIA menu selector_map = await browser_session.get_selector_map() @@ -406,7 +406,7 @@ class TestGetDropdownOptionsEvent: await tools.act(GoToUrlActionModel(**goto_action), browser_session) # Initialize the DOM state - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map and find the custom dropdown selector_map = await browser_session.get_selector_map() @@ -495,7 +495,7 @@ class TestSelectDropdownOptionEvent: await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) # Initialize the DOM state - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map and find the select element selector_map = await browser_session.get_selector_map() @@ -543,7 +543,7 @@ class TestSelectDropdownOptionEvent: await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) # Initialize the DOM state - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map and find the ARIA menu selector_map = await browser_session.get_selector_map() @@ -595,7 +595,7 @@ class 
TestSelectDropdownOptionEvent: await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) # Initialize the DOM state - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map and find the custom dropdown selector_map = await browser_session.get_selector_map() @@ -643,7 +643,7 @@ class TestSelectDropdownOptionEvent: await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) # Initialize the DOM state - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map and find the select element selector_map = await browser_session.get_selector_map() diff --git a/tests/ci/test_browser_event_GetDropdownOptionsEvent_aria_menus.py b/tests/ci/test_browser_event_GetDropdownOptionsEvent_aria_menus.py index eddf4cb1a..54a1a07a1 100644 --- a/tests/ci/test_browser_event_GetDropdownOptionsEvent_aria_menus.py +++ b/tests/ci/test_browser_event_GetDropdownOptionsEvent_aria_menus.py @@ -165,7 +165,7 @@ class TestARIAMenuDropdown: await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) # Initialize the DOM state to populate the selector map - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map selector_map = await browser_session.get_selector_map() @@ -232,7 +232,7 @@ class TestARIAMenuDropdown: await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) # Initialize the DOM state to populate the selector map - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map selector_map = await browser_session.get_selector_map() @@ -302,7 +302,7 @@ class TestARIAMenuDropdown: await 
browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) # Initialize the DOM state to populate the selector map - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map selector_map = await browser_session.get_selector_map() diff --git a/tests/ci/test_browser_event_NavigateToUrlEvent.py b/tests/ci/test_browser_event_NavigateToUrlEvent.py index 198c05701..a6a008d4c 100644 --- a/tests/ci/test_browser_event_NavigateToUrlEvent.py +++ b/tests/ci/test_browser_event_NavigateToUrlEvent.py @@ -97,7 +97,7 @@ class TestNavigateToUrlEvent: # Test that get_state_summary works try: - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() assert False, 'Expected throw error when navigating to non-existent page' except Exception as e: pass diff --git a/tests/ci/test_browser_session_element_cache.py b/tests/ci/test_browser_session_element_cache.py index 293ab5526..132b8c716 100644 --- a/tests/ci/test_browser_session_element_cache.py +++ b/tests/ci/test_browser_session_element_cache.py @@ -88,7 +88,7 @@ async def test_assumption_1_dom_processing_works(browser_session, httpserver): await event.event_result(raise_if_any=True, raise_if_none=False) # Trigger DOM processing - state = await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=False) + state = await browser_session.get_browser_state_summary() print('DOM processing result:') print(f' - Elements found: {len(state.dom_state.selector_map)}') @@ -109,7 +109,7 @@ async def test_assumption_2_cached_selector_map_persists(browser_session, httpse await event.event_result(raise_if_any=True, raise_if_none=False) # Trigger DOM processing and cache - state = await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=False) + state = await browser_session.get_browser_state_summary() 
initial_selector_map = dict(state.dom_state.selector_map) # Check if cached selector map is still available @@ -136,7 +136,7 @@ async def test_assumption_3_action_gets_same_selector_map(browser_session, tools await event.event_result(raise_if_any=True, raise_if_none=False) # Trigger DOM processing and cache - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=False) + await browser_session.get_browser_state_summary() cached_selector_map = await browser_session.get_selector_map() print('Pre-action state:') @@ -174,7 +174,7 @@ async def test_assumption_4_click_action_specific_issue(browser_session, tools, await event.event_result(raise_if_any=True, raise_if_none=False) # Trigger DOM processing and cache - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=False) + await browser_session.get_browser_state_summary() cached_selector_map = await browser_session.get_selector_map() print('Pre-click state:') @@ -224,7 +224,7 @@ async def test_assumption_5_multiple_get_selector_map_calls(browser_session, htt await event.event_result(raise_if_any=True, raise_if_none=False) # Trigger DOM processing and cache - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=False) + await browser_session.get_browser_state_summary() # Call get_selector_map multiple times map1 = await browser_session.get_selector_map() diff --git a/tests/ci/test_browser_watchdog_screenshots.py b/tests/ci/test_browser_watchdog_screenshots.py index 001493bf1..ec8900c5a 100644 --- a/tests/ci/test_browser_watchdog_screenshots.py +++ b/tests/ci/test_browser_watchdog_screenshots.py @@ -104,7 +104,7 @@ class TestHeadlessScreenshots: await event.event_result(raise_if_any=True, raise_if_none=False) # Get state summary - state = await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=False) + state = await browser_session.get_browser_state_summary() # Verify screenshot is included assert state.screenshot is not 
None @@ -143,7 +143,7 @@ class TestHeadlessScreenshots: # Browser should auto-create a new page on about:blank with animation # With AboutBlankWatchdog, about:blank pages now have animated content, so they should have screenshots - state = await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=False) + state = await browser_session.get_browser_state_summary() assert state.screenshot is not None, 'Screenshot should not be None for animated about:blank pages' assert state.url == 'about:blank' or state.url.startswith('chrome://'), f'Expected empty page but got {state.url}' @@ -153,7 +153,7 @@ class TestHeadlessScreenshots: await event.event_result(raise_if_any=True, raise_if_none=False) # Get state with screenshot - state = await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=False) + state = await browser_session.get_browser_state_summary() # Should have a screenshot now assert state.screenshot is not None, 'Screenshot should not be None for real pages' assert isinstance(state.screenshot, str) diff --git a/tests/ci/test_tools.py b/tests/ci/test_tools.py index 228367619..bd4378b4e 100644 --- a/tests/ci/test_tools.py +++ b/tests/ci/test_tools.py @@ -413,7 +413,7 @@ class TestToolsIntegration: await asyncio.sleep(1.0) # Initialize the DOM state to populate the selector map - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Get the selector map selector_map = await browser_session.get_selector_map() @@ -540,7 +540,7 @@ class TestToolsIntegration: await asyncio.sleep(1.0) # populate the selector map with highlight indices - await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + await browser_session.get_browser_state_summary() # Now get the selector map which should contain our dropdown selector_map = await browser_session.get_selector_map() diff --git a/tests/scripts/debug_iframe_scrolling.py 
b/tests/scripts/debug_iframe_scrolling.py index e2914eac9..0debb6164 100644 --- a/tests/scripts/debug_iframe_scrolling.py +++ b/tests/scripts/debug_iframe_scrolling.py @@ -132,7 +132,7 @@ async def debug_iframe_scrolling(): print(f'\nšŸ“ø Capturing DOM state: {label}') state_event = browser_session.event_bus.dispatch( BrowserStateRequestEvent( - include_dom=True, include_screenshot=False, cache_clickable_elements_hashes=True, include_recent_events=False + include_dom=True, include_screenshot=False, include_recent_events=False ) ) browser_state = await state_event.event_result() From b3c52e8f7a2f9e33fc6a8342908f465e44bedeaa Mon Sep 17 00:00:00 2001 From: Smaran Nama Arunkumar Date: Mon, 22 Sep 2025 15:19:53 +0530 Subject: [PATCH 15/18] Changing Pydantic limit for Amazon Bedrock ARN in model The ARN of cross-region inference exceeds the limit of 100 characters, which is not allowing to make ChatAnthropicBedrock calls. The limit increase ensures we are able to use the inference endpoints with the ARN. 
--- browser_use/agent/cloud_events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browser_use/agent/cloud_events.py b/browser_use/agent/cloud_events.py index 83ca52a66..ed7b3c4b3 100644 --- a/browser_use/agent/cloud_events.py +++ b/browser_use/agent/cloud_events.py @@ -188,7 +188,7 @@ class CreateAgentTaskEvent(BaseEvent): user_id: str = Field(max_length=255) # Added for authorization checks device_id: str | None = Field(None, max_length=255) # Device ID for auth lookup agent_session_id: str - llm_model: str = Field(max_length=100) # LLMModel enum value as string + llm_model: str = Field(max_length=200) # LLMModel enum value as string stopped: bool = False paused: bool = False task: str = Field(max_length=MAX_TASK_LENGTH) From 84b7eb07eee5af6b683c4bd759c2c4dfdea25385 Mon Sep 17 00:00:00 2001 From: Chris Schnabl Date: Mon, 22 Sep 2025 10:49:55 -0700 Subject: [PATCH 16/18] Run pre-commit hooks --- browser_use/agent/service.py | 4 +--- tests/scripts/debug_iframe_scrolling.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index fe00573bb..51fd5de1f 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -1884,9 +1884,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): async def _execute_history_step(self, history_item: AgentHistory, delay: float) -> list[ActionResult]: """Execute a single step from history with element validation""" assert self.browser_session is not None, 'BrowserSession is not set up' - state = await self.browser_session.get_browser_state_summary( - include_screenshot=False - ) + state = await self.browser_session.get_browser_state_summary(include_screenshot=False) if not state or not history_item.model_output: raise ValueError('Invalid state or model output') updated_actions = [] diff --git a/tests/scripts/debug_iframe_scrolling.py b/tests/scripts/debug_iframe_scrolling.py index 0debb6164..7b02fba66 100644 --- 
a/tests/scripts/debug_iframe_scrolling.py +++ b/tests/scripts/debug_iframe_scrolling.py @@ -131,9 +131,7 @@ async def debug_iframe_scrolling(): """Capture DOM state and return analysis""" print(f'\nšŸ“ø Capturing DOM state: {label}') state_event = browser_session.event_bus.dispatch( - BrowserStateRequestEvent( - include_dom=True, include_screenshot=False, include_recent_events=False - ) + BrowserStateRequestEvent(include_dom=True, include_screenshot=False, include_recent_events=False) ) browser_state = await state_event.event_result() From 0a924437d6e46c27442731c3300187ee8df82c9a Mon Sep 17 00:00:00 2001 From: mertunsall Date: Tue, 23 Sep 2025 02:52:38 +0200 Subject: [PATCH 17/18] improve the pdf viewer prompt --- browser_use/agent/prompts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py index 1b0ad76f8..42ef64926 100644 --- a/browser_use/agent/prompts.py +++ b/browser_use/agent/prompts.py @@ -265,7 +265,8 @@ class AgentMessagePrompt: # Check if current page is a PDF viewer and add appropriate message pdf_message = '' if self.browser_state.is_pdf_viewer: - pdf_message = 'PDF viewer cannot be rendered. In this page, DO NOT use the extract_structured_data action as PDF content cannot be rendered. Use the read_file action on the downloaded PDF in available_file_paths to read the full content.\n\n' + pdf_message = 'PDF viewer cannot be rendered. In this page, DO NOT use the extract_structured_data action as PDF content cannot be rendered. 
' + pdf_message += 'Use the read_file action on the downloaded PDF in available_file_paths to read the full text content or scroll in the page to see images/figures if needed.\n\n' # Add recent events if available and requested recent_events_text = '' From 3c41727b55547737940ec578e37cdf319b4ca0c9 Mon Sep 17 00:00:00 2001 From: mertunsall Date: Tue, 23 Sep 2025 02:54:25 +0200 Subject: [PATCH 18/18] track session pdf urls and redownload if URL changes or if the file was left from previous session. dispatch navigation complete event even when switching tabs. this ensures auto pdf downloads work increase max pdf pages to 20 so that it works better :) --- browser_use/browser/session.py | 11 +++++ .../browser/watchdogs/downloads_watchdog.py | 40 +++++++++++++------ browser_use/filesystem/file_system.py | 2 +- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 83e046b58..f0ddf4d15 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -821,6 +821,17 @@ class BrowserSession(BaseModel): self.agent_focus = await self.get_or_create_cdp_session(target_id=last_target_id, focus=True) raise + # Dispatch NavigationCompleteEvent when tab focus changes + # This ensures PDF detection and downloads work when switching tabs + if event.target_id and event.url: + self.logger.debug(f'šŸ”„ Dispatching NavigationCompleteEvent for tab switch to {event.url[:50]}...') + await self.event_bus.dispatch( + NavigationCompleteEvent( + target_id=event.target_id, + url=event.url, + ) + ) + # self.logger.debug('šŸ”„ AgentFocusChangedEvent handler completed successfully') async def on_FileDownloadedEvent(self, event: FileDownloadedEvent) -> None: diff --git a/browser_use/browser/watchdogs/downloads_watchdog.py b/browser_use/browser/watchdogs/downloads_watchdog.py index f31d27b81..bc6f0d4c6 100644 ---
b/browser_use/browser/watchdogs/downloads_watchdog.py @@ -56,6 +56,7 @@ class DownloadsWatchdog(BaseWatchdog): _cdp_event_tasks: set[asyncio.Task] = PrivateAttr(default_factory=set) # Track CDP event handler tasks _cdp_downloads_info: dict[str, dict[str, Any]] = PrivateAttr(default_factory=dict) # Map guid -> info _use_js_fetch_for_local: bool = PrivateAttr(default=False) # Guard JS fetch path for local regular downloads + _session_pdf_urls: dict[str, str] = PrivateAttr(default_factory=dict) # URL -> path for PDFs downloaded this session async def on_BrowserLaunchEvent(self, event: BrowserLaunchEvent) -> None: self.logger.debug(f'[DownloadsWatchdog] Received BrowserLaunchEvent, EventBus ID: {id(self.event_bus)}') @@ -123,6 +124,7 @@ class DownloadsWatchdog(BaseWatchdog): self._sessions_with_listeners.clear() self._active_downloads.clear() self._pdf_viewer_cache.clear() + self._session_pdf_urls.clear() async def on_NavigationCompleteEvent(self, event: NavigationCompleteEvent) -> None: """Check for PDFs after navigation completes.""" @@ -801,13 +803,26 @@ class DownloadsWatchdog(BaseWatchdog): self.logger.debug(f'[DownloadsWatchdog] Generated filename: {pdf_filename}') - # Check if already downloaded by looking in the downloads directory + # Check if already downloaded in this session + self.logger.debug(f'[DownloadsWatchdog] PDF_URL: {pdf_url}, session_pdf_urls: {self._session_pdf_urls}') + if pdf_url in self._session_pdf_urls: + existing_path = self._session_pdf_urls[pdf_url] + self.logger.debug(f'[DownloadsWatchdog] PDF already downloaded in session: {existing_path}') + return existing_path + + # Generate unique filename if file exists from previous run downloads_dir = str(self.browser_session.browser_profile.downloads_path) - if os.path.exists(downloads_dir): - existing_files = os.listdir(downloads_dir) - if pdf_filename in existing_files: - self.logger.debug(f'[DownloadsWatchdog] PDF already downloaded: {pdf_filename}') - return None + os.makedirs(downloads_dir, 
exist_ok=True) + final_filename = pdf_filename + existing_files = os.listdir(downloads_dir) + if pdf_filename in existing_files: + # Generate unique name with (1), (2), etc. + base, ext = os.path.splitext(pdf_filename) + counter = 1 + while f'{base} ({counter}){ext}' in existing_files: + counter += 1 + final_filename = f'{base} ({counter}){ext}' + self.logger.debug(f'[DownloadsWatchdog] File exists, using: {final_filename}') self.logger.debug(f'[DownloadsWatchdog] Starting PDF download from: {pdf_url[:100]}...') @@ -858,12 +873,10 @@ class DownloadsWatchdog(BaseWatchdog): download_result = result.get('result', {}).get('value', {}) if download_result and download_result.get('data') and len(download_result['data']) > 0: - # Ensure unique filename - downloads_dir = str(self.browser_session.browser_profile.downloads_path) # Ensure downloads directory exists + downloads_dir = str(self.browser_session.browser_profile.downloads_path) os.makedirs(downloads_dir, exist_ok=True) - unique_filename = await self._get_unique_filename(downloads_dir, pdf_filename) - download_path = os.path.join(downloads_dir, unique_filename) + download_path = os.path.join(downloads_dir, final_filename) # Save the PDF asynchronously async with await anyio.open_file(download_path, 'wb') as f: @@ -886,13 +899,16 @@ class DownloadsWatchdog(BaseWatchdog): f'[DownloadsWatchdog] āœ… Auto-downloaded PDF ({cache_status}, {response_size:,} bytes): {download_path}' ) + # Store URL->path mapping for this session + self._session_pdf_urls[pdf_url] = download_path + # Emit file downloaded event - self.logger.debug(f'[DownloadsWatchdog] Dispatching FileDownloadedEvent for {unique_filename}') + self.logger.debug(f'[DownloadsWatchdog] Dispatching FileDownloadedEvent for {final_filename}') self.event_bus.dispatch( FileDownloadedEvent( url=pdf_url, path=download_path, - file_name=unique_filename, + file_name=final_filename, file_size=response_size, file_type='pdf', mime_type='application/pdf', diff --git 
a/browser_use/filesystem/file_system.py b/browser_use/filesystem/file_system.py index c0cb7eaa7..1f4fc3f18 100644 --- a/browser_use/filesystem/file_system.py +++ b/browser_use/filesystem/file_system.py @@ -272,7 +272,7 @@ class FileSystem: reader = pypdf.PdfReader(full_filename) num_pages = len(reader.pages) - MAX_PDF_PAGES = 10 + MAX_PDF_PAGES = 20 extra_pages = num_pages - MAX_PDF_PAGES extracted_text = '' for page in reader.pages[:MAX_PDF_PAGES]: