diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml index 7c738cfa1..165419f26 100644 --- a/.github/workflows/eval.yaml +++ b/.github/workflows/eval.yaml @@ -25,6 +25,19 @@ jobs: LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_PROJECT_API_KEY }} steps: + - name: System Info and Resource Check + run: | + echo "=== SYSTEM INFORMATION ===" + echo "Runner OS: $(uname -a)" + echo "CPU Info: $(nproc) cores" + echo "Memory Info:" + free -h + echo "Disk Space:" + df -h + echo "Load Average:" + uptime + echo "==========================" + - name: Determine ref to checkout id: determine_ref run: | @@ -47,7 +60,10 @@ jobs: activate-environment: true - name: Install dependencies - run: uv sync --extra eval + run: | + echo "Installing dependencies..." + uv sync --extra eval + echo "Dependencies installed successfully" - name: Detect installed Playwright version id: playwright_version @@ -62,13 +78,31 @@ jobs: ${{ runner.os }}-playwright- - name: Install Playwright browser dependencies - run: playwright install --no-shell chromium + run: | + echo "Installing Playwright browsers..." + playwright install --no-shell chromium + echo "Playwright browsers installed successfully" - name: Install Xvfb for headed mode if: github.event.client_payload.script_args.headless == 'false' run: | + echo "Installing Xvfb for headed mode..." sudo apt-get update sudo apt-get install -y xvfb + echo "Xvfb installed successfully" + + - name: Pre-execution Resource Check + run: | + echo "=== PRE-EXECUTION RESOURCE CHECK ===" + echo "Memory usage:" + free -h + echo "CPU load:" + uptime + echo "Disk usage:" + df -h + echo "Process count:" + ps aux | wc -l + echo "=================================" - name: Construct eval command id: eval_command @@ -82,7 +116,7 @@ jobs: DEFAULT_END_INDEX="100" DEFAULT_EVAL_GROUP="PRTests" DEFAULT_HEADLESS="true" - DEFAULT_FRESH_START="true" + DEFAULT_MEMORY_INTERVAL="10" DEFAULT_MAX_ACTIONS_PER_STEP="10" DEFAULT_PLANNER_INTERVAL="1" @@ -113,8 +147,7 @@ jobs: HEADLESS="${{ github.event.client_payload.script_args.headless }}" HEADLESS="${HEADLESS:-$DEFAULT_HEADLESS}" - FRESH_START="${{ github.event.client_payload.script_args.fresh_start }}" - FRESH_START="${FRESH_START:-$DEFAULT_FRESH_START}" + MEMORY_INTERVAL="${{ github.event.client_payload.script_args.memory_interval }}" MEMORY_INTERVAL="${MEMORY_INTERVAL:-$DEFAULT_MEMORY_INTERVAL}" @@ -133,6 +166,7 @@ jobs: DEVELOPER_ID="${{ github.event.client_payload.script_args.developer_id }}" PLANNER_MODEL="${{ github.event.client_payload.script_args.planner_model }}" RUN_ID="${{ github.event.client_payload.script_args.run_id }}" + LAMINAR_EVAL_ID="${{ github.event.client_payload.script_args.laminar_eval_id }}" # Build command using array for cleaner construction CMD_ARGS=( @@ -143,7 +177,7 @@ jobs: "--max-steps" "$MAX_STEPS" "--start" "$START_INDEX" "--end" "$END_INDEX" - "--fresh-start" "$FRESH_START" + "--eval-group" "$EVAL_GROUP" "--memory-interval" "$MEMORY_INTERVAL" "--max-actions-per-step" "$MAX_ACTIONS_PER_STEP" @@ -158,12 +192,15 @@ jobs: [[ "${{ github.event.client_payload.script_args.enable_memory }}" == "true" ]] && CMD_ARGS+=("--enable-memory") [[ "${{ github.event.client_payload.script_args.validate_output }}" == "true" ]] && CMD_ARGS+=("--validate-output") [[ "${{ github.event.client_payload.script_args.include_result }}" == "true" ]] && CMD_ARGS+=("--include-result") + [[ "${{ github.event.client_payload.script_args.highlight_elements }}" == "false" ]] && CMD_ARGS+=("--no-highlight-elements") + [[ "${{ 
github.event.client_payload.script_args.use_mind2web_judge }}" == "true" ]] && CMD_ARGS+=("--use-mind2web-judge") # Add optional string parameters [[ -n "$USER_MESSAGE" ]] && CMD_ARGS+=("--user-message" "$USER_MESSAGE") [[ -n "$DEVELOPER_ID" ]] && CMD_ARGS+=("--developer-id" "$DEVELOPER_ID") [[ -n "$PLANNER_MODEL" ]] && CMD_ARGS+=("--planner-model" "$PLANNER_MODEL") [[ -n "$RUN_ID" ]] && CMD_ARGS+=("--run-id" "$RUN_ID") + [[ -n "$LAMINAR_EVAL_ID" ]] && CMD_ARGS+=("--laminar-eval-id" "$LAMINAR_EVAL_ID") # Convert array to command string with proper escaping printf -v CMD_STRING '%q ' "${CMD_ARGS[@]}" @@ -176,5 +213,179 @@ jobs: echo "FULL_COMMAND=$CMD_STRING" >> $GITHUB_OUTPUT echo "::notice title=Eval Command::$CMD_STRING" + - name: Start Resource Monitoring + run: | + echo "Starting background resource monitoring..." + # Create a background script that monitors resources every 30 seconds + cat > monitor_resources.sh << 'EOF' + #!/bin/bash + while true; do + echo "=== RESOURCE MONITOR $(date) ===" + echo "Memory:" + free -h + echo "CPU Load:" + uptime + echo "Top processes by CPU:" + ps aux --sort=-%cpu | head -10 + echo "Top processes by Memory:" + ps aux --sort=-%mem | head -10 + echo "Chrome/Chromium processes:" + ps aux | grep -i chrome | grep -v grep || echo "No Chrome processes found" + echo "Python processes:" + ps aux | grep python | grep -v grep || echo "No Python processes found" + echo "==================================" + sleep 30 + done + EOF + chmod +x monitor_resources.sh + # Start the monitor in background and save PID + nohup ./monitor_resources.sh > resource_monitor.log 2>&1 & + echo $! > monitor_pid.txt + echo "Resource monitoring started with PID: $(cat monitor_pid.txt)" + - name: Run evaluation script - run: ${{ steps.eval_command.outputs.FULL_COMMAND }} + id: run_eval + run: | + echo "=== STARTING EVALUATION ===" + echo "Command: ${{ steps.eval_command.outputs.FULL_COMMAND }}" + echo "Starting time: $(date)" + echo "Use Mind2Web Judge: ${{ github.event.client_payload.script_args.use_mind2web_judge }}" + echo "============================" + + # Set up signal handlers and run the command + set -e + trap 'echo "=== EVALUATION INTERRUPTED ==="; echo "Time: $(date)"; echo "Last 50 lines of output:"; tail -50 eval_output.log; exit 130' INT TERM + + # Run the evaluation with output capture and better error handling + set +e # Don't exit on errors, we want to capture them + ${{ steps.eval_command.outputs.FULL_COMMAND }} 2>&1 | tee eval_output.log + EVAL_EXIT_CODE=${PIPESTATUS[0]} + set -e # Re-enable exit on error + + echo "=== EVALUATION COMPLETED ===" + echo "Exit code: $EVAL_EXIT_CODE" + echo "Completion time: $(date)" + echo "============================" + + # Show last part of log for context + if [ $EVAL_EXIT_CODE -ne 0 ]; then + echo "=== EVALUATION FAILED - LAST 100 LINES OF OUTPUT ===" + tail -100 eval_output.log + echo "==================================================" + fi + + exit $EVAL_EXIT_CODE + + - name: Post-execution Resource Check + if: always() + run: | + echo "=== POST-EXECUTION RESOURCE CHECK ===" + echo "Memory usage:" + free -h + echo "CPU load:" + uptime + echo "Disk usage:" + df -h + echo "Process count:" + ps aux | wc -l + echo "Chrome/Chromium processes still running:" + ps aux | grep -i chrome | grep -v grep || echo "No Chrome processes found" + echo "Python processes still running:" + ps aux | grep python | grep -v grep || echo "No Python processes found" + echo "===================================" + + - name: Stop Resource Monitoring and 
Collect Logs + if: always() + run: | + echo "Stopping resource monitoring..." + if [ -f monitor_pid.txt ]; then + MONITOR_PID=$(cat monitor_pid.txt) + if kill -0 $MONITOR_PID 2>/dev/null; then + kill $MONITOR_PID + echo "Resource monitor stopped" + else + echo "Resource monitor was already stopped" + fi + fi + + echo "=== RESOURCE MONITORING LOG ===" + if [ -f resource_monitor.log ]; then + tail -100 resource_monitor.log + else + echo "No resource monitor log found" + fi + echo "===============================" + + - name: Collect Debug Information + if: always() + run: | + echo "=== COLLECTING DEBUG INFORMATION ===" + + # System information + echo "Final system state:" + uptime + free -h + df -h + + # Process information + echo "All running processes:" + ps aux --sort=-%cpu | head -20 + + # Check for core dumps + echo "Checking for core dumps:" + find . -name "core*" -type f 2>/dev/null || echo "No core dumps found" + + # Check for any crash logs + echo "Checking for crash logs:" + find . -name "*crash*" -type f 2>/dev/null || echo "No crash logs found" + + # Check kernel messages for OOM kills + echo "Checking for OOM kills in kernel messages:" + sudo dmesg | grep -i "killed process" | tail -10 || echo "No OOM kills found" + + # Check evaluation output + echo "Last 100 lines of evaluation output:" + if [ -f eval_output.log ]; then + tail -100 eval_output.log + else + echo "No evaluation output log found" + fi + + # Check for saved trajectories + echo "Saved trajectories directory:" + if [ -d saved_trajectories ]; then + find saved_trajectories -type f -name "*.json" | head -10 + echo "Total trajectory files: $(find saved_trajectories -type f -name "*.json" | wc -l)" + else + echo "No saved_trajectories directory found" + fi + + echo "====================================" + + - name: Upload Debug Artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: debug-logs-${{ github.run_id }} + path: | + eval_output.log + resource_monitor.log + saved_trajectories/ + retention-days: 7 + + - name: Final Status Summary + if: always() + run: | + echo "=== FINAL STATUS SUMMARY ===" + echo "Workflow run ID: ${{ github.run_id }}" + echo "Job completion time: $(date)" + echo "Evaluation step status: ${{ steps.run_eval.outcome }}" + + if [ "${{ steps.run_eval.outcome }}" != "success" ]; then + echo "❌ Evaluation failed or was interrupted" + echo "Check the debug artifacts and logs above for more information" + else + echo "✅ Evaluation completed successfully" + fi + + echo "===========================" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 02cd0b0dd..43a878378 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -39,4 +39,5 @@ jobs: - uses: astral-sh/setup-uv@v6 with: enable-cache: true + - run: uv sync --dev --all-extras # install extras for examples to avoid pyright missing imports errors - run: uv run pyright diff --git a/CLAUDE.md b/CLAUDE.md index 74263642e..87a2365c0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -5,20 +5,22 @@ We want our library APIs to be ergonomic, intuitive, and hard to get wrong. - Use async python - Use tabs for indentation in all python code, not spaces -- Use the modern python >3.12 typing style, e.g. use `str | None` instead of `Optional[str]`, and `list[str]` instead of `List[str]` +- Use the modern python >3.12 typing style, e.g. 
use `str | None` instead of `Optional[str]`, and `list[str]` instead of `List[str]`, `dict[str, Any]` instead of `Dict[str, Any]` - Try to keep all console logging logic in separate methods all prefixed with `_log_...`, e.g. `def _log_pretty_path(path: Path) -> str` so as not to clutter up the main logic. - Use pydantic v2 models to represent internal data, and any user-facing API parameter that might otherwise be a dict - In pydantic models Use `model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True, ...)` etc. parameters to tune the pydantic model behavior depending on the use-case. Use `Annotated[..., AfterValidator(...)]` to encode as much validation logic as possible instead of helper methods on the model. - We keep the main code for each sub-component in a `service.py` file usually, and we keep most pydantic models in `views.py` files unless they are long enough deserve their own file - Use runtime assertions at the start and end of functions to enforce constraints and assumptions - Prefer `from uuid_extensions import uuid7str` + `id: str = Field(default_factory=uuid7str)` for all new id fields +- Run tests using `uv run pytest -vxs tests/ci` +- Run the type checker using `uv run pyright` ## Keep Examples & Tests Up-To-Date - Make sure to read relevant examples in the `examples/` directory for context and keep them up-to-date when making changes. - Make sure to read the relevant tests in the `tests/` directory (especially `tests/ci/*.py`) and keep them up-to-date as well. - Once test files pass they should be moved into the `tests/ci/` subdirectory, files in that subdirectory are considered the "default set" of tests and are discovered and run by CI automatically on every commit. -- Try to almost never use mocks in tests, instead use pytest fixtures to set up real objects +- Never use mocks in tests other than for the llm, instead use pytest fixtures to set up real objects and pytest-httpserver - Never use real remote URLs in tests (e.g. `https://google.com` or `https://example.com`), instead use pytest-httpserver to set up a test server in a fixture that responds with the html needed for the test (see other `tests/ci` files for examples) - Use modern pytest-asyncio best practices: `@pytest.mark.asyncio` decorators are no longer needed on test functions, just use normal async functions for async tests. Use `loop = asyncio.get_event_loop()` inside tests that need it instead of passing `event_loop` as a function argument. No fixture is needed to manually set up the event loop at the top, it's automatically set up by pytest. Fixture functions (even async ones) only need a simple `@pytest.fixture` decorator with no arguments. diff --git a/bin/lint.sh b/bin/lint.sh index 8a6029dbb..214ea2535 100755 --- a/bin/lint.sh +++ b/bin/lint.sh @@ -9,4 +9,5 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" cd "$SCRIPT_DIR/.." 
|| exit 1 -exec uv run pre-commit run --all-files +uv run pre-commit run --all-files +exec pyright diff --git a/browser_use/agent/cloud_events.py b/browser_use/agent/cloud_events.py index 0a507323a..5b1092fa9 100644 --- a/browser_use/agent/cloud_events.py +++ b/browser_use/agent/cloud_events.py @@ -5,8 +5,8 @@ from pydantic import Field, field_validator from uuid_extensions import uuid7str MAX_STRING_LENGTH = 100000 # 100K chars ~ 25k tokens should be enough -MAX_URL_LENGTH = 10000 -MAX_TASK_LENGTH = 10000 +MAX_URL_LENGTH = 100000 +MAX_TASK_LENGTH = 100000 MAX_COMMENT_LENGTH = 2000 MAX_FILE_CONTENT_SIZE = 50 * 1024 * 1024 # 50MB @@ -41,6 +41,9 @@ class UpdateAgentTaskEvent(BaseEvent): done_output=done_output, finished_at=datetime.now(timezone.utc) if agent.state.history and agent.state.history.is_done() else None, agent_state=agent.state.model_dump() if hasattr(agent.state, 'model_dump') else {}, + user_feedback_type=None, + user_comment=None, + gif_url=None, # user_feedback_type and user_comment would be set by the API/frontend # gif_url would be set after GIF generation if needed ) @@ -192,6 +195,9 @@ class CreateAgentTaskEvent(BaseEvent): done_output=None, started_at=datetime.fromtimestamp(agent._task_start_time, tz=timezone.utc), finished_at=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, ) diff --git a/browser_use/agent/gif.py b/browser_use/agent/gif.py index c83aa30cd..e08e17924 100644 --- a/browser_use/agent/gif.py +++ b/browser_use/agent/gif.py @@ -8,6 +8,7 @@ import platform from typing import TYPE_CHECKING from browser_use.agent.views import AgentHistoryList +from browser_use.config import CONFIG if TYPE_CHECKING: from PIL import Image, ImageFont @@ -80,7 +81,7 @@ def create_history_gif( try: if platform.system() == 'Windows': # Need to specify the abs font path on Windows - font_name = os.path.join(os.getenv('WIN_FONT_DIR', 'C:\\Windows\\Fonts'), font_name + '.ttf') + font_name = os.path.join(CONFIG.WIN_FONT_DIR, font_name + '.ttf') regular_font = ImageFont.truetype(font_name, font_size) title_font = ImageFont.truetype(font_name, title_font_size) goal_font = ImageFont.truetype(font_name, goal_font_size) diff --git a/browser_use/agent/memory/service.py b/browser_use/agent/memory/service.py index 34b100bc2..a57fdc4de 100644 --- a/browser_use/agent/memory/service.py +++ b/browser_use/agent/memory/service.py @@ -15,6 +15,7 @@ from langchain_core.messages.utils import convert_to_openai_messages from browser_use.agent.memory.views import MemoryConfig from browser_use.agent.message_manager.service import MessageManager from browser_use.agent.message_manager.views import ManagedMessage, MessageMetadata +from browser_use.config import CONFIG from browser_use.utils import time_execution_sync @@ -65,7 +66,7 @@ class Memory: # Check for required packages try: # also disable mem0's telemetry when ANONYMIZED_TELEMETRY=False - if os.getenv('ANONYMIZED_TELEMETRY', 'true').lower()[0] in 'fn0': + if not CONFIG.ANONYMIZED_TELEMETRY: os.environ['MEM0_TELEMETRY'] = 'False' from mem0 import Memory as Mem0Memory except ImportError: diff --git a/browser_use/agent/memory/views.py b/browser_use/agent/memory/views.py index 670c60fce..2a52adc0c 100644 --- a/browser_use/agent/memory/views.py +++ b/browser_use/agent/memory/views.py @@ -80,7 +80,7 @@ class MemoryConfig(BaseModel): Returns the vector store configuration dictionary for Mem0, tailored to the selected provider. 
""" - provider_specific_config = {'embedding_model_dims': self.embedder_dims} + provider_specific_config: dict[str, Any] = {'embedding_model_dims': self.embedder_dims} # --- Default collection_name handling --- if self.vector_store_collection_name: @@ -167,7 +167,7 @@ class MemoryConfig(BaseModel): } @property - def full_config_dict(self) -> dict[str, dict[str, Any]]: + def full_config_dict(self) -> dict[str, Any]: """Returns the complete configuration dictionary for Mem0.""" return { 'embedder': self.embedder_config_dict, diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index ce981b501..d52e0b45a 100644 --- a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -13,7 +13,7 @@ from langchain_core.messages import ( ) from pydantic import BaseModel -from browser_use.agent.message_manager.views import MessageMetadata +from browser_use.agent.message_manager.views import ManagedMessage, MessageMetadata from browser_use.agent.prompts import AgentMessagePrompt from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo, MessageManagerState from browser_use.browser.views import BrowserStateSummary @@ -109,7 +109,7 @@ def _log_extract_message_content(message: BaseMessage, is_last_message: bool, me cleaned_content = _log_clean_whitespace(str(message.content)) # Handle AIMessages with tool calls - if hasattr(message, 'tool_calls') and message.tool_calls and not cleaned_content: + if isinstance(message, AIMessage) and hasattr(message, 'tool_calls') and message.tool_calls and not cleaned_content: tool_call = message.tool_calls[0] tool_name = tool_call.get('name', 'unknown') @@ -117,7 +117,7 @@ def _log_extract_message_content(message: BaseMessage, is_last_message: bool, me # Skip formatting for init example messages if metadata and metadata.message_type == 'init': return '[Example AgentOutput]' - content = _log_format_agent_output_content(tool_call) + content = _log_format_agent_output_content(dict(tool_call)) # Convert ToolCall to dict else: content = f'[TOOL: {tool_name}]' else: @@ -141,9 +141,12 @@ def _log_format_message_line( lines = [] # Get emoji and token info - message_type = message_with_metadata.message.__class__.__name__ - emoji = _log_get_message_emoji(message_type) - token_str = str(message_with_metadata.metadata.tokens).rjust(4) + if isinstance(message_with_metadata, ManagedMessage): + message_type = message_with_metadata.message.__class__.__name__ + emoji = _log_get_message_emoji(message_type) + token_str = str(message_with_metadata.metadata.tokens).rjust(4) + else: + return ['❓[ ?]: [Invalid message format]'] prefix = f'{emoji}[{token_str}]: ' # Calculate available width (emoji=2 visual cols + [token]: =8 chars) @@ -201,6 +204,7 @@ class MessageManager: task: str, system_message: SystemMessage, file_system: FileSystem, + available_file_paths: list[str] | None = None, settings: MessageManagerSettings = MessageManagerSettings(), state: MessageManagerState = MessageManagerState(), ): @@ -209,9 +213,10 @@ class MessageManager: self.state = state self.system_prompt = system_message self.file_system = file_system - self.agent_history_description = 'Agent initialized.\n' + self.agent_history_description = 'Agent initialized\n' self.read_state_description = '' self.sensitive_data_description = '' + self.available_file_paths = available_file_paths # Only initialize messages if state is empty if len(self.state.history.messages) == 0: self._init_messages() @@ -340,15 +345,9 @@ My 
next action is to click on the iPhone link at index [4] to navigate to Apple' # self._add_message_with_tokens(example_tool_call_2, message_type='init') # self.add_tool_message(content='Clicked on index [4]. ', message_type='init') - if self.settings.available_file_paths: - filepaths_msg = HumanMessage( - content=f'Here are file paths you can use: {self.settings.available_file_paths}' - ) - self._add_message_with_tokens(filepaths_msg, message_type='init') - def add_new_task(self, new_task: str) -> None: self.task = new_task - self.agent_history_description += f'\nUser updated USER REQUEST to: {new_task}\n' + self.agent_history_description += f'\nUser updated USER REQUEST to: {new_task}\n' def _update_agent_history_description( self, @@ -362,8 +361,7 @@ My next action is to click on the iPhone link at index [4] to navigate to Apple' result = [] step_number = step_info.step_number if step_info else 'unknown' - self.read_state_initialization = 'This is displayed only **one time**, save this information if you need it later.\n' - self.read_state_description = self.read_state_initialization + self.read_state_description = '' action_results = '' result_len = len(result) @@ -373,36 +371,36 @@ My next action is to click on the iPhone link at index [4] to navigate to Apple' logger.debug(f'Added extracted_content to read_state_description: {action_result.extracted_content}') if action_result.long_term_memory: - action_results += f'Action {idx + 1}/{result_len} response: {action_result.long_term_memory}\n' + action_results += f'Action {idx + 1}/{result_len}: {action_result.long_term_memory}\n' logger.debug(f'Added long_term_memory to action_results: {action_result.long_term_memory}') elif action_result.extracted_content and not action_result.include_extracted_content_only_once: - action_results += f'Action {idx + 1}/{result_len} response: {action_result.extracted_content}\n' + action_results += f'Action {idx + 1}/{result_len}: {action_result.extracted_content}\n' logger.debug(f'Added extracted_content to action_results: {action_result.extracted_content}') if action_result.error: - action_results += f'Action {idx + 1}/{result_len} response: {action_result.error[:200]}\n' + action_results += f'Action {idx + 1}/{result_len}: {action_result.error[:200]}\n' logger.debug(f'Added error to action_results: {action_result.error[:200]}') + if action_results: + action_results = f'Action Results:\n{action_results}' + action_results = action_results.strip('\n') + # Handle case where model_output is None (e.g., parsing failed) if model_output is None: - if step_number > 0: - self.agent_history_description += f"""## Step {step_number} -No model output (parsing failed) -{action_results} + if isinstance(step_number, int) and step_number > 0: + self.agent_history_description += f""" +Agent failed to output in the right format. 
+ """ else: - self.agent_history_description += f"""## Step {step_number} -Step evaluation: {model_output.current_state.evaluation_previous_goal} -Step memory: {model_output.current_state.memory} -Step goal: {model_output.current_state.next_goal} + self.agent_history_description += f""" +Evaluation of Previous Step: {model_output.current_state.evaluation_previous_goal} +Memory: {model_output.current_state.memory} +Next Goal: {model_output.current_state.next_goal} {action_results} + """ - if self.read_state_description == self.read_state_initialization: - self.read_state_description = '' - else: - self.read_state_description += '\nMAKE SURE TO SAVE THIS INFORMATION INTO A FILE OR TO MEMORY IF YOU NEED IT LATER.' - def _get_sensitive_data_description(self, current_page_url) -> str: sensitive_data = self.settings.sensitive_data if not sensitive_data: @@ -454,6 +452,7 @@ Step goal: {model_output.current_state.next_goal} step_info=step_info, page_filtered_actions=page_filtered_actions, sensitive_data=self.sensitive_data_description, + available_file_paths=self.available_file_paths, ).get_user_message(use_vision) self._add_message_with_tokens(state_message) diff --git a/browser_use/agent/message_manager/tests.py b/browser_use/agent/message_manager/tests.py deleted file mode 100644 index d8293788e..000000000 --- a/browser_use/agent/message_manager/tests.py +++ /dev/null @@ -1,246 +0,0 @@ -import pytest -from langchain_anthropic import ChatAnthropic -from langchain_core.messages import AIMessage, HumanMessage, SystemMessage -from langchain_openai import AzureChatOpenAI, ChatOpenAI - -from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings -from browser_use.agent.views import ActionResult -from browser_use.browser.views import BrowserStateSummary, TabInfo -from browser_use.dom.views import DOMElementNode, DOMTextNode -from browser_use.filesystem.file_system import FileSystem - - -@pytest.fixture( - params=[ - ChatOpenAI(model='gpt-4o-mini'), - AzureChatOpenAI(model='gpt-4o', api_version='2024-02-15-preview'), - ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=100, temperature=0.0, stop=None), - ], - ids=['gpt-4o-mini', 'gpt-4o', 'claude-3-5-sonnet'], -) -def message_manager(request: pytest.FixtureRequest): - task = 'Test task' - action_descriptions = 'Test actions' - - import os - import tempfile - import uuid - - base_tmp = tempfile.gettempdir() # e.g., /tmp on Unix - file_system_path = os.path.join(base_tmp, str(uuid.uuid4())) - return MessageManager( - task=task, - system_message=SystemMessage(content=action_descriptions), - settings=MessageManagerSettings( - max_input_tokens=1000, - estimated_characters_per_token=3, - image_tokens=800, - ), - file_system=FileSystem(file_system_path), - ) - - -def test_initial_messages(message_manager: MessageManager): - """Test that message manager initializes with system and task messages""" - messages = message_manager.get_messages() - assert len(messages) == 2 - assert isinstance(messages[0], SystemMessage) - assert isinstance(messages[1], HumanMessage) - assert 'Test task' in messages[1].content - - -def test_add_state_message(message_manager: MessageManager): - """Test adding browser state message""" - state = BrowserStateSummary( - url='https://test.com', - title='Test Page', - element_tree=DOMElementNode( - tag_name='div', - attributes={}, - children=[], - is_visible=True, - parent=None, - xpath='//div', - ), - selector_map={}, - tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')], - ) - 
message_manager.add_state_message(browser_state_summary=state) - - messages = message_manager.get_messages() - assert len(messages) == 3 - assert isinstance(messages[2], HumanMessage) - assert 'https://test.com' in messages[2].content - - -def test_add_state_with_memory_result(message_manager: MessageManager): - """Test adding state with result that should be included in memory""" - state = BrowserStateSummary( - url='https://test.com', - title='Test Page', - element_tree=DOMElementNode( - tag_name='div', - attributes={}, - children=[], - is_visible=True, - parent=None, - xpath='//div', - ), - selector_map={}, - tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')], - ) - result = ActionResult(extracted_content='Important content', include_in_memory=True) - - message_manager.add_state_message(browser_state_summary=state, result=[result]) - messages = message_manager.get_messages() - - # Should have system, task, extracted content, and state messages - assert len(messages) == 4 - assert 'Important content' in messages[2].content - assert isinstance(messages[2], HumanMessage) - assert isinstance(messages[3], HumanMessage) - assert 'Important content' not in messages[3].content - - -def test_add_state_with_non_memory_result(message_manager: MessageManager): - """Test adding state with result that should not be included in memory""" - state = BrowserStateSummary( - url='https://test.com', - title='Test Page', - element_tree=DOMElementNode( - tag_name='div', - attributes={}, - children=[], - is_visible=True, - parent=None, - xpath='//div', - ), - selector_map={}, - tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')], - ) - result = ActionResult(extracted_content='Temporary content', include_in_memory=False) - - message_manager.add_state_message(browser_state_summary=state, result=[result]) - messages = message_manager.get_messages() - - # Should have system, task, and combined state+result message - assert len(messages) == 3 - assert 'Temporary content' in messages[2].content - assert isinstance(messages[2], HumanMessage) - - -@pytest.mark.skip('not sure how to fix this') -@pytest.mark.parametrize('max_tokens', [100000, 10000, 5000]) -def test_token_overflow_handling_with_real_flow(message_manager: MessageManager, max_tokens): - """Test handling of token overflow in a realistic message flow""" - # Set more realistic token limit - message_manager.settings.max_input_tokens = max_tokens - - # Create a long sequence of interactions - for i in range(200): # Simulate 40 steps of interaction - # Create state with varying content length - state = BrowserStateSummary( - url=f'https://test{i}.com', - title=f'Test Page {i}', - element_tree=DOMElementNode( - tag_name='div', - attributes={}, - children=[ - DOMTextNode( - text=f'Content {j} ' * (10 + i), # Increasing content length - is_visible=True, - parent=None, - ) - for j in range(5) # Multiple DOM items - ], - is_visible=True, - parent=None, - xpath='//div', - ), - selector_map={j: f'//div[{j}]' for j in range(5)}, - tabs=[TabInfo(page_id=1, url=f'https://test{i}.com', title=f'Test Page {i}')], - ) - - # Alternate between different types of results - result = None - if i % 2 == 0: # Every other iteration - result = ActionResult( - extracted_content=f'Important content from step {i}' * 5, - include_in_memory=i % 4 == 0, # Include in memory every 4th message - ) - - # Add state message - if result: - message_manager.add_state_message(browser_state_summary=state, result=[result]) - else: - 
message_manager.add_state_message(browser_state_summary=state) - - try: - messages = message_manager.get_messages() - except ValueError as e: - if 'Max token limit reached - history is too long' in str(e): - return # If error occurs, end the test - else: - raise e - - assert message_manager.state.history.current_tokens <= message_manager.settings.max_input_tokens + 100 - - last_msg = messages[-1] - assert isinstance(last_msg, HumanMessage) - - if i % 4 == 0: - assert isinstance(message_manager.state.history.messages[-2].message, HumanMessage) - if i % 2 == 0 and not i % 4 == 0: - if isinstance(last_msg.content, list): - assert 'Current url: https://test' in last_msg.content[0]['text'] - else: - assert 'Current url: https://test' in last_msg.content - - # Add model output every time - from browser_use.agent.views import AgentBrain, AgentOutput - from browser_use.controller.registry.views import ActionModel - - output = AgentOutput( - current_state=AgentBrain( - evaluation_previous_goal=f'Success in step {i}', - memory=f'Memory from step {i}', - next_goal=f'Goal for step {i + 1}', - ), - action=[ActionModel()], - ) - message_manager._remove_last_state_message() - message_manager.add_model_output(output) - - # Get messages and verify after each addition - messages = [m.message for m in message_manager.state.history.messages] - - # Verify token limit is respected - - # Verify essential messages are preserved - assert isinstance(messages[0], SystemMessage) # System prompt always first - assert isinstance(messages[1], HumanMessage) # Task always second - assert 'Test task' in messages[1].content - - # Verify structure of latest messages - assert isinstance(messages[-1], AIMessage) # Last message should be model output - assert f'step {i}' in messages[-1].content # Should contain current step info - - # Log token usage for debugging - token_usage = message_manager.state.history.current_tokens - token_limit = message_manager.settings.max_input_tokens - # print(f'Step {i}: Using {token_usage}/{token_limit} tokens') - - # go through all messages and verify that the token count and total tokens is correct - total_tokens = 0 - real_tokens = [] - stored_tokens = [] - for msg in message_manager.state.history.messages: - total_tokens += msg.metadata.tokens - stored_tokens.append(msg.metadata.tokens) - real_tokens.append(message_manager._count_tokens(msg.message)) - assert total_tokens == sum(real_tokens) - assert stored_tokens == real_tokens - assert message_manager.state.history.current_tokens == total_tokens - - -# pytest -s browser_use/agent/message_manager/tests.py diff --git a/browser_use/agent/message_manager/utils.py b/browser_use/agent/message_manager/utils.py index 55eae8fd5..beae1856c 100644 --- a/browser_use/agent/message_manager/utils.py +++ b/browser_use/agent/message_manager/utils.py @@ -31,42 +31,50 @@ def is_model_without_tool_support(model_name: str) -> bool: def extract_json_from_model_output(content: str | BaseMessage) -> dict: """Extract JSON from model output, handling both plain JSON and code-block-wrapped JSON.""" try: + # Extract string content from BaseMessage if needed + content_str: str if isinstance(content, BaseMessage): # for langchain_core.messages.BaseMessage - content = content.content + msg_content = content.content + if isinstance(msg_content, list): + content_str = str(msg_content[0]) if msg_content else '' + else: + content_str = msg_content + else: + content_str = content # If content is wrapped in code blocks, extract just the JSON part - if '```' in content: + if 
'```' in content_str: # Find the JSON content between code blocks - content = content.split('```')[1] + content_str = content_str.split('```')[1] # Remove language identifier if present (e.g., 'json\n') - if '\n' in content: - content = content.split('\n', 1)[1] + if '\n' in content_str: + content_str = content_str.split('\n', 1)[1] # remove html-like tags before the first { and after the last } # This handles cases like <|header_start|>assistant<|header_end|> and # Only remove content before { if content doesn't already start with { - if not content.strip().startswith('{'): - content = re.sub(r'^.*?(?=\{)', '', content, flags=re.DOTALL) + if not content_str.strip().startswith('{'): + content_str = re.sub(r'^.*?(?=\{)', '', content_str, flags=re.DOTALL) # Remove common HTML-like tags and patterns at the end, but be more conservative # Look for patterns like , <|header_start|>, etc. after the JSON - content = re.sub(r'\}(\s*<[^>]*>.*?$)', '}', content, flags=re.DOTALL) - content = re.sub(r'\}(\s*<\|[^|]*\|>.*?$)', '}', content, flags=re.DOTALL) + content_str = re.sub(r'\}(\s*<[^>]*>.*?$)', '}', content_str, flags=re.DOTALL) + content_str = re.sub(r'\}(\s*<\|[^|]*\|>.*?$)', '}', content_str, flags=re.DOTALL) # Handle extra characters after the JSON, including stray braces # Find the position of the last } that would close the main JSON object - content = content.strip() + content_str = content_str.strip() - if content.endswith('}'): + if content_str.endswith('}'): # Try to parse and see if we get valid JSON try: - json.loads(content) + json.loads(content_str) except json.JSONDecodeError: # If parsing fails, try to find the correct end of the JSON # by counting braces and removing anything after the balanced JSON brace_count = 0 last_valid_pos = -1 - for i, char in enumerate(content): + for i, char in enumerate(content_str): if char == '{': brace_count += 1 elif char == '}': @@ -76,14 +84,14 @@ def extract_json_from_model_output(content: str | BaseMessage) -> dict: break if last_valid_pos > 0: - content = content[:last_valid_pos] + content_str = content_str[:last_valid_pos] # Fix control characters in JSON strings before parsing # This handles cases where literal control characters appear in JSON values - content = _fix_control_characters_in_json(content) + content_str = _fix_control_characters_in_json(content_str) # Parse the cleaned content - result_dict = json.loads(content) + result_dict = json.loads(content_str) # if the key "function" and parameter key like "params"/"args"/"kwargs"/"parameters" are present, the final result is the value of the parameter key if 'function' in result_dict: diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py index a7106f587..6f12b89de 100644 --- a/browser_use/agent/prompts.py +++ b/browser_use/agent/prompts.py @@ -36,7 +36,7 @@ class SystemPrompt: """Load the prompt template from the markdown file.""" try: # This works both in development and when installed as a package - with importlib.resources.files('browser_use.agent').joinpath('system_prompt.md').open('r') as f: + with importlib.resources.files('browser_use.agent').joinpath('system_prompt.md').open('r', encoding='utf-8') as f: self.prompt_template = f.read() except Exception as e: raise RuntimeError(f'Failed to load system prompt template: {e}') @@ -73,9 +73,10 @@ class AgentMessagePrompt: page_filtered_actions: str | None = None, max_clickable_elements_length: int = 40000, sensitive_data: str | None = None, + available_file_paths: list[str] | None = None, ): self.browser_state: 
'BrowserStateSummary' = browser_state_summary - self.file_system: 'FileSystem' | None = file_system + self.file_system: 'FileSystem | None' = file_system self.agent_history_description: str | None = agent_history_description self.read_state_description: str | None = read_state_description self.task: str | None = task @@ -84,6 +85,7 @@ class AgentMessagePrompt: self.page_filtered_actions: str | None = page_filtered_actions self.max_clickable_elements_length: int = max_clickable_elements_length self.sensitive_data: str | None = sensitive_data + self.available_file_paths: list[str] | None = available_file_paths assert self.browser_state def _get_browser_state_description(self) -> str: @@ -143,7 +145,7 @@ Interactive elements from top layer of the current page inside the viewport{trun time_str = datetime.now().strftime('%Y-%m-%d %H:%M') step_info_description += f'Current date and time: {time_str}' - todo_contents = self.file_system.get_todo_contents() + todo_contents = self.file_system.get_todo_contents() if self.file_system else '' if not len(todo_contents): todo_contents = '[Current todo.md is empty, fill it with your plan when applicable]' @@ -152,7 +154,7 @@ Interactive elements from top layer of the current page inside the viewport{trun {self.task} -{self.file_system.describe()} +{self.file_system.describe() if self.file_system else 'No file system available'} {todo_contents} @@ -162,13 +164,23 @@ Interactive elements from top layer of the current page inside the viewport{trun agent_state += f'\n{self.sensitive_data}\n\n' agent_state += f'\n{step_info_description}\n\n' + if self.available_file_paths: + agent_state += '\n' + '\n'.join(self.available_file_paths) + '\n\n' return agent_state def get_user_message(self, use_vision: bool = True) -> HumanMessage: - state_description = '\n' + self.agent_history_description.strip('\n') + '\n\n' + state_description = ( + '\n' + + (self.agent_history_description.strip('\n') if self.agent_history_description else '') + + '\n\n' + ) state_description += '\n' + self._get_agent_state_description().strip('\n') + '\n\n' state_description += '\n' + self._get_browser_state_description().strip('\n') + '\n\n' - state_description += '\n' + self.read_state_description.strip('\n') + '\n\n' + state_description += ( + '\n' + + (self.read_state_description.strip('\n') if self.read_state_description else '') + + '\n\n' + ) if self.page_filtered_actions: state_description += 'For this page, these additional actions are available:\n' state_description += self.page_filtered_actions + '\n' diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 162d2b2ea..ce4b3e8e9 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -17,6 +17,15 @@ from typing import Any, Generic, TypeVar from dotenv import load_dotenv +load_dotenv() + +# from lmnr.sdk.decorators import observe +from bubus import EventBus +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage +from pydantic import BaseModel, ValidationError +from uuid_extensions import uuid7str + from browser_use.agent.cloud_events import ( CreateAgentOutputFileEvent, CreateAgentSessionEvent, @@ -24,20 +33,6 @@ from browser_use.agent.cloud_events import ( CreateAgentTaskEvent, UpdateAgentTaskEvent, ) - -load_dotenv() - -# from lmnr.sdk.decorators import observe -from bubus import EventBus -from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.messages import ( 
- BaseMessage, - HumanMessage, - SystemMessage, -) -from pydantic import BaseModel, ValidationError -from uuid_extensions import uuid7str - from browser_use.agent.gif import create_history_gif from browser_use.agent.memory import Memory, MemoryConfig from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings @@ -65,11 +60,13 @@ from browser_use.browser import BrowserProfile, BrowserSession from browser_use.browser.session import DEFAULT_BROWSER_PROFILE from browser_use.browser.types import Browser, BrowserContext, Page from browser_use.browser.views import BrowserStateSummary +from browser_use.config import CONFIG from browser_use.controller.registry.views import ActionModel from browser_use.controller.service import Controller from browser_use.dom.history_tree_processor.service import DOMHistoryElement, HistoryTreeProcessor from browser_use.exceptions import LLMException from browser_use.filesystem.file_system import FileSystem +from browser_use.sync import CloudSync from browser_use.telemetry.service import ProductTelemetry from browser_use.telemetry.views import AgentTelemetryEvent from browser_use.utils import ( @@ -82,8 +79,6 @@ from browser_use.utils import ( logger = logging.getLogger(__name__) -SKIP_LLM_API_KEY_VERIFICATION = os.environ.get('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[0] in 'ty1' - def log_response(response: AgentOutput, registry=None, logger=None) -> None: """Utility function to log the model's response.""" @@ -184,6 +179,7 @@ class Agent(Generic[Context]): source: str | None = None, file_system_path: str | None = None, task_id: str | None = None, + cloud_sync: CloudSync | None = None, ): if page_extraction_llm is None: page_extraction_llm = llm @@ -304,6 +300,7 @@ class Agent(Generic[Context]): sensitive_data=sensitive_data, available_file_paths=self.settings.available_file_paths, ), + available_file_paths=self.settings.available_file_paths, state=self.state.message_manager_state, ) @@ -442,18 +439,14 @@ class Agent(Generic[Context]): self.telemetry = ProductTelemetry() # Event bus with WAL persistence - # Default to ~/.config/browseruse/events/{agent_task_id}.jsonl - from browser_use.utils import BROWSER_USE_CONFIG_DIR - - wal_path = BROWSER_USE_CONFIG_DIR / 'events' / f'{self.task_id}.jsonl' + # Default to ~/.config/browseruse/events/{agent_session_id}.jsonl + wal_path = CONFIG.BROWSER_USE_CONFIG_DIR / 'events' / f'{self.session_id}.jsonl' self.eventbus = EventBus(name='Agent', wal_path=wal_path) # Cloud sync service - self.enable_cloud_sync = os.environ.get('BROWSERUSE_CLOUD_SYNC', 'true').lower()[0] in 'ty1' - if self.enable_cloud_sync: - from browser_use.sync import CloudSync - - self.cloud_sync = CloudSync() + self.enable_cloud_sync = CONFIG.BROWSER_USE_CLOUD_SYNC + if self.enable_cloud_sync or cloud_sync is not None: + self.cloud_sync = cloud_sync or CloudSync() # Register cloud sync handler self.eventbus.on('*', self.cloud_sync.handle_event) @@ -501,36 +494,6 @@ class Agent(Generic[Context]): logger.info(f'💾 File system path: {self.file_system_path}') - # if file system is set, add actions to the controller - @self.controller.registry.action('Write content to file_name in file system, use only .md or .txt extensions.') - async def write_file(file_name: str, content: str): - result = await self.file_system.write_file(file_name, content) - logger.info(f'💾 {result}') - return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result) - - @self.controller.registry.action('Append content to 
file_name in file system') - async def append_file(file_name: str, content: str): - result = await self.file_system.append_file(file_name, content) - logger.info(f'💾 {result}') - return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result) - - @self.controller.registry.action('Read file_name from file system') - async def read_file(file_name: str): - result = await self.file_system.read_file(file_name) - max_len = 50 - if len(result) > max_len: - display_result = result[:max_len] + '\n...' - else: - display_result = result - logger.info(f'💾 {display_result}') - memory = result.split('\n')[-1] - return ActionResult( - extracted_content=result, - include_in_memory=True, - long_term_memory=memory, - include_extracted_content_only_once=True, - ) - def _set_message_context(self) -> str | None: if self.tool_calling_method == 'raw': # For raw tool calling, only include actions with no filters initially @@ -819,7 +782,7 @@ class Agent(Generic[Context]): # If a specific method is set, use it if self.settings.tool_calling_method != 'auto': # Skip test if already verified - if getattr(self.llm, '_verified_api_keys', None) is True or SKIP_LLM_API_KEY_VERIFICATION: + if getattr(self.llm, '_verified_api_keys', None) is True or CONFIG.SKIP_LLM_API_KEY_VERIFICATION: setattr(self.llm, '_verified_api_keys', True) setattr(self.llm, '_verified_tool_calling_method', self.settings.tool_calling_method) return self.settings.tool_calling_method @@ -847,7 +810,7 @@ class Agent(Generic[Context]): known_method = self._get_known_tool_calling_method() if known_method is not None: # Trust known combinations without testing if verification is already done or skipped - if getattr(self.llm, '_verified_api_keys', None) is True or SKIP_LLM_API_KEY_VERIFICATION: + if getattr(self.llm, '_verified_api_keys', None) is True or CONFIG.SKIP_LLM_API_KEY_VERIFICATION: setattr(self.llm, '_verified_api_keys', True) setattr(self.llm, '_verified_tool_calling_method', known_method) # Cache on LLM instance self.logger.debug( @@ -1713,7 +1676,7 @@ class Agent(Generic[Context]): assert browser_state_summary content = AgentMessagePrompt( browser_state_summary=browser_state_summary, - result=self.state.last_result, + file_system=self.file_system, include_attributes=self.settings.include_attributes, ) msg = [SystemMessage(content=system_msg), content.get_user_message(self.settings.use_vision)] @@ -1960,7 +1923,7 @@ class Agent(Generic[Context]): self.tool_calling_method = self._set_tool_calling_method() # Skip verification if already done - if getattr(self.llm, '_verified_api_keys', None) is True or SKIP_LLM_API_KEY_VERIFICATION: + if getattr(self.llm, '_verified_api_keys', None) is True or CONFIG.SKIP_LLM_API_KEY_VERIFICATION: setattr(self.llm, '_verified_api_keys', True) return True diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md index b05382561..1eab4ef10 100644 --- a/browser_use/agent/system_prompt.md +++ b/browser_use/agent/system_prompt.md @@ -1,4 +1,4 @@ -You are a tool-using AI agent designed operating in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in . +You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in . 
You excel at following tasks: @@ -12,46 +12,38 @@ You excel at following tasks: - Default working language: **English** -- Use the language specified by user in messages as the working language in all messages and tool calls +- Use the language specified by user in messages as the working language -At every step, you will be given a state with: -1. Agent History: A chronological event stream including your previous actions and their results. This may be partially omitted. -2. User Request: This is your ultimate objective and always remains visible. -3. Agent State: Current progress, and relevant contextual memory. -4. Browser State: Contains current URL, open tabs, interactive elements indexed for actions, visible page content, and (sometimes) screenshots. -4. Read State: If your previous action involved reading a file or extracting content (e.g., from a webpage), the full result will be included here. This data is **only shown in the current step** and will not appear in future Agent History. You are responsible for saving or interpreting the information appropriately during this step into your file system. +At every step, your input will consist of: +1. <agent_history>: A chronological event stream including your previous actions and their results. +2. <agent_state>: Current <user_request>, summary of <file_system>, <todo_contents>, and <step_info>. +3. <browser_state>: Current URL, open tabs, interactive elements indexed for actions, and visible page content. +4. <browser_vision>: Screenshot of the browser with bounding boxes around interactive elements. +5. <read_state> This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step. Agent history will be given as a list of step information as follows: -Step step_number: +<step_{step_number}>: Evaluation of Previous Step: Assessment of last action -Memory: Agent generated memory of this step -Actions: Agent generated actions -Action Results: System generated result of those actions +Memory: Your memory of this step +Next Goal: Your goal for this step +Action Results: Your actions and their results +</step_{step_number}> + +and system messages wrapped in <sys> tag. USER REQUEST: This is your ultimate objective and always remains visible. - This has the highest priority. Make the user happy. - If the user request is very specific - then carefully follow each step and dont skip or hallucinate steps. -- If the task is open ended you can plan more yourself how to get it done. +- If the task is open ended you can plan yourself how to get it done. - -Agent State will be given as follows: - -File System: A summary of your available files in the format: -- file_name — num_lines lines - -Current Step: The step in the agent loop. - -Timestamp: Current date. - - 1. Browser State will be given as: @@ -74,14 +66,10 @@ Note that: -When a screenshot is provided, analyse it to understand the interactive elements and try to understand what each interactive element is for. Bounding box labels correspond to element indexes. +You will be optionally provided with a screenshot of the browser with bounding boxes. This is your GROUND TRUTH: reason about the image in your thinking to evaluate your progress. +Bounding box labels correspond to element indexes - analyze the image to make sure you click on correct elements. - -1. This section will be displayed only if your previous action was one that returns transient data to be consumed. -2. You will see this information **only during this step** in your state. ALWAYS make sure to save this information if it will be needed later.
- - Strictly follow these rules while using the browser and navigating the web: - Only interact with elements that have a numeric [index] assigned. @@ -91,12 +79,13 @@ Strictly follow these rules while using the browser and navigating the web: - By default, only elements in the visible viewport are listed. Use scrolling tools if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page. The extract content action gets the full loaded page content. - If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack). - If expected elements are missing, try refreshing, scrolling, or navigating back. -- Use multiple actions where no page transition is expected (e.g., fill multiple fields then click submit). - If the page is not fully loaded, use the wait action. -- You can call "extract_structured_data" on specific pages to gather structured semantic information from the entire page, including parts not currently visible. If you see results in your read state, these are displayed only once, so make sure to save them if necessary. +- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible. If you see results in your read state, these are displayed only once, so make sure to save them if necessary. +- Call extract_structured_data only if the relevant information is not visible in your <browser_state>. - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. -- If the USER REQUEST includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient. Sometimes you need to scroll to see all filter options. -- The USER REQUEST is the ultimate goal. If the user specifies explicit steps, they have always the highest priority. +- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient. +- The <user_request> is the ultimate goal. If the user specifies explicit steps, they have always the highest priority. +- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion. @@ -105,9 +94,11 @@ Strictly follow these rules while using the browser and navigating the web: 1. `todo.md`: Use this to keep a checklist for known subtasks. Update it to mark completed items and track what remains. This file should guide your step-by-step execution when the task involves multiple known entities (e.g., a list of links or items to visit). The contents of this file will be also visible in your state. ALWAYS use `write_file` to rewrite entire `todo.md` when you want to update your progress. NEVER use `append_file` on `todo.md` as this can explode your context. 2. `results.md`: Use this to accumulate extracted or generated results for the user. Append each new finding clearly and avoid duplication. This file serves as your output log. - You can read, write, and append to files. -- Note that `write_file` rewrites the entire file, so make sure to repeat all the existing information if you use this action. +- Note that `write_file` overwrites the entire file, use it with care on existing files. - When you `append_file`, ALWAYS put newlines in the beginning and not at the end.
+- If the file is too large, you are only given a preview of your file. Use read_file to see the full content if necessary. - Always use the file system as the source of truth. Do not rely on memory alone for tracking task state. +- If <available_file_paths> exists, it includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access. @@ -129,11 +120,11 @@ The `done` action is your opportunity to terminate and share your findings with - You are allowed to use a maximum of {max_actions} actions per step. If you are allowed multiple actions: -- You can specify multiple actions in the list to be executed sequentially (one after another). But always specify only one action name per item. -- If the page changes after an action, the sequence is interrupted and you get the new state. You might have to repeat the same action again so that your changes are reflected in the new state. -- ONLY use multiple actions when actions should not change the page state significantly. +- You can specify multiple actions in the list to be executed sequentially (one after another). +- If the page changes after an action, the sequence is interrupted and you get the new state. You can see this in your agent history when this happens. +- At every step, use ONLY ONE action to interact with the browser. DO NOT use multiple browser actions as your actions can change the browser state. -If you are allowed 1 action, ALWAYS output only 1 most reasonable action per step. If you have something in your read_state, always prioritize saving the data first. +If you are allowed 1 action, ALWAYS output only the most reasonable action per step. @@ -147,8 +138,10 @@ Exhibit the following reasoning patterns to successfully achieve the <user_request>: <read_state> where one-time information are displayed due to your previous action. Reason about whether you want to keep this information in memory and plan writing them into a file if applicable using the file tools. - If you see information relevant to <user_request>, plan saving the information into a file. +- Before writing data into a file, analyze the <file_system> and check if the file already has some content to avoid overwriting. - Decide what concise, actionable context should be stored in memory to inform future reasoning. - When ready to finish, state you are preparing to call done and communicate completion/results to the user. - Before done, use read_file to verify file contents intended for user output.
diff --git a/browser_use/agent/tests.py b/browser_use/agent/tests.py deleted file mode 100644 index 5f58a8e21..000000000 --- a/browser_use/agent/tests.py +++ /dev/null @@ -1,197 +0,0 @@ -import pytest - -from browser_use.agent.views import ( - ActionResult, - AgentBrain, - AgentHistory, - AgentHistoryList, - AgentOutput, -) -from browser_use.browser.views import BrowserStateHistory, BrowserStateSummary, TabInfo -from browser_use.controller.registry.service import Registry -from browser_use.controller.views import ClickElementAction, DoneAction, ExtractPageContentAction -from browser_use.dom.views import DOMElementNode - - -@pytest.fixture -def sample_browser_state(): - return BrowserStateSummary( - url='https://example.com', - title='Example Page', - tabs=[TabInfo(url='https://example.com', title='Example Page', page_id=1)], - screenshot='screenshot1.png', - element_tree=DOMElementNode( - tag_name='root', - is_visible=True, - parent=None, - xpath='', - attributes={}, - children=[], - ), - selector_map={}, - ) - - -@pytest.fixture -def action_registry(): - registry = Registry() - - # Register the actions we need for testing - @registry.action(description='Click an element', param_model=ClickElementAction) - def click_element(params: ClickElementAction, browser=None): - pass - - @registry.action( - description='Extract page content', - param_model=ExtractPageContentAction, - ) - def extract_page_content(params: ExtractPageContentAction, browser=None): - pass - - @registry.action(description='Mark task as done', param_model=DoneAction) - def done(params: DoneAction): - pass - - # Create the dynamic ActionModel with all registered actions - return registry.create_action_model() - - -@pytest.fixture -def sample_history(action_registry): - # Create actions with nested params structure - click_action = action_registry(click_element={'index': 1}) - - extract_action = action_registry(extract_page_content={'value': 'text'}) - - done_action = action_registry(done={'text': 'Task completed'}) - - histories = [ - AgentHistory( - model_output=AgentOutput( - current_state=AgentBrain( - evaluation_previous_goal='None', - memory='Started task', - next_goal='Click button', - ), - action=[click_action], - ), - result=[ActionResult(is_done=False)], - state=BrowserStateHistory( - url='https://example.com', - title='Page 1', - tabs=[TabInfo(url='https://example.com', title='Page 1', page_id=1)], - screenshot='screenshot1.png', - interacted_element=[{'xpath': '//button[1]'}], - ), - ), - AgentHistory( - model_output=AgentOutput( - current_state=AgentBrain( - evaluation_previous_goal='Clicked button', - memory='Button clicked', - next_goal='Extract content', - ), - action=[extract_action], - ), - result=[ - ActionResult( - is_done=False, - extracted_content='Extracted text', - error='Failed to extract completely', - ) - ], - state=BrowserStateHistory( - url='https://example.com/page2', - title='Page 2', - tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)], - screenshot='screenshot2.png', - interacted_element=[{'xpath': '//div[1]'}], - ), - ), - AgentHistory( - model_output=AgentOutput( - current_state=AgentBrain( - evaluation_previous_goal='Extracted content', - memory='Content extracted', - next_goal='Finish task', - ), - action=[done_action], - ), - result=[ActionResult(is_done=True, extracted_content='Task completed', error=None)], - state=BrowserStateHistory( - url='https://example.com/page2', - title='Page 2', - tabs=[TabInfo(url='https://example.com/page2', title='Page 2', 
page_id=2)], - screenshot='screenshot3.png', - interacted_element=[{'xpath': '//div[1]'}], - ), - ), - ] - return AgentHistoryList(history=histories) - - -def test_last_model_output(sample_history: AgentHistoryList): - last_output = sample_history.last_action() - print(last_output) - assert last_output == {'done': {'text': 'Task completed'}} - - -def test_get_errors(sample_history: AgentHistoryList): - errors = sample_history.errors() - assert len(errors) == 1 - assert errors[0] == 'Failed to extract completely' - - -def test_final_result(sample_history: AgentHistoryList): - assert sample_history.final_result() == 'Task completed' - - -def test_is_done(sample_history: AgentHistoryList): - assert sample_history.is_done() is True - - -def test_urls(sample_history: AgentHistoryList): - urls = sample_history.urls() - assert 'https://example.com' in urls - assert 'https://example.com/page2' in urls - - -def test_all_screenshots(sample_history: AgentHistoryList): - screenshots = sample_history.screenshots() - assert len(screenshots) == 3 - assert screenshots == ['screenshot1.png', 'screenshot2.png', 'screenshot3.png'] - - -def test_all_model_outputs(sample_history: AgentHistoryList): - outputs = sample_history.model_actions() - print(f'DEBUG: {outputs[0]}') - assert len(outputs) == 3 - # get first key value pair - assert dict([next(iter(outputs[0].items()))]) == {'click_element': {'index': 1}} - assert dict([next(iter(outputs[1].items()))]) == {'extract_page_content': {'value': 'text'}} - assert dict([next(iter(outputs[2].items()))]) == {'done': {'text': 'Task completed'}} - - -def test_all_model_outputs_filtered(sample_history: AgentHistoryList): - filtered = sample_history.model_actions_filtered(include=['click_element']) - assert len(filtered) == 1 - assert filtered[0]['click_element']['index'] == 1 - - -def test_empty_history(): - empty_history = AgentHistoryList(history=[]) - assert empty_history.last_action() is None - assert empty_history.final_result() is None - assert empty_history.is_done() is False - assert len(empty_history.urls()) == 0 - - -# Add a test to verify action creation -def test_action_creation(action_registry): - click_action = action_registry(click_element={'index': 1}) - - assert click_action.model_dump(exclude_none=True) == {'click_element': {'index': 1}} - - -# run this with: -# pytest browser_use/agent/tests.py diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py index fd8461967..582ecbf73 100644 --- a/browser_use/browser/profile.py +++ b/browser_use/browser/profile.py @@ -1,4 +1,3 @@ -import os import sys from collections.abc import Iterable from enum import Enum @@ -12,9 +11,9 @@ from pydantic import AfterValidator, AliasChoices, BaseModel, ConfigDict, Field, from uuid_extensions import uuid7str from browser_use.browser.types import ClientCertificate, Geolocation, HttpCredentials, ProxySettings, ViewportSize +from browser_use.config import CONFIG from browser_use.utils import _log_pretty_path, logger -IN_DOCKER = os.environ.get('IN_DOCKER', 'false').lower()[0] in 'ty1' CHROME_DEBUG_PORT = 9242 # use a non-default port to avoid conflicts with other tools / devs using 9222 CHROME_DISABLED_COMPONENTS = [ # Playwright defaults: https://github.com/microsoft/playwright/blob/41008eeddd020e2dee1c540f7c0cdfa337e99637/packages/playwright-core/src/server/chromium/chromiumSwitches.ts#L76 @@ -286,9 +285,7 @@ class BrowserChannel(str, Enum): MSEDGE_CANARY = 'msedge-canary' -BROWSERUSE_CONFIG_DIR = Path('~/.config/browseruse').expanduser().resolve() 
-BROWSERUSE_PROFILES_DIR = BROWSERUSE_CONFIG_DIR / 'profiles' -BROWSERUSE_CHROMIUM_USER_DATA_DIR = BROWSERUSE_PROFILES_DIR / 'default' +# Using constants from central location in browser_use.config BROWSERUSE_DEFAULT_CHANNEL = BrowserChannel.CHROMIUM @@ -420,7 +417,7 @@ class BrowserLaunchArgs(BaseModel): ) channel: BrowserChannel | None = None # https://playwright.dev/docs/browsers#chromium-headless-shell chromium_sandbox: bool = Field( - default=not IN_DOCKER, description='Whether to enable Chromium sandboxing (recommended unless inside Docker).' + default=not CONFIG.IN_DOCKER, description='Whether to enable Chromium sandboxing (recommended unless inside Docker).' ) devtools: bool = Field( default=False, description='Whether to open DevTools panel automatically for every page, only works when headless=False.' @@ -519,7 +516,7 @@ class BrowserLaunchPersistentContextArgs(BrowserLaunchArgs, BrowserContextArgs): model_config = ConfigDict(extra='ignore', validate_assignment=False, revalidate_instances='always') # Required parameter specific to launch_persistent_context, but can be None to use incognito temp dir - user_data_dir: str | Path | None = BROWSERUSE_CHROMIUM_USER_DATA_DIR + user_data_dir: str | Path | None = CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, BrowserLaunchArgs, BrowserNewContextArgs): @@ -647,7 +644,7 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro """ is_not_using_default_chromium = self.executable_path or self.channel not in (BROWSERUSE_DEFAULT_CHANNEL, None) - if self.user_data_dir == BROWSERUSE_CHROMIUM_USER_DATA_DIR and is_not_using_default_chromium: + if self.user_data_dir == CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR and is_not_using_default_chromium: alternate_name = ( Path(self.executable_path).name.lower().replace(' ', '-') if self.executable_path @@ -658,7 +655,16 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro logger.warning( f'⚠️ {self} Changing user_data_dir= {_log_pretty_path(self.user_data_dir)} ➡️ .../default-{alternate_name} to avoid {alternate_name.upper()} corruping default profile created by {BROWSERUSE_DEFAULT_CHANNEL.name}' ) - self.user_data_dir = BROWSERUSE_CHROMIUM_USER_DATA_DIR.parent / f'default-{alternate_name}' + self.user_data_dir = CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR.parent / f'default-{alternate_name}' + return self + + @model_validator(mode='after') + def warn_deterministic_rendering_weirdness(self) -> Self: + if self.deterministic_rendering: + logger.warning( + '⚠️ BrowserSession(deterministic_rendering=True) is NOT RECOMMENDED. It breaks many sites and increases chances of getting blocked by anti-bot systems. ' + 'It hardcodes the JS random seed and forces browsers across Linux/Mac/Windows to use the same font rendering engine so that identical screenshots can be generated.' 
+ ) return self def get_args(self) -> list[str]: @@ -676,7 +682,7 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro *default_args, *self.args, f'--profile-directory={self.profile_directory}', - *(CHROME_DOCKER_ARGS if IN_DOCKER else []), + *(CHROME_DOCKER_ARGS if CONFIG.IN_DOCKER else []), *(CHROME_HEADLESS_ARGS if self.headless else []), *(CHROME_DISABLE_SECURITY_ARGS if self.disable_security else []), *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.deterministic_rendering else []), diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index e64179203..6b0f7a645 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -1,11 +1,14 @@ from __future__ import annotations import asyncio +import atexit import base64 import json import logging import os import re +import shutil +import tempfile import time from dataclasses import dataclass from functools import wraps @@ -13,6 +16,7 @@ from pathlib import Path from typing import Any, Self from urllib.parse import urlparse +from browser_use.config import CONFIG from browser_use.utils import _log_pretty_path, _log_pretty_url os.environ['PW_TEST_SCREENSHOT_NO_FONTS_READY'] = '1' # https://github.com/microsoft/playwright/issues/35972 @@ -45,10 +49,6 @@ from browser_use.dom.service import DomService from browser_use.dom.views import DOMElementNode, SelectorMap from browser_use.utils import match_url_with_domain_pattern, merge_dicts, time_execution_async, time_execution_sync -# Check if running in Docker -IN_DOCKER = os.environ.get('IN_DOCKER', 'false').lower()[0] in 'ty1' - - _GLOB_WARNING_SHOWN = False # used inside _is_url_allowed to avoid spamming the logs with the same warning multiple times GLOBAL_PLAYWRIGHT_API_OBJECT = None # never instantiate the playwright API object more than once per thread @@ -83,7 +83,7 @@ def require_initialization(func): if not self.agent_current_page or self.agent_current_page.is_closed(): self.agent_current_page = ( - self.browser_context.pages[0] if (self.browser_context and self.browser_context.pages) else None + self.browser_context.pages[0] if (self.browser_context and len(self.browser_context.pages) > 0) else None ) if not self.agent_current_page or self.agent_current_page.is_closed(): @@ -260,13 +260,13 @@ class BrowserSession(BaseModel): """ Starts the browser session by either connecting to an existing browser or launching a new one. Precedence order for launching/connecting: - 1. page=Page playwright object, will use its page.context as browser_context - 2. browser_context=PlaywrightBrowserContext object, will use its browser - 3. browser=PlaywrightBrowser object, will use its first available context - 4. browser_pid=int, will connect to a local chromium-based browser via pid - 5. wss_url=str, will connect to a remote playwright browser server via WSS - 6. cdp_url=str, will connect to a remote chromium-based browser via CDP - 7. playwright=Playwright object, will use its chromium instance to launch a new browser + 1. page=Page playwright object, will use its page.context as browser_context + 2. browser_context=PlaywrightBrowserContext object, will use its browser + 3. browser=PlaywrightBrowser object, will use its first available context + 4. browser_pid=int, will connect to a local chromium-based browser via pid + 5. wss_url=str, will connect to a remote playwright browser server via WSS + 6. cdp_url=str, will connect to a remote chromium-based browser via CDP + 7. 
playwright=Playwright object, will use its chromium instance to launch a new browser """ # if we're already initialized and the connection is still valid, return the existing session state and start from scratch @@ -276,7 +276,7 @@ class BrowserSession(BaseModel): async with asyncio.timeout(60): # 60 second overall timeout for entire launching process to avoid deadlocks async with self._start_lock: # prevent parallel calls to start() / stop() / save_storage_state() from clashing if self.initialized: - if self.is_connected(): + if await self.is_connected(): return self else: next_step = ( @@ -353,7 +353,7 @@ class BrowserSession(BaseModel): async with self._start_lock: # save cookies to disk if cookies_file or storage_state is configured # but only if the browser context is still connected - if self.is_connected(): + if await self.is_connected(): try: await asyncio.wait_for(self.save_storage_state(), timeout=5) except Exception as e: @@ -414,9 +414,10 @@ class BrowserSession(BaseModel): except TimeoutError: self.logger.warning('⏱️ Timeout while closing browser/context, has it become unresponsive?') except Exception as e: - self.logger.warning( - f'❌ Error closing playwright browser_context={self.browser_context}: {type(e).__name__}: {e}' - ) + if 'browser has been closed' not in str(e): + self.logger.warning( + f'❌ Error closing playwright browser_context={self.browser_context}: {type(e).__name__}: {e}' + ) finally: # Always clear references to ensure a fresh start next time self.browser_context = None @@ -426,7 +427,8 @@ class BrowserSession(BaseModel): if self.browser_pid: try: proc = psutil.Process(pid=self.browser_pid) - executable_path = proc.cmdline()[0] + cmdline = proc.cmdline() + executable_path = cmdline[0] if cmdline else 'unknown' self.logger.info(f' ↳ Killing browser_pid={self.browser_pid} {_log_pretty_path(executable_path)}') # Add timeout for process termination try: @@ -440,12 +442,20 @@ class BrowserSession(BaseModel): ) proc.kill() # Force kill if terminate didn't work self.browser_pid = None + except psutil.NoSuchProcess: + self.browser_pid = None except Exception as e: if 'NoSuchProcess' not in type(e).__name__: self.logger.debug( f'❌ Error terminating subprocess with browser_pid={self.browser_pid}: {type(e).__name__}: {e}' ) + # if the user_data_dir is a temporary one, delete it + if self.browser_profile.user_data_dir and Path(self.browser_profile.user_data_dir).name.startswith( + 'browseruse-tmp' + ): + shutil.rmtree(self.browser_profile.user_data_dir, ignore_errors=True) + self._reset_connection_state() # self.logger.debug('🛑 Shutdown complete.') @@ -461,31 +471,8 @@ class BrowserSession(BaseModel): self.browser_profile.keep_alive = False await self.stop() - # Clean up playwright instance to prevent background tasks from running - if self.playwright: - try: - await self.playwright.stop() - # Give playwright tasks a moment to clean up properly - # This prevents "Task was destroyed but it is pending!" 
warnings - await asyncio.sleep(0.1) - # self.logger.debug('🎭 Stopped playwright node.js API worker') - except Exception as e: - self.logger.warning(f'❌ Error stopping playwright node.js API subprocess: {type(e).__name__}: {e}') - finally: - # Clear global references if they match this instance - global GLOBAL_PLAYWRIGHT_API_OBJECT, GLOBAL_PATCHRIGHT_API_OBJECT - global GLOBAL_PLAYWRIGHT_EVENT_LOOP, GLOBAL_PATCHRIGHT_EVENT_LOOP - - if self.playwright == GLOBAL_PLAYWRIGHT_API_OBJECT: - GLOBAL_PLAYWRIGHT_API_OBJECT = None - GLOBAL_PLAYWRIGHT_EVENT_LOOP = None - # self.logger.debug('🧹 Cleared global playwright references') - elif self.playwright == GLOBAL_PATCHRIGHT_API_OBJECT: - GLOBAL_PATCHRIGHT_API_OBJECT = None - GLOBAL_PATCHRIGHT_EVENT_LOOP = None - # self.logger.debug('🧹 Cleared global patchright references') - - self.playwright = None + # do not stop self.playwright here as its likely used by other parallel browser_sessions + # let it be cleaned up by the garbage collector when no refs use it anymore async def new_context(self, **kwargs): """Deprecated: Provides backwards-compatibility with old class method Browser().new_context().""" @@ -578,56 +565,6 @@ class BrowserSession(BaseModel): GLOBAL_PLAYWRIGHT_EVENT_LOOP = current_loop return GLOBAL_PLAYWRIGHT_API_OBJECT - def _kill_child_processes(self) -> None: - """Kill any child processes that might be related to the browser""" - - if not self.browser_profile.keep_alive and self.browser_pid: - try: - browser_proc = psutil.Process(self.browser_pid) - try: - browser_proc.terminate() - browser_proc.wait( - timeout=5 - ) # wait up to 5 seconds for the process to exit cleanly and commit its user_data_dir changes - except (psutil.NoSuchProcess, psutil.AccessDenied, TimeoutError): - pass - - # Kill all child processes first (recursive) - for child in browser_proc.children(recursive=True): - try: - # self.logger.debug(f'Force killing child process: {child.pid} ({child.name()})') - child.kill() - except (psutil.NoSuchProcess, psutil.AccessDenied): - pass - - # Kill the main browser process - # self.logger.debug(f'Force killing browser process: {self.browser_pid}') - browser_proc.kill() - except psutil.NoSuchProcess: - pass - except Exception as e: - self.logger.warning(f'Error force-killing browser in BrowserSession.__del__: {type(e).__name__}: {e}') - - @staticmethod - async def _start_global_playwright_subprocess(is_stealth: bool) -> PlaywrightOrPatchright: - """Create and return a new playwright or patchright node.js subprocess / API connector""" - global GLOBAL_PLAYWRIGHT_API_OBJECT, GLOBAL_PATCHRIGHT_API_OBJECT - global GLOBAL_PLAYWRIGHT_EVENT_LOOP, GLOBAL_PATCHRIGHT_EVENT_LOOP - - try: - current_loop = asyncio.get_running_loop() - except RuntimeError: - current_loop = None - - if is_stealth: - GLOBAL_PATCHRIGHT_API_OBJECT = await async_patchright().start() - GLOBAL_PATCHRIGHT_EVENT_LOOP = current_loop - return GLOBAL_PATCHRIGHT_API_OBJECT - else: - GLOBAL_PLAYWRIGHT_API_OBJECT = await async_playwright().start() - GLOBAL_PLAYWRIGHT_EVENT_LOOP = current_loop - return GLOBAL_PLAYWRIGHT_API_OBJECT - async def setup_playwright(self) -> None: """ Set up playwright library client object: usually the result of (await async_playwright().start()) @@ -694,6 +631,22 @@ class BrowserSession(BaseModel): if self.browser_profile.headless or not self.browser_profile.no_viewport: self.logger.info(' 🪄 For maximum stealth, BrowserSession(...) 
should be passed headless=False & viewport=None') + # register a shutdown hook to stop the shared global playwright node.js client when the program exits (if an event loop is still running) + def shudown_playwright(): + if not self.playwright: + return + try: + loop = asyncio.get_running_loop() + self.logger.debug('🛑 Shutting down shared global playwright node.js client') + task = loop.create_task(self.playwright.stop()) + if hasattr(task, '_log_destroy_pending'): + task._log_destroy_pending = False # type: ignore + except Exception: + pass + self.playwright = None + + atexit.register(shudown_playwright) + async def setup_browser_via_passed_objects(self) -> None: """Override to customize the set up of the connection to an existing browser""" @@ -735,14 +688,36 @@ class BrowserSession(BaseModel): if not self.browser_pid: return # no browser_pid provided, nothing to do - chrome_process = psutil.Process(pid=self.browser_pid) - assert chrome_process.is_running(), 'Chrome process is not running' - args = chrome_process.cmdline() + # check that browser_pid process is running, otherwise we cannot connect to it + try: + chrome_process = psutil.Process(pid=self.browser_pid) + if not chrome_process.is_running(): + self.logger.warning(f'Chrome process with pid={self.browser_pid} is not running') + return + args = chrome_process.cmdline() + except psutil.NoSuchProcess: + self.logger.warning(f'Chrome process with pid={self.browser_pid} not found') + return + except Exception as e: + self.browser_pid = None + self.logger.warning(f'Error accessing chrome process with pid={self.browser_pid}: {type(e).__name__}: {e}') + return + + # check that browser_pid process is exposing a debug port we can connect to, otherwise we cannot connect to it debug_port = next((arg for arg in args if arg.startswith('--remote-debugging-port=')), '').split('=')[-1].strip() - assert debug_port, ( - f'Could not find --remote-debugging-port=... to connect to in browser launch args: browser_pid={self.browser_pid} {args}' - ) - # we could automatically relaunch the browser process with that arg added here, but they may have tabs open they dont want to lose + if not debug_port: + # provided pid is unusable, it's either not running or doesnt have an open debug port we can connect to + if '--remote-debugging-pipe' in args: + self.logger.error( + f'❌ Found --remote-debugging-pipe in browser launch args for browser_pid={self.browser_pid} but it was started by a different BrowserSession, cannot connect to it' + ) + else: + self.logger.error( + f'❌ Could not find --remote-debugging-port=... 
to connect to in browser launch args for browser_pid={self.browser_pid}: {" ".join(args)}' + ) + self.browser_pid = None + return + self.cdp_url = self.cdp_url or f'http://localhost:{debug_port}/' self.logger.info(f'🌎 Connecting to existing local browser process: browser_pid={self.browser_pid} on {self.cdp_url}') assert self.playwright is not None, 'playwright instance is None' @@ -815,112 +790,86 @@ class BrowserSession(BaseModel): f'{str(type(self.playwright).__module__).split(".")[0]}:{self.browser_profile.channel.name.lower()} keep_alive={self.browser_profile.keep_alive or False} ' f'user_data_dir= {_log_pretty_path(self.browser_profile.user_data_dir) or ""}' ) + + # if no user_data_dir is provided, generate a unique one for this temporary browser_context (will be used to uniquely identify the browser_pid later) if not self.browser_profile.user_data_dir: # self.logger.debug('🌎 Launching local browser in incognito mode') - # if no user_data_dir is provided, launch an incognito context with no persistent user_data_dir - try: - assert self.playwright is not None, 'playwright instance is None' - async with asyncio.timeout(10): # Reduced timeout from 30s to 10s - self.browser = self.browser or await self.playwright.chromium.launch( - **self.browser_profile.kwargs_for_launch().model_dump() - ) - # self.logger.debug('🌎 Launching new incognito context in browser') - async with asyncio.timeout(10): # Reduced timeout from 30s to 10s - self.browser_context = await self.browser.new_context( - **self.browser_profile.kwargs_for_new_context().model_dump(mode='json') - ) - except TimeoutError: - self.logger.warning( - 'Browser operation timed out. This may indicate the playwright instance is invalid due to event loop changes. ' - 'Recreating playwright instance and retrying...' 
- ) - # Force recreation of the playwright object - self.playwright = await self._start_global_playwright_subprocess(is_stealth=self.browser_profile.stealth) - # Retry the operation with the new playwright instance - assert self.playwright is not None, 'playwright instance is None' - async with asyncio.timeout(10): - self.browser = await self.playwright.chromium.launch( - **self.browser_profile.kwargs_for_launch().model_dump() - ) - async with asyncio.timeout(10): - self.browser_context = await self.browser.new_context( - **self.browser_profile.kwargs_for_new_context().model_dump() - ) - # self.logger.debug('🌎 Created new incognito context in browser') - else: - # user data dir was provided, prepare it for use - self.prepare_user_data_dir() + # if no user_data_dir is provided, generate a unique one for this temporary browser_context (will be used to uniquely identify the browser_pid later) + self.browser_profile.user_data_dir = self.browser_profile.user_data_dir or Path( + tempfile.mkdtemp(prefix='browseruse-tmp-') + ) - # search for potentially conflicting local processes running on the same user_data_dir - for proc in psutil.process_iter(['pid', 'cmdline']): - if f'--user-data-dir={self.browser_profile.user_data_dir}' in (proc.info['cmdline'] or []): - self.logger.error( - f'🚨 Found potentially conflicting browser process browser_pid={proc.info["pid"]} ' - f'already running with the same user_data_dir= {_log_pretty_path(self.browser_profile.user_data_dir)}' - ) - break + # user data dir was provided, prepare it for use + self.prepare_user_data_dir() - # if a user_data_dir is provided, launch a persistent context with that user_data_dir - try: - async with asyncio.timeout(10): # Reduced timeout from 30s to 10s - try: - assert self.playwright is not None, 'playwright instance is None' - self.browser_context = await self.playwright.chromium.launch_persistent_context( - **self.browser_profile.kwargs_for_launch_persistent_context().model_dump(mode='json') - ) - except Exception as e: - # Re-raise if not a timeout - if not isinstance(e, asyncio.TimeoutError): - raise - except TimeoutError: - self.logger.warning( - 'Browser operation timed out. This may indicate the playwright instance is invalid due to event loop changes. ' - 'Recreating playwright instance and retrying...' 
+ # search for potentially conflicting local processes running on the same user_data_dir + for proc in psutil.process_iter(['pid', 'cmdline']): + if f'--user-data-dir={self.browser_profile.user_data_dir}' in (proc.info['cmdline'] or []): + self.logger.error( + f'🚨 Found potentially conflicting browser process browser_pid={proc.info["pid"]} ' + f'already running with the same user_data_dir= {_log_pretty_path(self.browser_profile.user_data_dir)}' ) - # Force recreation of the playwright object - self.playwright = await self._start_global_playwright_subprocess(is_stealth=self.browser_profile.stealth) - # Retry the operation with the new playwright instance - async with asyncio.timeout(10): + break + + # if a user_data_dir is provided, launch a persistent context with that user_data_dir + try: + async with asyncio.timeout(self.browser_profile.timeout / 1000): + try: assert self.playwright is not None, 'playwright instance is None' self.browser_context = await self.playwright.chromium.launch_persistent_context( - **self.browser_profile.kwargs_for_launch_persistent_context().model_dump() + **self.browser_profile.kwargs_for_launch_persistent_context().model_dump(mode='json') ) - except Exception as e: - # show a nice logger hint explaining what went wrong with the user_data_dir - # calculate the version of the browser that the user_data_dir is for, and the version of the browser we are running with - user_data_dir_chrome_version = '???' - test_browser_version = '???' - try: - # user_data_dir is corrupted or unreadable because it was migrated to a newer version of chrome than we are running with - user_data_dir_chrome_version = ( - (Path(self.browser_profile.user_data_dir) / 'Last Version').read_text().strip() - ) - except Exception: - pass # let the logger below handle it - try: - assert self.playwright is not None, 'playwright instance is None' - test_browser = await self.playwright.chromium.launch(headless=True) - test_browser_version = test_browser.version - await test_browser.close() - except Exception: - pass + except Exception as e: + # Re-raise if not a timeout + if not isinstance(e, asyncio.TimeoutError): + raise + except TimeoutError: + self.logger.warning( + 'Browser operation timed out. This may indicate the playwright instance is invalid due to event loop changes. ' + 'Recreating playwright instance and retrying...' + ) + # Force recreation of the playwright object + self.playwright = await self._start_global_playwright_subprocess(is_stealth=self.browser_profile.stealth) + # Retry the operation with the new playwright instance + async with asyncio.timeout(self.browser_profile.timeout / 1000): + assert self.playwright is not None, 'playwright instance is None' + self.browser_context = await self.playwright.chromium.launch_persistent_context( + **self.browser_profile.kwargs_for_launch_persistent_context().model_dump() + ) + except Exception as e: + # show a nice logger hint explaining what went wrong with the user_data_dir + # calculate the version of the browser that the user_data_dir is for, and the version of the browser we are running with + user_data_dir_chrome_version = '???' + test_browser_version = '???' 
+ try: + # user_data_dir is corrupted or unreadable because it was migrated to a newer version of chrome than we are running with + user_data_dir_chrome_version = (Path(self.browser_profile.user_data_dir) / 'Last Version').read_text().strip() + except Exception: + pass # let the logger below handle it + try: + assert self.playwright is not None, 'playwright instance is None' + test_browser = await self.playwright.chromium.launch(headless=True) + test_browser_version = test_browser.version + await test_browser.close() + except Exception: + pass - # failed to parse extensions == most common error text when user_data_dir is corrupted / has an unusable schema - reason = 'due to bad' if 'Failed parsing extensions' in str(e) else 'for unknown reason with' - driver = str(type(self.playwright).__module__).split('.')[0].lower() - browser_channel = ( - Path(self.browser_profile.executable_path).name.replace(' ', '-').replace('.exe', '').lower() - if self.browser_profile.executable_path - else (self.browser_profile.channel or BROWSERUSE_DEFAULT_CHANNEL).name.lower() - ) - self.logger.error( - f'❌ Launching new local browser {driver}:{browser_channel} (v{test_browser_version}) failed!' - f'\n\tFailed {reason} user_data_dir= {_log_pretty_path(self.browser_profile.user_data_dir)} (created with v{user_data_dir_chrome_version})' - '\n\tTry using a different browser version/channel or delete the user_data_dir to start over with a fresh profile.' - '\n\t(can happen if different versions of Chrome/Chromium/Brave/etc. tried to share one dir)' - f'\n\n{type(e).__name__} {e}' - ) - raise + # failed to parse extensions == most common error text when user_data_dir is corrupted / has an unusable schema + reason = 'due to bad' if 'Failed parsing extensions' in str(e) else 'for unknown reason with' + driver = str(type(self.playwright).__module__).split('.')[0].lower() + browser_channel = ( + Path(self.browser_profile.executable_path).name.replace(' ', '-').replace('.exe', '').lower() + if self.browser_profile.executable_path + else (self.browser_profile.channel or BROWSERUSE_DEFAULT_CHANNEL).name.lower() + ) + self.logger.error( + f'❌ Launching new local browser {driver}:{browser_channel} (v{test_browser_version}) failed!' + f'\n\tFailed {reason} user_data_dir= {_log_pretty_path(self.browser_profile.user_data_dir)} (created with v{user_data_dir_chrome_version})' + '\n\tTry using a different browser version/channel or delete the user_data_dir to start over with a fresh profile.' + '\n\t(can happen if different versions of Chrome/Chromium/Brave/etc. tried to share one dir)' + f'\n\n{type(e).__name__} {e}' + ) + raise # Only restore browser from context if it's connected, otherwise keep it None to force new launch browser_from_context = self.browser_context and self.browser_context.browser @@ -930,22 +879,59 @@ class BrowserSession(BaseModel): # playwright does not give us a browser object at all when we use launch_persistent_context()! 
# Detect any new child chrome processes that we might have launched above - try: - child_pids_after_launch = {child.pid for child in current_process.children(recursive=True)} - new_child_pids = child_pids_after_launch - child_pids_before_launch - new_child_procs = [psutil.Process(pid) for pid in new_child_pids] - new_chrome_procs = [proc for proc in new_child_procs if 'Helper' not in proc.name() and proc.status() == 'running'] - except Exception as e: - self.logger.debug( - f'❌ Error trying to find child chrome processes after launching new browser: {type(e).__name__}: {e}' - ) - new_chrome_procs = [] + def is_our_chrome_proc(pid: int) -> psutil.Process | None: + try: + proc = psutil.Process(pid) + cmdline = proc.cmdline() + if 'Helper' in proc.name(): + return None + if proc.status() != 'running': + return None + if ( + self.browser_profile.executable_path + and Path(cmdline[0]).expanduser().resolve() + != Path(self.browser_profile.executable_path).expanduser().resolve() + ): + # self.logger.debug(f'❌ Found new child chrome process that does not match our executable: {str(cmdline)[:50]}') + return None + if ( + self.browser_profile.user_data_dir + and f'--user-data-dir={Path(self.browser_profile.user_data_dir).expanduser().resolve()}' in cmdline + ): + # self.logger.debug(f'✅ Found new child chrome process that matches our user_data_dir: {str(cmdline)[:50]}') + return proc + else: + # self.logger.debug(f'❌ Found new child chrome process that does not match our user_data_dir: {[arg for arg in cmdline if "--user-data-dir=" in arg]}') + return None + except Exception: + pass + return None - if new_chrome_procs and not self.browser_pid: - self.browser_pid = new_chrome_procs[0].pid - self.logger.info(f' ↳ Spawned browser_pid={self.browser_pid} {_log_pretty_path(new_chrome_procs[0].cmdline()[0])}') - self.logger.debug(' '.join(new_chrome_procs[0].cmdline())) # print the entire launch command for debugging - self._set_browser_keep_alive(False) # close the browser at the end because we launched it + child_pids_after_launch = {child.pid for child in current_process.children(recursive=True)} + new_child_pids = child_pids_after_launch - child_pids_before_launch + new_child_procs = list(filter(bool, (is_our_chrome_proc(pid) for pid in new_child_pids))) + if not new_child_procs: + self.logger.debug(f'❌ Failed to find any new child chrome processes after launching new browser: {new_child_pids}') + new_chrome_proc = None + elif len(new_child_procs) > 1: + self.logger.debug(f'❌ Found multiple new child chrome processes after launching new browser: {new_child_procs}') + new_chrome_proc = None + else: + new_chrome_proc = new_child_procs[0] + + if new_chrome_proc and not self.browser_pid: + # look through the discovered new chrome processes to uniquely identify the one that *we* launched, + # match using unique user_data_dir + try: + self.browser_pid = new_chrome_proc.pid + cmdline = new_chrome_proc.cmdline() + executable_path = cmdline[0] if cmdline else 'unknown' + self.logger.info(f' ↳ Spawned browser_pid={self.browser_pid} {_log_pretty_path(executable_path)}') + if cmdline: + self.logger.debug(' '.join(cmdline)) # print the entire launch command for debugging + self._set_browser_keep_alive(False) # close the browser at the end because we launched it + except (psutil.NoSuchProcess, psutil.AccessDenied) as e: + self.logger.warning(f'Browser process {self.browser_pid} died immediately after launch: {type(e).__name__}') if self.browser: assert self.browser.is_connected(), ( @@ -1071,7 +1057,7 @@ class 
BrowserSession(BaseModel): if pages: foreground_page = pages[0] self.logger.debug( - f'👁️‍🗨️ Found {len(pages)} existing tabs in browser, agent session {self.id[-4:]}.{str(id(self.agent_current_page))[-2:]} will start focused on Tab [{pages.index(foreground_page)}]: {foreground_page.url}' + f'👁️‍🗨️ Found {len(pages)} existing tabs in browser, agent session {self.id[-4:]}.{str(id(self.agent_current_page))[-2:]} will start focused on Tab [{pages.index(foreground_page)}]: {foreground_page.url}' # type: ignore ) else: foreground_page = await self.browser_context.new_page() @@ -1090,15 +1076,15 @@ class BrowserSession(BaseModel): old_foreground = self.human_current_page assert self.browser_context is not None, 'BrowserContext object is not set' assert old_foreground is not None, 'Old foreground page is not set' - old_tab_idx = self.browser_context.pages.index(old_foreground) + old_tab_idx = self.browser_context.pages.index(old_foreground) # type: ignore self.human_current_page = new_page - new_tab_idx = self.browser_context.pages.index(new_page) + new_tab_idx = self.browser_context.pages.index(new_page) # type: ignore # Log before and after for debugging old_url = old_foreground and old_foreground.url or 'about:blank' new_url = new_page and new_page.url or 'about:blank' agent_url = self.agent_current_page and self.agent_current_page.url or 'about:blank' - agent_tab_idx = self.browser_context.pages.index(self.agent_current_page) + agent_tab_idx = self.browser_context.pages.index(self.agent_current_page) # type: ignore if old_url != new_url: self.logger.info( f'👁️ Foregound tab changed by human from [{old_tab_idx}]{_log_pretty_url(old_url)} ' @@ -1167,7 +1153,7 @@ class BrowserSession(BaseModel): await page.evaluate(update_tab_focus_script) # self.logger.debug(f'👁️ Added visibility listener to existing tab: {page.url}') except Exception as e: - page_idx = self.browser_context.pages.index(page) + page_idx = self.browser_context.pages.index(page) # type: ignore self.logger.debug( f'⚠️ Failed to add visibility listener to existing tab, is it crashed or ignoring CDP commands?: [{page_idx}]{page.url}: {type(e).__name__}: {e}' ) @@ -1258,7 +1244,7 @@ class BrowserSession(BaseModel): # cdp api: https://chromedevtools.github.io/devtools-protocol/tot/Browser/#method-setWindowBounds try: - cdp_session = await page.context.new_cdp_session(page) + cdp_session = await page.context.new_cdp_session(page) # type: ignore window_id_result = await cdp_session.send('Browser.getWindowForTarget') await cdp_session.send( 'Browser.setWindowBounds', @@ -1277,7 +1263,7 @@ class BrowserSession(BaseModel): # fallback to javascript resize if cdp setWindowBounds fails await page.evaluate( """(width, height) => {window.resizeTo(width, height)}""", - **self.browser_profile.window_size, + [self.browser_profile.window_size['width'], self.browser_profile.window_size['height']], ) return except Exception: @@ -1292,7 +1278,7 @@ class BrowserSession(BaseModel): if self.browser_profile.keep_alive is None: self.browser_profile.keep_alive = keep_alive - def is_connected(self) -> bool: + async def is_connected(self) -> bool: """ Check if the browser session has valid, connected browser and context objects. 
Returns False if any of the following conditions are met: @@ -1309,8 +1295,14 @@ class BrowserSession(BaseModel): # Check if the browser_context itself is closed/unusable try: - _ = self.browser_context.pages - return True + # TODO: figure out a better synchronous test for whether browser_context is usable + # this is a hacky workaround for the fact that playwright's browser_context has no is_connected() method + # and browser_context.browser is None when we launch with a persistent context (basically always) + if self.browser_context.pages: + return True + else: + await self.create_new_tab() + return True except Exception: return False @@ -1435,7 +1427,7 @@ class BrowserSession(BaseModel): self.agent_current_page = first_available_tab self.human_current_page = first_available_tab else: - # if all tabs are closed, open a new one + # if all tabs are closed, open a new one, never allow a context with 0 tabs new_tab = await self.create_new_tab() self.agent_current_page = new_tab self.human_current_page = new_tab @@ -1480,30 +1472,25 @@ class BrowserSession(BaseModel): """ page = await self.get_current_page() try: - script = """ - try { - // Remove the highlight container and all its contents - const container = document.getElementById('playwright-highlight-container'); - if (container) { - container.remove(); - } - - // Remove highlight attributes from elements - const highlightedElements = document.querySelectorAll('[browser-user-highlight-id^="playwright-highlight-"]'); - highlightedElements.forEach(el => { - el.removeAttribute('browser-user-highlight-id'); - }); - } catch (e) { - console.error('Failed to remove highlights:', e); - } - """ - - await page.evaluate(script) - - for iframe in page.frames: - if iframe.url and iframe.url != page.url and not iframe.url.startswith('data:'): - await iframe.evaluate(script) + await page.evaluate( + """ + try { + // Remove the highlight container and all its contents + const container = document.getElementById('playwright-highlight-container'); + if (container) { + container.remove(); + } + // Remove highlight attributes from elements + const highlightedElements = document.querySelectorAll('[browser-user-highlight-id^="playwright-highlight-"]'); + highlightedElements.forEach(el => { + el.removeAttribute('browser-user-highlight-id'); + }); + } catch (e) { + console.error('Failed to remove highlights:', e); + } + """ + ) except Exception as e: self.logger.debug(f'⚠️ Failed to remove highlights (this is usually ok): {type(e).__name__}: {e}') # Don't raise the error since this is not critical functionality @@ -1650,6 +1637,8 @@ class BrowserSession(BaseModel): page = await self.get_current_page() else: # otherwise close the tab at the given index + if tab_index >= len(pages) or tab_index < 0: + raise IndexError(f'Tab index {tab_index} out of range. Available tabs: {len(pages)}') page = pages[tab_index] await page.close() @@ -2336,9 +2325,9 @@ class BrowserSession(BaseModel): Parameters: ----------- cache_clickable_elements_hashes: bool - If True, cache the clickable elements hashes for the current state. - This is used to calculate which elements are new to the LLM since the last message, - which helps reduce token usage. + If True, cache the clickable elements hashes for the current state. + This is used to calculate which elements are new to the LLM since the last message, + which helps reduce token usage. 
""" await self._wait_for_page_and_frames_load() updated_state = await self._get_updated_state() @@ -2621,10 +2610,10 @@ class BrowserSession(BaseModel): Creates a CSS selector for a DOM element, handling various edge cases and special characters. Args: - element: The DOM element to create a selector for + element: The DOM element to create a selector for Returns: - A valid CSS selector string + A valid CSS selector string """ try: # Get base selector from XPath @@ -2919,10 +2908,6 @@ class BrowserSession(BaseModel): Handles different types of input fields and ensures proper element state before input. """ try: - # Highlight before typing - # if element_node.highlight_index is not None: - # await self._update_state(focus_element=element_node.highlight_index) - element_handle = await self.get_locate_element(element_node) if element_handle is None: @@ -2937,6 +2922,18 @@ class BrowserSession(BaseModel): except Exception: pass + # let's first try to click and type + try: + await element_handle.evaluate('el => {el.textContent = ""; el.value = "";}') + await element_handle.click() + await asyncio.sleep(0.1) # Increased sleep time + page = await self.get_current_page() + await page.keyboard.type(text) + return + except Exception as e: + self.logger.debug(f'Input text with click and type failed, trying element handle method: {e}') + pass + # Get element properties to determine input method tag_handle = await element_handle.get_property('tagName') tag_name = (await tag_handle.json_value()).lower() @@ -2947,25 +2944,15 @@ class BrowserSession(BaseModel): readonly = await readonly_handle.json_value() if readonly_handle else False disabled = await disabled_handle.json_value() if disabled_handle else False - # always click the element first to make sure it's in the focus - await element_handle.click() - await asyncio.sleep(0.1) - try: if (await is_contenteditable.json_value() or tag_name == 'input') and not (readonly or disabled): await element_handle.evaluate('el => {el.textContent = ""; el.value = "";}') await element_handle.type(text, delay=5) else: await element_handle.fill(text) - except Exception: - # last resort fallback, assume it's already focused after we clicked on it, - # just simulate keypresses on the entire page - try: - page = await self.get_current_page() - await page.keyboard.type(text) - except Exception as fallback_error: - # If we can't even get the current page, re-raise with a clear error - raise BrowserError(f'Failed to input text into element: {element_node.xpath}') from fallback_error + except Exception as e: + self.logger.error(f'Error during input text into element: {type(e).__name__}: {e}') + raise BrowserError(f'Failed to input text into element: {repr(element_node)}') except Exception as e: # Get current page URL safely for error message @@ -3033,9 +3020,9 @@ class BrowserSession(BaseModel): except Exception: self.initialized = False - if not self.initialized or not self.is_connected(): + if not self.initialized or not self.browser_context: # If we were initialized but lost connection, reset state first to avoid infinite loops - if self.initialized and not self.is_connected(): + if self.initialized and not self.browser_context: self.logger.warning( f'💔 Browser {self._connection_str} disconnected while trying to create a new tab, reconnecting...' 
) @@ -3068,7 +3055,7 @@ class BrowserSession(BaseModel): await new_page.goto(url, wait_until='domcontentloaded') await self._wait_for_page_and_frames_load(timeout_overwrite=1) except Exception as e: - self.logger.error(f'❌ Error navigating to {url}: {type(e).__name__}: {e}') + self.logger.error(f'❌ Error navigating to {url}: {type(e).__name__}: {e} (proceeding anyway...)') assert self.human_current_page is not None assert self.agent_current_page is not None @@ -3109,6 +3096,23 @@ class BrowserSession(BaseModel): element_handle = await self.get_locate_element(selector_map[index]) return element_handle + async def is_file_input_by_index(self, index: int) -> bool: + try: + selector_map = await self.get_selector_map() + node = selector_map[index] + return self.is_file_input(node) + except Exception as e: + self.logger.debug(f'❌ Error in is_file_input(index={index}): {type(e).__name__}: {e}') + return False + + @staticmethod + def is_file_input(node: DOMElementNode) -> bool: + return ( + isinstance(node, DOMElementNode) + and getattr(node, 'tag_name', '').lower() == 'input' + and node.attributes.get('type', '').lower() == 'file' + ) + @require_initialization async def find_file_upload_element_by_index( self, index: int, max_height: int = 3, max_descendant_depth: int = 3 @@ -3128,17 +3132,10 @@ class BrowserSession(BaseModel): candidate_element = selector_map[index] - def is_file_input(node: DOMElementNode) -> bool: - return ( - isinstance(node, DOMElementNode) - and getattr(node, 'tag_name', '').lower() == 'input' - and node.attributes.get('type', '').lower() == 'file' - ) - def find_file_input_in_descendants(node: DOMElementNode, depth: int) -> DOMElementNode | None: if depth < 0 or not isinstance(node, DOMElementNode): return None - if is_file_input(node): + if self.is_file_input(node): return node for child in getattr(node, 'children', []): result = find_file_input_in_descendants(child, depth - 1) @@ -3149,7 +3146,7 @@ class BrowserSession(BaseModel): current = candidate_element for _ in range(max_height + 1): # include the candidate itself # 1. Check the current node itself - if is_file_input(current): + if self.is_file_input(current): return current # 2. Check all descendants of the current node result = find_file_input_in_descendants(current, max_descendant_depth) @@ -3161,7 +3158,7 @@ class BrowserSession(BaseModel): for sibling in getattr(parent, 'children', []): if sibling is current: continue - if is_file_input(sibling): + if self.is_file_input(sibling): return sibling result = find_file_input_in_descendants(sibling, max_descendant_depth) if result: @@ -3226,7 +3223,7 @@ class BrowserSession(BaseModel): Injects a DVD screensaver-style bouncing logo loading animation overlay into the given Playwright Page. This is used to visually indicate that the browser is setting up or waiting. 
""" - if os.environ.get('IS_IN_EVALS', 'false').lower()[0] in 'ty1': + if CONFIG.IS_IN_EVALS: # dont bother wasting CPU showing animations during evals return diff --git a/browser_use/cli.py b/browser_use/cli.py index 6850f5194..dfc5c1775 100644 --- a/browser_use/cli.py +++ b/browser_use/cli.py @@ -1,3 +1,4 @@ +# pyright: reportMissingImports=false import asyncio import json import logging @@ -39,20 +40,17 @@ os.environ['BROWSER_USE_LOGGING_LEVEL'] = 'result' from browser_use import Agent, Controller from browser_use.agent.views import AgentSettings -from browser_use.browser import BrowserSession +from browser_use.browser import BrowserProfile, BrowserSession +from browser_use.config import CONFIG from browser_use.logging_config import addLoggingLevel -# Paths -USER_CONFIG_DIR = Path.home() / '.config' / 'browseruse' -USER_CONFIG_FILE = USER_CONFIG_DIR / 'config.json' -CHROME_PROFILES_DIR = USER_CONFIG_DIR / 'profiles' -USER_DATA_DIR = CHROME_PROFILES_DIR / 'cli' +USER_DATA_DIR = CONFIG.BROWSER_USE_PROFILES_DIR / 'cli' # Default User settings MAX_HISTORY_LENGTH = 100 # Ensure directories exist -USER_CONFIG_FILE.parent.mkdir(parents=True, exist_ok=True) +CONFIG.BROWSER_USE_CONFIG_FILE.parent.mkdir(parents=True, exist_ok=True) USER_DATA_DIR.mkdir(parents=True, exist_ok=True) @@ -90,11 +88,11 @@ def get_default_config() -> dict[str, Any]: 'name': None, 'temperature': 0.0, 'api_keys': { - 'OPENAI_API_KEY': os.getenv('OPENAI_API_KEY', ''), - 'ANTHROPIC_API_KEY': os.getenv('ANTHROPIC_API_KEY', ''), - 'GOOGLE_API_KEY': os.getenv('GOOGLE_API_KEY', ''), - 'DEEPSEEK_API_KEY': os.getenv('DEEPSEEK_API_KEY', ''), - 'GROK_API_KEY': os.getenv('GROK_API_KEY', ''), + 'OPENAI_API_KEY': CONFIG.OPENAI_API_KEY, + 'ANTHROPIC_API_KEY': CONFIG.ANTHROPIC_API_KEY, + 'GOOGLE_API_KEY': CONFIG.GOOGLE_API_KEY, + 'DEEPSEEK_API_KEY': CONFIG.DEEPSEEK_API_KEY, + 'GROK_API_KEY': CONFIG.GROK_API_KEY, }, }, 'agent': {}, # AgentSettings will use defaults @@ -109,14 +107,14 @@ def get_default_config() -> dict[str, Any]: def load_user_config() -> dict[str, Any]: """Load user configuration from file.""" - if not USER_CONFIG_FILE.exists(): + if not CONFIG.BROWSER_USE_CONFIG_FILE.exists(): # Create default config config = get_default_config() save_user_config(config) return config try: - with open(USER_CONFIG_FILE) as f: + with open(CONFIG.BROWSER_USE_CONFIG_FILE) as f: data = json.load(f) # Ensure data is a dictionary, not a list if isinstance(data, list): @@ -137,7 +135,7 @@ def save_user_config(config: dict[str, Any]) -> None: if len(config['command_history']) > MAX_HISTORY_LENGTH: config['command_history'] = config['command_history'][-MAX_HISTORY_LENGTH:] - with open(USER_CONFIG_FILE, 'w') as f: + with open(CONFIG.BROWSER_USE_CONFIG_FILE, 'w') as f: json.dump(config, f, indent=2) @@ -186,36 +184,38 @@ def get_llm(config: dict[str, Any]): temperature = config.get('model', {}).get('temperature', 0.0) # Set environment variables if they're in the config but not in the environment - if api_keys.get('openai') and not os.getenv('OPENAI_API_KEY'): + if api_keys.get('openai') and not CONFIG.OPENAI_API_KEY: os.environ['OPENAI_API_KEY'] = api_keys['openai'] - if api_keys.get('anthropic') and not os.getenv('ANTHROPIC_API_KEY'): + if api_keys.get('anthropic') and not CONFIG.ANTHROPIC_API_KEY: os.environ['ANTHROPIC_API_KEY'] = api_keys['anthropic'] - if api_keys.get('google') and not os.getenv('GOOGLE_API_KEY'): + if api_keys.get('google') and not CONFIG.GOOGLE_API_KEY: os.environ['GOOGLE_API_KEY'] = api_keys['google'] if model_name: if 
model_name.startswith('gpt'): - if not os.getenv('OPENAI_API_KEY'): + if not CONFIG.OPENAI_API_KEY: print('⚠️ OpenAI API key not found. Please update your config or set OPENAI_API_KEY environment variable.') sys.exit(1) return langchain_openai.ChatOpenAI(model=model_name, temperature=temperature) elif model_name.startswith('claude'): - if not os.getenv('ANTHROPIC_API_KEY'): + if not CONFIG.ANTHROPIC_API_KEY: print('⚠️ Anthropic API key not found. Please update your config or set ANTHROPIC_API_KEY environment variable.') sys.exit(1) - return langchain_anthropic.ChatAnthropic(model=model_name, temperature=temperature) + return langchain_anthropic.ChatAnthropic(model_name=model_name, temperature=temperature, timeout=30, stop=None) elif model_name.startswith('gemini'): - if not os.getenv('GOOGLE_API_KEY'): + if not CONFIG.GOOGLE_API_KEY: print('⚠️ Google API key not found. Please update your config or set GOOGLE_API_KEY environment variable.') sys.exit(1) return langchain_google_genai.ChatGoogleGenerativeAI(model=model_name, temperature=temperature) # Auto-detect based on available API keys - if os.getenv('OPENAI_API_KEY'): + if CONFIG.OPENAI_API_KEY: return langchain_openai.ChatOpenAI(model='gpt-4o', temperature=temperature) - elif os.getenv('ANTHROPIC_API_KEY'): - return langchain_anthropic.ChatAnthropic(model='claude-3.5-sonnet-exp', temperature=temperature) - elif os.getenv('GOOGLE_API_KEY'): + elif CONFIG.ANTHROPIC_API_KEY: + return langchain_anthropic.ChatAnthropic( + model_name='claude-3.5-sonnet-exp', temperature=temperature, timeout=30, stop=None + ) + elif CONFIG.GOOGLE_API_KEY: return langchain_google_genai.ChatGoogleGenerativeAI(model='gemini-2.0-flash-lite', temperature=temperature) else: print( @@ -420,10 +420,10 @@ class BrowserUseApp(App): def __init__(self, config: dict[str, Any], *args, **kwargs): super().__init__(*args, **kwargs) self.config = config - self.browser_session = None # Will be set before app.run_async() - self.controller = None # Will be set before app.run_async() - self.agent = None - self.llm = None # Will be set before app.run_async() + self.browser_session: BrowserSession | None = None # Will be set before app.run_async() + self.controller: Controller | None = None # Will be set before app.run_async() + self.agent: Agent | None = None + self.llm: Any | None = None # Will be set before app.run_async() self.task_history = config.get('command_history', []) # Track current position in history for up/down navigation self.history_index = len(self.task_history) @@ -437,7 +437,7 @@ class BrowserUseApp(App): pass # Level already exists, which is fine # Get the RichLog widget - rich_log = self.query_one('#results-log') + rich_log = self.query_one('#results-log', RichLog) # Create and set up the custom handler log_handler = RichLogHandler(rich_log) @@ -530,7 +530,7 @@ class BrowserUseApp(App): # Step 3: Focus the input field logger.debug('Focusing input field...') try: - input_field = self.query_one('#task-input') + input_field = self.query_one('#task-input', Input) input_field.focus() logger.debug('Input field focused') except Exception as e: @@ -550,8 +550,9 @@ class BrowserUseApp(App): def on_input_key_up(self, event: events.Key) -> None: """Handle up arrow key in the input field.""" - # Check if event is from the input field - if event.sender.id != 'task-input': + # For textual key events, we need to check focus manually + input_field = self.query_one('#task-input', Input) + if not input_field.has_focus: return # Only process if we have history @@ -561,9 +562,10 @@ 
class BrowserUseApp(App): # Move back in history if possible if self.history_index > 0: self.history_index -= 1 - self.query_one('#task-input').value = self.task_history[self.history_index] + task_input = self.query_one('#task-input', Input) + task_input.value = self.task_history[self.history_index] # Move cursor to end of text - self.query_one('#task-input').cursor_position = len(self.query_one('#task-input').value) + task_input.cursor_position = len(task_input.value) # Prevent default behavior (cursor movement) event.prevent_default() @@ -571,8 +573,9 @@ class BrowserUseApp(App): def on_input_key_down(self, event: events.Key) -> None: """Handle down arrow key in the input field.""" - # Check if event is from the input field - if event.sender.id != 'task-input': + # For textual key events, we need to check focus manually + input_field = self.query_one('#task-input', Input) + if not input_field.has_focus: return # Only process if we have history @@ -582,13 +585,14 @@ class BrowserUseApp(App): # Move forward in history or clear input if at the end if self.history_index < len(self.task_history) - 1: self.history_index += 1 - self.query_one('#task-input').value = self.task_history[self.history_index] + task_input = self.query_one('#task-input', Input) + task_input.value = self.task_history[self.history_index] # Move cursor to end of text - self.query_one('#task-input').cursor_position = len(self.query_one('#task-input').value) + task_input.cursor_position = len(task_input.value) elif self.history_index == len(self.task_history) - 1: # At the end of history, go to "new line" state self.history_index += 1 - self.query_one('#task-input').value = '' + self.query_one('#task-input', Input).value = '' # Prevent default behavior (cursor movement) event.prevent_default() @@ -677,7 +681,7 @@ class BrowserUseApp(App): def update_browser_panel(self) -> None: """Update browser information panel with details about the browser.""" - browser_info = self.query_one('#browser-info') + browser_info = self.query_one('#browser-info', RichLog) browser_info.clear() # Try to use the agent's browser session if available @@ -772,7 +776,7 @@ class BrowserUseApp(App): def update_model_panel(self) -> None: """Update model information panel with details about the LLM.""" - model_info = self.query_one('#model-info') + model_info = self.query_one('#model-info', RichLog) model_info.clear() if self.llm: @@ -810,8 +814,12 @@ class BrowserUseApp(App): # Get the last step metadata to show the most recent LLM response time if num_steps > 0 and self.agent.state.history.history[-1].metadata: last_step = self.agent.state.history.history[-1] - step_duration = last_step.metadata.duration_seconds - step_tokens = last_step.metadata.input_tokens + if last_step.metadata: + step_duration = last_step.metadata.duration_seconds + step_tokens = last_step.metadata.input_tokens + else: + step_duration = 0 + step_tokens = 0 if step_tokens > 0: tokens_per_second = step_tokens / step_duration if step_duration > 0 else 0 @@ -827,7 +835,7 @@ class BrowserUseApp(App): # Add current state information if hasattr(self.agent, 'running'): - if self.agent.running: + if getattr(self.agent, 'running', False): model_info.write('[yellow]LLM is thinking[blink]...[/][/]') elif hasattr(self.agent, 'state') and hasattr(self.agent.state, 'paused') and self.agent.state.paused: model_info.write('[orange]LLM paused[/]') @@ -836,7 +844,7 @@ class BrowserUseApp(App): def update_tasks_panel(self) -> None: """Update tasks information panel with details about the tasks and 
steps hierarchy.""" - tasks_info = self.query_one('#tasks-info') + tasks_info = self.query_one('#tasks-info', RichLog) tasks_info.clear() if self.agent: @@ -942,7 +950,7 @@ class BrowserUseApp(App): tasks_info.write('') # If agent is actively running, show a status indicator - if hasattr(self.agent, 'running') and self.agent.running: + if hasattr(self.agent, 'running') and getattr(self.agent, 'running', False): tasks_info.write('[yellow]Agent is actively working[blink]...[/][/]') elif hasattr(self.agent, 'state') and hasattr(self.agent.state, 'paused') and self.agent.state.paused: tasks_info.write('[orange]Agent is paused (press Enter to resume)[/]') @@ -973,14 +981,16 @@ class BrowserUseApp(App): self.update_info_panels() # Clear the log to start fresh - rich_log = self.query_one('#results-log') + rich_log = self.query_one('#results-log', RichLog) rich_log.clear() if self.agent is None: + if not self.llm: + raise RuntimeError('LLM not initialized') self.agent = Agent( task=task, llm=self.llm, - controller=self.controller, + controller=self.controller if self.controller else Controller(), browser_session=self.browser_session, source='cli', **agent_settings.model_dump(), @@ -996,19 +1006,22 @@ class BrowserUseApp(App): logger.debug('\n🚀 Working on task: %s', task) # Set flags to indicate the agent is running - self.agent.running = True - self.agent.last_response_time = 0 + if self.agent: + self.agent.running = True # type: ignore + self.agent.last_response_time = 0 # type: ignore # Panel updates are already happening via the timer in update_info_panels try: # Run the agent task, redirecting output to RichLog through our handler - await self.agent.run() + if self.agent: + await self.agent.run() except Exception as e: logger.error('\nError running agent: %s', str(e)) finally: # Clear the running flag - self.agent.running = False + if self.agent: + self.agent.running = False # type: ignore # No need to call update_info_panels() here as it's already updating via timer @@ -1019,7 +1032,7 @@ class BrowserUseApp(App): task_input_container.display = True # Refocus the input field - input_field = self.query_one('#task-input') + input_field = self.query_one('#task-input', Input) input_field.focus() # Ensure the input is visible by scrolling to it @@ -1031,7 +1044,7 @@ class BrowserUseApp(App): def action_input_history_prev(self) -> None: """Navigate to the previous item in command history.""" # Only process if we have history and input is focused - input_field = self.query_one('#task-input') + input_field = self.query_one('#task-input', Input) if not input_field.has_focus or not self.task_history: return @@ -1045,7 +1058,7 @@ class BrowserUseApp(App): def action_input_history_next(self) -> None: """Navigate to the next item in command history or clear input.""" # Only process if we have history and input is focused - input_field = self.query_one('#task-input') + input_field = self.query_one('#task-input', Input) if not input_field.has_focus or not self.task_history: return @@ -1131,7 +1144,7 @@ class BrowserUseApp(App): # Paths panel yield Static( - f' ⚙️ Settings & history saved to: {str(USER_CONFIG_FILE.resolve()).replace(str(Path.home()), "~")}\n' + f' ⚙️ Settings & history saved to: {str(CONFIG.BROWSER_USE_CONFIG_FILE.resolve()).replace(str(Path.home()), "~")}\n' f' 📁 Outputs & recordings saved to: {str(Path(".").resolve()).replace(str(Path.home()), "~")}', id='paths-panel', markup=True, @@ -1176,10 +1189,10 @@ async def run_prompt_mode(prompt: str, ctx: click.Context, debug: bool = False): # 
Create browser session with config parameters browser_config = config.get('browser', {}) + # Create BrowserProfile with user_data_dir + profile = BrowserProfile(user_data_dir=str(USER_DATA_DIR), **browser_config) browser_session = BrowserSession( - stealth=True, - user_data_dir=USER_DATA_DIR, - **browser_config, + browser_profile=profile, ) # Create and run agent @@ -1239,19 +1252,17 @@ async def textual_interface(config: dict[str, Any]): logger.info('Browser mode: visible') # Create BrowserSession directly with config parameters + # Create BrowserProfile with user_data_dir + profile = BrowserProfile(user_data_dir=str(USER_DATA_DIR), **browser_config) browser_session = BrowserSession( - stealth=True, - user_data_dir=USER_DATA_DIR, - **browser_config, + browser_profile=profile, ) logger.debug('BrowserSession initialized successfully') # Log browser version if available try: - if hasattr(browser_session, 'version') and browser_session.version: - logger.info(f'Browser version: {browser_session.version}') - elif hasattr(browser_session, 'playwright_browser') and browser_session.playwright_browser: - version = browser_session.playwright_browser.version + if hasattr(browser_session, 'browser') and browser_session.browser: + version = browser_session.browser.version logger.info(f'Browser version: {version}') except Exception as e: logger.debug(f'Could not determine browser version: {e}') @@ -1375,7 +1386,7 @@ def main(ctx: click.Context, debug: bool = False, **kwargs): logger.debug('Loading user configuration...') try: config = load_user_config() - logger.debug(f'User configuration loaded from {USER_CONFIG_FILE}') + logger.debug(f'User configuration loaded from {CONFIG.BROWSER_USE_CONFIG_FILE}') except Exception as e: logger.error(f'Error loading user configuration: {str(e)}', exc_info=True) print(f'Error loading configuration: {str(e)}') diff --git a/browser_use/config.py b/browser_use/config.py new file mode 100644 index 000000000..f36d8cf3f --- /dev/null +++ b/browser_use/config.py @@ -0,0 +1,161 @@ +"""Lazy-loading configuration system for browser-use environment variables.""" + +import os +from functools import cache +from pathlib import Path + +import psutil + + +@cache +def is_running_in_docker() -> bool: + """Detect if we are running in a docker container, for the purpose of optimizing chrome launch flags (dev shm usage, gpu settings, etc.)""" + try: + if Path('/.dockerenv').exists() or 'docker' in Path('/proc/1/cgroup').read_text().lower(): + return True + except Exception: + pass + + try: + # if init proc (PID 1) looks like uvicorn/python/uv/etc. then we're in Docker + # if init proc (PID 1) looks like bash/systemd/init/etc. 
then we're probably NOT in Docker + init_cmd = ' '.join(psutil.Process(1).cmdline()) + if ('py' in init_cmd) or ('uv' in init_cmd) or ('app' in init_cmd): + return True + except Exception: + pass + + try: + # if less than 10 total running procs, then we're almost certainly in a container + if len(psutil.pids()) < 10: + return True + except Exception: + pass + + return False + + +class Config: + """Lazy-loading configuration class for environment variables (env vars can change at runtime so we need to get them fresh on every access)""" + + # Cache for directory creation tracking + _dirs_created = False + + @property + def BROWSER_USE_LOGGING_LEVEL(self) -> str: + return os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() + + @property + def ANONYMIZED_TELEMETRY(self) -> bool: + return os.getenv('ANONYMIZED_TELEMETRY', 'true').lower()[:1] in 'ty1' + + @property + def BROWSER_USE_CLOUD_SYNC(self) -> bool: + return os.getenv('BROWSER_USE_CLOUD_SYNC', str(self.ANONYMIZED_TELEMETRY)).lower()[:1] in 'ty1' + + @property + def BROWSER_USE_CLOUD_API_URL(self) -> str: + url = os.getenv('BROWSER_USE_CLOUD_API_URL', 'https://api.browser-use.com') + assert '://' in url, 'BROWSER_USE_CLOUD_API_URL must be a valid URL' + return url + + @property + def BROWSER_USE_CLOUD_UI_URL(self) -> str: + url = os.getenv('BROWSER_USE_CLOUD_UI_URL', '') + # Allow empty string as default, only validate if set + if url and '://' not in url: + raise AssertionError('BROWSER_USE_CLOUD_UI_URL must be a valid URL if set') + return url + + # Path configuration + @property + def XDG_CACHE_HOME(self) -> Path: + return Path(os.getenv('XDG_CACHE_HOME', '~/.cache')).expanduser().resolve() + + @property + def XDG_CONFIG_HOME(self) -> Path: + return Path(os.getenv('XDG_CONFIG_HOME', '~/.config')).expanduser().resolve() + + @property + def BROWSER_USE_CONFIG_DIR(self) -> Path: + path = Path(os.getenv('BROWSER_USE_CONFIG_DIR', str(self.XDG_CONFIG_HOME / 'browseruse'))).expanduser().resolve() + self._ensure_dirs() + return path + + @property + def BROWSER_USE_CONFIG_FILE(self) -> Path: + return self.BROWSER_USE_CONFIG_DIR / 'config.json' + + @property + def BROWSER_USE_PROFILES_DIR(self) -> Path: + path = self.BROWSER_USE_CONFIG_DIR / 'profiles' + self._ensure_dirs() + return path + + @property + def BROWSER_USE_DEFAULT_USER_DATA_DIR(self) -> Path: + return self.BROWSER_USE_PROFILES_DIR / 'default' + + def _ensure_dirs(self) -> None: + """Create directories if they don't exist (only once)""" + if not self._dirs_created: + config_dir = ( + Path(os.getenv('BROWSER_USE_CONFIG_DIR', str(self.XDG_CONFIG_HOME / 'browseruse'))).expanduser().resolve() + ) + config_dir.mkdir(parents=True, exist_ok=True) + (config_dir / 'profiles').mkdir(parents=True, exist_ok=True) + self._dirs_created = True + + # LLM API key configuration + @property + def OPENAI_API_KEY(self) -> str: + return os.getenv('OPENAI_API_KEY', '') + + @property + def ANTHROPIC_API_KEY(self) -> str: + return os.getenv('ANTHROPIC_API_KEY', '') + + @property + def GOOGLE_API_KEY(self) -> str: + return os.getenv('GOOGLE_API_KEY', '') + + @property + def DEEPSEEK_API_KEY(self) -> str: + return os.getenv('DEEPSEEK_API_KEY', '') + + @property + def GROK_API_KEY(self) -> str: + return os.getenv('GROK_API_KEY', '') + + @property + def NOVITA_API_KEY(self) -> str: + return os.getenv('NOVITA_API_KEY', '') + + @property + def AZURE_OPENAI_ENDPOINT(self) -> str: + return os.getenv('AZURE_OPENAI_ENDPOINT', '') + + @property + def AZURE_OPENAI_KEY(self) -> str: + return 
os.getenv('AZURE_OPENAI_KEY', '') + + @property + def SKIP_LLM_API_KEY_VERIFICATION(self) -> bool: + return os.getenv('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[:1] in 'ty1' + + # Runtime hints + @property + def IN_DOCKER(self) -> bool: + return os.getenv('IN_DOCKER', 'false').lower()[:1] in 'ty1' or is_running_in_docker() + + @property + def IS_IN_EVALS(self) -> bool: + return os.getenv('IS_IN_EVALS', 'false').lower()[:1] in 'ty1' + + @property + def WIN_FONT_DIR(self) -> str: + return os.getenv('WIN_FONT_DIR', 'C:\\Windows\\Fonts') + + +# Create a singleton instance +CONFIG = Config() diff --git a/browser_use/controller/registry/service.py b/browser_use/controller/registry/service.py index b1424c8f4..86b6ff674 100644 --- a/browser_use/controller/registry/service.py +++ b/browser_use/controller/registry/service.py @@ -203,6 +203,10 @@ class Registry(Generic[Context]): raise ValueError(f'Action {func.__name__} requires file_system but none provided.') elif param.name == 'page': raise ValueError(f'Action {func.__name__} requires page but none provided.') + elif param.name == 'available_file_paths': + raise ValueError(f'Action {func.__name__} requires available_file_paths but none provided.') + elif param.name == 'file_system': + raise ValueError(f'Action {func.__name__} requires file_system but none provided.') else: raise ValueError(f"{func.__name__}() missing required special parameter '{param.name}'") call_args.append(value) @@ -218,6 +222,10 @@ class Registry(Generic[Context]): raise ValueError(f'Action {func.__name__} requires file_system but none provided.') elif param.name == 'page': raise ValueError(f'Action {func.__name__} requires page but none provided.') + elif param.name == 'available_file_paths': + raise ValueError(f'Action {func.__name__} requires available_file_paths but none provided.') + elif param.name == 'file_system': + raise ValueError(f'Action {func.__name__} requires file_system but none provided.') else: raise ValueError(f"{func.__name__}() missing required special parameter '{param.name}'") else: diff --git a/browser_use/controller/registry/views.py b/browser_use/controller/registry/views.py index 84353a640..fc371ff63 100644 --- a/browser_use/controller/registry/views.py +++ b/browser_use/controller/registry/views.py @@ -1,5 +1,5 @@ from collections.abc import Callable -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from langchain_core.language_models.chat_models import BaseChatModel from pydantic import BaseModel, ConfigDict @@ -9,7 +9,7 @@ from browser_use.browser.types import Page from browser_use.filesystem.file_system import FileSystem if TYPE_CHECKING: - from browser_use.agent.service import Context + pass class RegisteredAction(BaseModel): @@ -153,7 +153,7 @@ class SpecialActionParameters(BaseModel): # e.g. can contain anything, external db connections, file handles, queues, runtime config objects, etc. # that you might want to be able to access quickly from within many of your actions # browser-use code doesn't use this at all, we just pass it down to your actions for convenience - context: 'Context | None' = None + context: Any | None = None # browser-use session object, can be used to create new tabs, navigate, access playwright objects, etc. 
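A short illustration of the lazy-loading behavior of the new `CONFIG` singleton may help here. This is a minimal sketch based only on the properties shown in the diff above: env vars are re-read on every access, boolean flags are parsed from their first character, and path properties create the config directory before returning.

```python
import os

from browser_use.config import CONFIG

# Env vars are re-read on every property access, so runtime changes take effect immediately.
os.environ['BROWSER_USE_LOGGING_LEVEL'] = 'DEBUG'
assert CONFIG.BROWSER_USE_LOGGING_LEVEL == 'debug'  # normalized to lowercase

# Boolean flags check only the first character against 'ty1', so 'true', 'yes', and '1' all count as enabled.
os.environ['ANONYMIZED_TELEMETRY'] = 'yes'
assert CONFIG.ANONYMIZED_TELEMETRY is True

# Path properties ensure ~/.config/browseruse (or $BROWSER_USE_CONFIG_DIR) exists before returning.
print(CONFIG.BROWSER_USE_CONFIG_FILE)  # e.g. /home/user/.config/browseruse/config.json
```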
browser_session: BrowserSession | None = None diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py index 2a725bece..1d20df52f 100644 --- a/browser_use/controller/service.py +++ b/browser_use/controller/service.py @@ -174,9 +174,10 @@ class Controller(Generic[Context]): # SECURITY FIX: Use browser_session.navigate_to() instead of direct page.goto() # This ensures URL validation against allowed_domains is performed await browser_session.navigate_to(params.url) - msg = f'🔗 Navigated to {params.url}' + memory = f'Navigated to {params.url}' + msg = f'🔗 {memory}' logger.info(msg) - return ActionResult(extracted_content=msg, include_in_memory=True) + return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=memory) except Exception as e: error_msg = str(e) # Check for network-related errors @@ -239,7 +240,7 @@ class Controller(Generic[Context]): initial_pages = len(browser_session.tabs) # if element has file uploader then dont click - if await browser_session.find_file_upload_element_by_index(params.index) is not None: + if await browser_session.is_file_input_by_index(params.index): msg = f'Index {params.index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files ' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True, success=False, long_term_memory=msg) @@ -278,7 +279,7 @@ class Controller(Generic[Context]): return ActionResult(error=error_msg, success=False) @self.registry.action( - 'Input text into a input interactive element', + 'Click and input text into a input interactive element', param_model=InputTextAction, ) async def input_text(params: InputTextAction, browser_session: BrowserSession, has_sensitive_data: bool = False): @@ -287,7 +288,12 @@ class Controller(Generic[Context]): element_node = await browser_session.get_dom_element_by_index(params.index) assert element_node is not None, f'Element with index {params.index} does not exist' - await browser_session._input_text_element_node(element_node, params.text) + try: + await browser_session._input_text_element_node(element_node, params.text) + except Exception: + msg = f'Failed to input text into element {params.index}.' 
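The `long_term_memory` field used by the controller actions in this diff is the part the agent keeps across steps, while `extracted_content` is what it sees immediately. As a hedged sketch (not part of this diff; the action name and parameter are hypothetical, and the top-level imports are assumed from the library's public API), a custom action can follow the same pattern:

```python
from browser_use import ActionResult, Controller

controller = Controller()


@controller.action('Record a note the agent should remember for later steps')  # hypothetical custom action
async def record_note(note: str):
    memory = f'Noted: {note}'
    # extracted_content is shown to the agent now; long_term_memory persists across later steps.
    return ActionResult(extracted_content=f'📝 {memory}', include_in_memory=True, long_term_memory=memory)
```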
+ return ActionResult(error=msg) + if not has_sensitive_data: msg = f'⌨️ Input {params.text} into index {params.index}' else: @@ -367,6 +373,7 @@ Only use this for extracting info from a single product/article page, not for en query: str, page: Page, page_extraction_llm: BaseChatModel, + file_system: FileSystem, ): from functools import partial @@ -434,13 +441,24 @@ Explain the content of the page and that the requested information is not availa output = await page_extraction_llm.ainvoke(template.format(query=query, page=content)) output_text = output.content extracted_content = f'Page Link: {page.url}\nQuery: {query}\nExtracted Content:\n{output_text}' - # if content is small include it to memory - if len(extracted_content) < 1000: + MAX_MEMORY_SIZE = 600 + if len(extracted_content) < MAX_MEMORY_SIZE: memory = extracted_content include_extracted_content_only_once = False else: - memory = f'Extracted content from {page.url} for query "{query}"' + # find lines until MAX_MEMORY_SIZE + lines = extracted_content.splitlines() + display = '' + display_lines_count = 0 + for line in lines: + if len(display) + len(line) < MAX_MEMORY_SIZE: + display += line + '\n' + display_lines_count += 1 + else: + break + save_result = await file_system.save_extracted_content(extracted_content) + memory = f'Extracted content from {page.url}\n{query}\n\n\n{display}{len(lines) - display_lines_count} more lines...\n\n{save_result}' include_extracted_content_only_once = True logger.info(f'📄 {memory}') return ActionResult( @@ -502,7 +520,7 @@ Explain the content of the page and that the requested information is not availa dy = dy_result try: - await browser_session._scroll_container(dy) + await browser_session._scroll_container(cast(int, dy)) except Exception as e: # Hard fallback: always works on root scroller await page.evaluate('(y) => window.scrollBy(0, y)', dy) @@ -530,7 +548,7 @@ Explain the content of the page and that the requested information is not availa ) if action_result: return action_result - dy = -(dy_result) + dy = -(dy_result or 0) try: await browser_session._scroll_container(dy) @@ -615,6 +633,50 @@ Explain the content of the page and that the requested information is not availa logger.error(msg) return ActionResult(error=msg, include_in_memory=True) + # File System Actions + @self.registry.action('Write content to file_name in file system, use only .md or .txt extensions.') + async def write_file(file_name: str, content: str, file_system: FileSystem): + result = await file_system.write_file(file_name, content) + logger.info(f'💾 {result}') + return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result) + + @self.registry.action('Append content to file_name in file system') + async def append_file(file_name: str, content: str, file_system: FileSystem): + result = await file_system.append_file(file_name, content) + logger.info(f'💾 {result}') + return ActionResult(extracted_content=result, include_in_memory=True, long_term_memory=result) + + @self.registry.action('Read file_name from file system') + async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem): + if available_file_paths and file_name in available_file_paths: + import anyio + + async with await anyio.open_file(file_name, 'r') as f: + content = await f.read() + result = f'Read from file {file_name}.\n\n{content}\n' + else: + result = await file_system.read_file(file_name) + + MAX_MEMORY_SIZE = 1000 + if len(result) > MAX_MEMORY_SIZE: + lines = result.splitlines() + display 
= '' + for line in lines: + if len(display) + len(line) < MAX_MEMORY_SIZE: + display += line + '\n' + else: + break + memory = f'{display}{len(lines) - len(display)} more lines...' + else: + memory = result + logger.info(f'💾 {memory}') + return ActionResult( + extracted_content=result, + include_in_memory=True, + long_term_memory=memory, + include_extracted_content_only_once=True, + ) + @self.registry.action( description='Get all options from a native dropdown', ) diff --git a/browser_use/dom/buildDomTree.js b/browser_use/dom/buildDomTree.js index 1dec1fe8d..a1b63833f 100644 --- a/browser_use/dom/buildDomTree.js +++ b/browser_use/dom/buildDomTree.js @@ -4,11 +4,10 @@ focusHighlightIndex: -1, viewportExpansion: 0, debugMode: false, - initialIndex: 0, } ) => { - const { doHighlightElements, focusHighlightIndex, viewportExpansion, debugMode, initialIndex } = args; - let highlightIndex = initialIndex; // Reset highlight index + const { doHighlightElements, focusHighlightIndex, viewportExpansion, debugMode } = args; + let highlightIndex = 0; // Reset highlight index // Add timing stack to handle recursion const TIMING_STACK = { @@ -211,7 +210,7 @@ */ const DOM_HASH_MAP = {}; - const ID = { current: initialIndex }; + const ID = { current: 0 }; const HIGHLIGHT_CONTAINER_ID = "playwright-highlight-container"; @@ -837,7 +836,7 @@ } } - const getEventListenersForNode = window.getEventListenersForNode; + const getEventListenersForNode = element?.ownerDocument?.defaultView?.getEventListenersForNode || window.getEventListenersForNode; if (typeof getEventListenersForNode === 'function') { const listeners = getEventListenersForNode(element); const interactionEvents = ['click', 'mousedown', 'mouseup', 'keydown', 'keyup', 'submit', 'change', 'input', 'focus', 'blur']; @@ -1130,7 +1129,7 @@ // Check for other common interaction event listeners try { - const getEventListenersForNode = window.getEventListenersForNode; + const getEventListenersForNode = element?.ownerDocument?.defaultView?.getEventListenersForNode || window.getEventListenersForNode; if (typeof getEventListenersForNode === 'function') { const listeners = getEventListenersForNode(element); const interactionEvents = ['click', 'mousedown', 'mouseup', 'keydown', 'keyup', 'submit', 'change', 'input', 'focus', 'blur']; @@ -1355,7 +1354,6 @@ if (domElement) nodeData.children.push(domElement); } } - nodeData.hasIframeContent = true; } catch (e) { console.warn("Unable to access iframe:", e); } diff --git a/browser_use/dom/service.py b/browser_use/dom/service.py index 2da530e63..6100fe150 100644 --- a/browser_use/dom/service.py +++ b/browser_use/dom/service.py @@ -6,7 +6,6 @@ from urllib.parse import urlparse if TYPE_CHECKING: from browser_use.browser.types import Page -from dataclasses import dataclass from browser_use.dom.views import ( DOMBaseNode, @@ -24,42 +23,6 @@ from browser_use.utils import time_execution_async # height: int -@dataclass -class PageFrameEvaluationResult: - url: str - result: dict - name: str | None = None - id: str | None = None - - @property - def known_frame_urls(self) -> list[str]: - return [ - v.get('attributes', {}).get('src') - for v in self.map.values() - if v.get('hasIframeContent') and v.get('attributes', {}).get('src') - ] - - @property - def map(self) -> dict: - return self.result.get('map', {}) - - @property - def map_size(self) -> int: - return len(self.map) - - @property - def perf_metrics(self) -> dict: - return self.result.get('perfMetrics', {}) - - @property - def short_url(self) -> str: - return self.url[:50] + 
'...' if len(self.url) > 50 else self.url - - @property - def root_id(self) -> str | None: - return self.result.get('rootId') - - class DomService: logger: logging.Logger @@ -132,160 +95,73 @@ class DomService: 'focusHighlightIndex': focus_element, 'viewportExpansion': viewport_expansion, 'debugMode': debug_mode, - 'initialIndex': 0, } try: eval_page: dict = await self.page.evaluate(self.js_code, args) - page_eval_result = PageFrameEvaluationResult( - url=self.page.url, - result=eval_page, - ) except Exception as e: self.logger.error('Error evaluating JavaScript: %s', e) raise - frames = [page_eval_result] - total_map_size = page_eval_result.map_size - - known_frame_urls = page_eval_result.known_frame_urls - # TODO: only look in iframes from enabled_domains - for iframe in self.page.frames: - if ( - iframe.url - and iframe.url != self.page.url - and not iframe.url.startswith('data:') - and iframe.url not in known_frame_urls - ): - try: - frame_element = await iframe.frame_element() - except Exception as e: - self.logger.error('Error getting frame element for iframe %s: %s', iframe.url, e) - continue - - if not await frame_element.is_visible(): - continue - - args['initialIndex'] = total_map_size # continue indexing from the last index - try: - name = await frame_element.get_attribute('name') - id = await frame_element.get_attribute('id') - iframe_eval_result = await iframe.evaluate(self.js_code, args) - frame = PageFrameEvaluationResult( - url=iframe.url, - result=iframe_eval_result, - name=name, - id=id, - ) - frames.append(frame) - known_frame_urls.append(iframe.url) - known_frame_urls.extend(frame.known_frame_urls) - total_map_size += frame.map_size - except Exception as e: - self.logger.error('Error evaluating JavaScript in iframe %s: %s', iframe.url, e) - continue - # Only log performance metrics in debug mode - if debug_mode and len(frames) > 1: - for index, frame in enumerate(frames): - perf = frame.perf_metrics - if perf: - # Get key metrics for summary - total_nodes = perf.get('nodeMetrics', {}).get('totalNodes', 0) - # processed_nodes = perf.get('nodeMetrics', {}).get('processedNodes', 0) + if debug_mode and 'perfMetrics' in eval_page: + perf = eval_page['perfMetrics'] - # Count interactive elements from the DOM map - interactive_count = 0 - for node_data in frame.map.values(): - if isinstance(node_data, dict) and node_data.get('isInteractive'): - interactive_count += 1 + # Get key metrics for summary + total_nodes = perf.get('nodeMetrics', {}).get('totalNodes', 0) + # processed_nodes = perf.get('nodeMetrics', {}).get('processedNodes', 0) - # Create concise summary - self.logger.debug( - f'🔎 Ran buildDOMTree.js interactive element detection on{" iframe" if index > 0 else ""}: %s interactive=%d/%d\n', - frame.short_url, - interactive_count, - total_nodes, - # processed_nodes, - ) + # Count interactive elements from the DOM map + interactive_count = 0 + if 'map' in eval_page: + for node_data in eval_page['map'].values(): + if isinstance(node_data, dict) and node_data.get('isInteractive'): + interactive_count += 1 - return await self._construct_dom_tree(frames) + # Create concise summary + url_short = self.page.url[:50] + '...' 
if len(self.page.url) > 50 else self.page.url + self.logger.debug( + '🔎 Ran buildDOMTree.js interactive element detection on: %s interactive=%d/%d\n', + url_short, + interactive_count, + total_nodes, + # processed_nodes, + ) + + return await self._construct_dom_tree(eval_page) @time_execution_async('--construct_dom_tree') async def _construct_dom_tree( self, - frames: list[PageFrameEvaluationResult], + eval_page: dict, ) -> tuple[DOMElementNode, SelectorMap]: - # The first page in eval_pages is the main page, and it contains the rootId - js_root_id = frames[0].root_id - if js_root_id is None: - raise ValueError('No rootId found in the evaluated page structure') + js_node_map = eval_page['map'] + js_root_id = eval_page['rootId'] - selector_map: SelectorMap = {} - node_map: dict[str, DOMBaseNode] = {} + selector_map = {} + node_map = {} - for frame in frames: - js_node_map = frame.map - for id, node_data in js_node_map.items(): - node, children_ids = self._parse_node(node_data) - if node is None: - continue + for id, node_data in js_node_map.items(): + node, children_ids = self._parse_node(node_data) + if node is None: + continue - node_map[id] = node + node_map[id] = node - if isinstance(node, DOMElementNode) and node.highlight_index is not None: - selector_map[node.highlight_index] = node + if isinstance(node, DOMElementNode) and node.highlight_index is not None: + selector_map[node.highlight_index] = node - # NOTE: We know that we are building the tree bottom up - # and all children are already processed. - if isinstance(node, DOMElementNode): - for child_id in children_ids: - if child_id not in node_map: - continue - - child_node = node_map[child_id] - - child_node.parent = node - node.children.append(child_node) - - # For each child iframe, we need to set the parent of the root element to the iframe element. - for frame in frames[1:]: - content_root_node = node_map.get(frame.root_id) - if content_root_node: - # Find the iframe element in the main page - iframe_element_node = next( - ( - node - for node in node_map.values() - if isinstance(node, DOMElementNode) - and node.is_iframe_element(url=frame.url, name=frame.name, id=frame.id) - ), - None, - ) - if iframe_element_node: - if not iframe_element_node.children: - iframe_element_node.children = [content_root_node] - content_root_node.parent = iframe_element_node + # NOTE: We know that we are building the tree bottom up + # and all children are already processed. + if isinstance(node, DOMElementNode): + for child_id in children_ids: + if child_id not in node_map: continue - else: - self.logger.warning( - 'Iframe element %s already has children, skipping', - frame.short_url, - ) - else: - self.logger.warning( - 'Could not find iframe element for %s in the main page DOM', - frame.short_url, - ) - # If we could not find the iframe element, remove the frame's nodes from the maps. 
- for id in frame.map.keys(): - node = node_map.get(id) - # Remove the node from the selector map if it has a highlight index - if isinstance(node, DOMElementNode) and node.highlight_index is not None and node.highlight_index in selector_map: - del selector_map[node.highlight_index] + child_node = node_map[child_id] - del node_map[id] + child_node.parent = node + node.children.append(child_node) html_to_dict = node_map[str(js_root_id)] diff --git a/browser_use/dom/views.py b/browser_use/dom/views.py index b9948054c..0964f37cc 100644 --- a/browser_use/dom/views.py +++ b/browser_use/dom/views.py @@ -223,21 +223,13 @@ class DOMElementNode(DOMBaseNode): elif isinstance(node, DOMTextNode): # Add text only if it doesn't have a highlighted parent if ( - node.parent.highlight_index is None and node.parent and node.parent.is_visible and node.parent.is_top_element + node.parent and node.parent.highlight_index is None and node.parent.is_visible and node.parent.is_top_element ): # and node.is_parent_top_element() formatted_text.append(f'{depth_str}{node.text}') process_node(self, 0) return '\n'.join(formatted_text) - def is_iframe_element(self, url: str, name: str | None = None, id: str | None = None) -> bool: - return ( - self.tag_name.lower() == 'iframe' - and self.attributes.get('src') == url - and (name is None or self.attributes.get('name') == name) - and (id is None or self.attributes.get('id') == id) - ) - SelectorMap = dict[int, DOMElementNode] diff --git a/browser_use/filesystem/file_system.py b/browser_use/filesystem/file_system.py index b4562aa07..76d6eedbb 100644 --- a/browser_use/filesystem/file_system.py +++ b/browser_use/filesystem/file_system.py @@ -25,10 +25,17 @@ class FileSystem: self.todo_file = self.dir / 'todo.md' self.results_file.touch(exist_ok=True) self.todo_file.touch(exist_ok=True) + self.extracted_content_count = 0 def get_dir(self) -> Path: return self.dir + async def save_extracted_content(self, content: str) -> str: + extracted_content_file_name = f'extracted_content_{self.extracted_content_count}.md' + result = await self.write_file(extracted_content_file_name, content) + self.extracted_content_count += 1 + return result + def _is_valid_filename(self, file_name: str) -> bool: """Check if filename matches the required pattern: name.extension""" pattern = r'^[a-zA-Z0-9_\-]+\.(txt|md)$' @@ -59,7 +66,7 @@ class FileSystem: with ThreadPoolExecutor() as executor: # Run file read in a thread to avoid blocking content = await asyncio.get_event_loop().run_in_executor(executor, lambda: path.read_text()) - return f'Read from file {file_name}:\n{content}' + return f'Read from file {file_name}.\n\n{content}\n' except Exception: return f"Error: Could not read file '{file_name}'." @@ -98,17 +105,88 @@ class FileSystem: return f"Error: Could not append to file '{file_name}'. {str(e)}" def describe(self) -> str: - """List all files with their line counts.""" - description = '' - for f in self.dir.iterdir(): - if f.is_file(): - try: - num_lines = len(f.read_text().splitlines()) - description += f'- {f.name} — {num_lines} lines\n' - except Exception: - description += f'- {f.name} — [error reading file]\n' + """List all files with their content information. - return description + Example output: + + results.md - 42 lines + + {preview_start} + ... {n_lines} more lines ... 
+ {preview_end} + + + """ + DISPLAY_CHARS = 400 # Total characters to display (split between start and end) + description = '' + + for f in self.dir.iterdir(): + # Only process files and skip todo.md + if (not f.is_file()) or f.name == 'todo.md': + continue + + try: + content = f.read_text() + + # Handle empty files + if not content: + description += f'\n{f.name} - [empty file]\n\n\n' + continue + + lines = content.splitlines() + line_count = len(lines) + + # For small files, display the entire content + whole_file_description = f'\n{f.name} - {line_count} lines\n\n{content}\n\n\n' + if len(content) < int(1.5 * DISPLAY_CHARS): + description += whole_file_description + continue + + # For larger files, display start and end previews + half_display_chars = DISPLAY_CHARS // 2 + + # Get start preview + start_preview = '' + start_line_count = 0 + chars_count = 0 + for line in lines: + if chars_count + len(line) + 1 > half_display_chars: + break + start_preview += line + '\n' + chars_count += len(line) + 1 + start_line_count += 1 + + # Get end preview + end_preview = '' + end_line_count = 0 + chars_count = 0 + for line in reversed(lines): + if chars_count + len(line) + 1 > half_display_chars: + break + end_preview = line + '\n' + end_preview + chars_count += len(line) + 1 + end_line_count += 1 + + # Calculate lines in between + middle_line_count = line_count - start_line_count - end_line_count + if middle_line_count <= 0: + # display the entire file + description += whole_file_description + continue + + start_preview = start_preview.strip('\n').rstrip() + end_preview = end_preview.strip('\n').rstrip() + + # Format output + description += f'\n{f.name} - {line_count} lines\n\n{start_preview}\n' + description += f'... {middle_line_count} more lines ...\n' + description += f'{end_preview}\n' + description += '\n\n' + + except Exception: + description += f'\n{f.name} - [error reading file]\n\n\n' + + return description.strip('\n') def get_todo_contents(self) -> str: return self.todo_file.read_text() diff --git a/browser_use/logging_config.py b/browser_use/logging_config.py index d322907be..1d2f1c5a8 100644 --- a/browser_use/logging_config.py +++ b/browser_use/logging_config.py @@ -1,11 +1,12 @@ import logging -import os import sys from dotenv import load_dotenv load_dotenv() +from browser_use.config import CONFIG + def addLoggingLevel(levelName, levelNum, methodName=None): """ @@ -65,7 +66,7 @@ def setup_logging(): except AttributeError: pass # Level already exists, which is fine - log_type = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() + log_type = CONFIG.BROWSER_USE_LOGGING_LEVEL # Check if handlers are already set up if logging.getLogger().hasHandlers(): diff --git a/browser_use/sync/auth.py b/browser_use/sync/auth.py index 3768b4643..a00922824 100644 --- a/browser_use/sync/auth.py +++ b/browser_use/sync/auth.py @@ -11,6 +11,8 @@ from datetime import datetime import httpx from pydantic import BaseModel +from browser_use.config import CONFIG + # Temporary user ID for pre-auth events (matches cloud backend) TEMP_USER_ID = '99999999-9999-9999-9999-999999999999' @@ -25,9 +27,8 @@ class CloudAuthConfig(BaseModel): @classmethod def load_from_file(cls) -> 'CloudAuthConfig': """Load auth config from local file""" - from browser_use.utils import BROWSER_USE_CONFIG_DIR - config_path = BROWSER_USE_CONFIG_DIR / 'cloud_auth.json' + config_path = CONFIG.BROWSER_USE_CONFIG_DIR / 'cloud_auth.json' if config_path.exists(): try: with open(config_path) as f: @@ -40,11 +41,10 @@ class 
CloudAuthConfig(BaseModel): def save_to_file(self) -> None: """Save auth config to local file""" - from browser_use.utils import BROWSER_USE_CONFIG_DIR - BROWSER_USE_CONFIG_DIR.mkdir(parents=True, exist_ok=True) + CONFIG.BROWSER_USE_CONFIG_DIR.mkdir(parents=True, exist_ok=True) - config_path = BROWSER_USE_CONFIG_DIR / 'cloud_auth.json' + config_path = CONFIG.BROWSER_USE_CONFIG_DIR / 'cloud_auth.json' with open(config_path, 'w') as f: json.dump(self.model_dump(mode='json'), f, indent=2, default=str) @@ -61,7 +61,7 @@ class DeviceAuthClient: def __init__(self, base_url: str | None = None, http_client: httpx.AsyncClient | None = None): # Backend API URL for OAuth requests - can be passed directly or defaults to env var - self.base_url = base_url or os.getenv('BROWSER_USE_CLOUD_URL', 'https://cloud.browser-use.com') + self.base_url = base_url or CONFIG.BROWSER_USE_CLOUD_API_URL self.client_id = 'library' self.scope = 'read write' @@ -124,8 +124,8 @@ class DeviceAuthClient: async def poll_for_token( self, device_code: str, - interval: int = 5, - timeout: int = 1800, + interval: float = 3.0, + timeout: float = 1800.0, ) -> dict | None: """ Poll for the access token. @@ -257,7 +257,7 @@ class DeviceAuthClient: device_auth = await self.start_device_authorization(agent_session_id) # Use frontend URL for user-facing links - frontend_url = os.getenv('BROWSER_USE_CLOUD_UI_URL', self.base_url) + frontend_url = CONFIG.BROWSER_USE_CLOUD_UI_URL or self.base_url.replace('//api.', '//cloud.') # Replace backend URL with frontend URL in verification URIs verification_uri = device_auth['verification_uri'].replace(self.base_url, frontend_url) @@ -290,9 +290,13 @@ class DeviceAuthClient: except Exception as e: # Log the error details for debugging if hasattr(e, 'response'): - logger.debug( - f'Failed to get pre-auth token for cloud sync: HTTP {e.response.status_code} - {e.response.text[:200]}' - ) + response = getattr(e, 'response') + if hasattr(response, 'status_code') and hasattr(response, 'text'): + logger.debug( + f'Failed to get pre-auth token for cloud sync: HTTP {response.request.url} {response.status_code} - {response.text}' + ) + else: + logger.debug(f'Failed to get pre-auth token for cloud sync: {type(e).__name__}: {e}') else: logger.debug(f'Failed to get pre-auth token for cloud sync: {type(e).__name__}: {e}') diff --git a/browser_use/sync/service.py b/browser_use/sync/service.py index d9f20de71..9de8870ea 100644 --- a/browser_use/sync/service.py +++ b/browser_use/sync/service.py @@ -5,12 +5,12 @@ Cloud sync service for sending events to the Browser Use cloud. 
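The UI-URL fallback above is just a string substitution on the API base URL. A tiny standalone sketch of that expression (the function name is mine; only the expression and the default URLs come from the diff):

```python
def derive_frontend_url(api_base_url: str, configured_ui_url: str = '') -> str:
    # An explicitly configured BROWSER_USE_CLOUD_UI_URL wins; otherwise rewrite the API host.
    return configured_ui_url or api_base_url.replace('//api.', '//cloud.')


assert derive_frontend_url('https://api.browser-use.com') == 'https://cloud.browser-use.com'
assert derive_frontend_url('https://api.browser-use.com', 'https://ui.example.com') == 'https://ui.example.com'
```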
import asyncio import json import logging -import os import anyio import httpx from bubus import BaseEvent +from browser_use.config import CONFIG from browser_use.sync.auth import TEMP_USER_ID, DeviceAuthClient logger = logging.getLogger(__name__) @@ -21,10 +21,10 @@ class CloudSync: def __init__(self, base_url: str | None = None, enable_auth: bool = True): # Backend API URL for all API requests - can be passed directly or defaults to env var - self.base_url = base_url or os.getenv('BROWSER_USE_CLOUD_URL', 'https://cloud.browser-use.com') + self.base_url = base_url or CONFIG.BROWSER_USE_CLOUD_API_URL self.enable_auth = enable_auth self.auth_client = DeviceAuthClient(base_url=self.base_url) if enable_auth else None - self.pending_events: list[dict] = [] + self.pending_events: list[BaseEvent] = [] self.auth_task = None self.session_id: str | None = None @@ -33,42 +33,30 @@ class CloudSync: try: # Extract session ID from CreateAgentSessionEvent if event.event_type == 'CreateAgentSession' and hasattr(event, 'id'): - self.session_id = event.id + self.session_id = str(event.id) # type: ignore # Start authentication flow if enabled and not authenticated if self.enable_auth and self.auth_client and not self.auth_client.is_authenticated: # Start auth in background self.auth_task = asyncio.create_task(self._background_auth(agent_session_id=self.session_id)) - # Prepare event data - event_data = self._prepare_event_data(event) - # Send event to cloud - await self._send_event(event_data) + await self._send_event(event) except Exception as e: logger.error(f'Failed to handle {event.event_type} event: {type(e).__name__}: {e}', exc_info=True) - def _prepare_event_data(self, event: BaseEvent) -> dict: - """Prepare event data for cloud API""" - # Get user_id from auth client or use temp ID - user_id = self.auth_client.user_id if self.auth_client else TEMP_USER_ID - - # Set user_id directly on event (mutating the event) - # Use setattr to handle cases where user_id might not be a defined field - if hasattr(event, 'user_id') or hasattr(event, '__dict__'): - event.user_id = user_id - else: - logger.debug(f'Could not set user_id on event type {type(event).__name__}') - - # Return event directly as dict - return event.model_dump(mode='json') - - async def _send_event(self, event_data: dict) -> None: + async def _send_event(self, event: BaseEvent) -> None: """Send event to cloud API""" try: headers = {} + # override user_id on event with auth client user_id if available + if self.auth_client: + event.user_id = str(self.auth_client.user_id) # type: ignore + else: + event.user_id = TEMP_USER_ID # type: ignore + # Add auth headers if available if self.auth_client: headers.update(self.auth_client.get_headers()) @@ -76,29 +64,33 @@ class CloudSync: # Send event (batch format with direct BaseEvent serialization) async with httpx.AsyncClient() as client: response = await client.post( - f'{self.base_url.rstrip("/")}/api/v1/events/', - json={'events': [event_data]}, + f'{self.base_url.rstrip("/")}/api/v1/events', + json={'events': [event.model_dump(mode='json')]}, headers=headers, timeout=10.0, ) if response.status_code == 401 and self.auth_client and not self.auth_client.is_authenticated: # Store event for retry after auth - self.pending_events.append(event_data) + self.pending_events.append(event) elif response.status_code >= 400: # Log error but don't raise - we want to fail silently - logger.warning(f'Failed to send event to cloud: HTTP {response.status_code} - {response.text[:200]}') + logger.warning( + f'Failed to 
send event to cloud: POST {response.request.url} {response.status_code} - {response.text}' + ) except httpx.TimeoutException: - logger.warning(f'Event send timed out after 10 seconds - event_type={event_data.get("event_type", "unknown")}') + logger.warning(f'⚠️ Event send timed out after 10 seconds: {event}') except httpx.ConnectError as e: - logger.warning(f'Failed to connect to cloud service at {self.base_url}: {e}') + logger.warning(f'⚠️ Failed to connect to cloud service at {self.base_url}: {e}') except httpx.HTTPError as e: - logger.warning(f'HTTP error sending event: {type(e).__name__}: {e}') + logger.warning(f'⚠️ HTTP error sending event {event}: {type(e).__name__}: {e}') except Exception as e: - logger.warning(f'Unexpected error sending {event_data.get("event_type", "unknown")} event: {type(e).__name__}: {e}') + logger.warning(f'⚠️ Unexpected error sending event {event}: {type(e).__name__}: {e}') async def _background_auth(self, agent_session_id: str) -> None: """Run authentication in background""" + assert self.auth_client, 'enable_auth=True must be set before calling CloudSync_background_auth()' + assert self.session_id, 'session_id must be set before calling CloudSync._background_auth() can fire' try: # Run authentication success = await self.auth_client.authenticate( @@ -121,15 +113,10 @@ class CloudSync: if not self.pending_events: return - # Update user_id in pending events - user_id = self.auth_client.user_id - for event_data in self.pending_events: - event_data['user_id'] = user_id - # Send all pending events - for event_data in self.pending_events: + for event in self.pending_events: try: - await self._send_event(event_data) + await self._send_event(event) except Exception as e: logger.warning(f'Failed to resend pending event: {e}') @@ -138,11 +125,13 @@ class CloudSync: async def _update_wal_user_ids(self, session_id: str) -> None: """Update user IDs in WAL file after authentication""" try: - from browser_use.utils import BROWSER_USE_CONFIG_DIR + assert self.auth_client, 'Cloud sync must be authenticated to update WAL user ID' - wal_path = BROWSER_USE_CONFIG_DIR / 'events' / f'{session_id}.jsonl' + wal_path = CONFIG.BROWSER_USE_CONFIG_DIR / 'events' / f'{session_id}.jsonl' if not await anyio.Path(wal_path).exists(): - return + raise FileNotFoundError( + f'CloudSync failed to update saved event user_ids after auth: Agent EventBus WAL file not found: {wal_path}' + ) # Read all events events = [] diff --git a/browser_use/telemetry/service.py b/browser_use/telemetry/service.py index 9b88785cd..234217464 100644 --- a/browser_use/telemetry/service.py +++ b/browser_use/telemetry/service.py @@ -6,11 +6,11 @@ from dotenv import load_dotenv from posthog import Posthog from uuid_extensions import uuid7str -from browser_use.telemetry.views import BaseTelemetryEvent -from browser_use.utils import singleton - load_dotenv() +from browser_use.config import CONFIG +from browser_use.telemetry.views import BaseTelemetryEvent +from browser_use.utils import singleton logger = logging.getLogger(__name__) @@ -22,8 +22,7 @@ POSTHOG_EVENT_SETTINGS = { def xdg_cache_home() -> Path: default = Path.home() / '.cache' - env_var = os.getenv('XDG_CACHE_HOME') - if env_var and (path := Path(env_var)).is_absolute(): + if CONFIG.XDG_CACHE_HOME and (path := Path(CONFIG.XDG_CACHE_HOME)).is_absolute(): return path return default @@ -44,8 +43,8 @@ class ProductTelemetry: _curr_user_id = None def __init__(self) -> None: - telemetry_disabled = os.getenv('ANONYMIZED_TELEMETRY', 'true').lower() == 'false' - 
self.debug_logging = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() == 'debug' + telemetry_disabled = not CONFIG.ANONYMIZED_TELEMETRY + self.debug_logging = CONFIG.BROWSER_USE_LOGGING_LEVEL == 'debug' if telemetry_disabled: self._posthog_client = None diff --git a/browser_use/utils.py b/browser_use/utils.py index 7e1d90055..fe91a7555 100644 --- a/browser_use/utils.py +++ b/browser_use/utils.py @@ -12,6 +12,11 @@ from sys import stderr from typing import Any, ParamSpec, TypeVar from urllib.parse import urlparse +from dotenv import load_dotenv + +load_dotenv() + + logger = logging.getLogger(__name__) # Import error types - these may need to be adjusted based on actual import paths @@ -21,11 +26,10 @@ except ImportError: OpenAIBadRequestError = None try: - from groq import BadRequestError as GroqBadRequestError + from groq import BadRequestError as GroqBadRequestError # type: ignore[import-not-found] except ImportError: GroqBadRequestError = None -# Browser Use configuration directory -BROWSER_USE_CONFIG_DIR = Path.home() / '.config' / 'browseruse' + # Global flag to prevent duplicate exit messages _exiting = False @@ -539,25 +543,25 @@ def handle_llm_error(e: Exception) -> tuple[dict[str, Any], Any | None]: Handle LLM API errors and extract failed generation data when available. Args: - e: The exception that occurred during LLM API call + e: The exception that occurred during LLM API call Returns: - Tuple containing: - - response: Dict with 'raw' and 'parsed' keys - - parsed: Parsed data (None if extraction was needed) + Tuple containing: + - response: Dict with 'raw' and 'parsed' keys + - parsed: Parsed data (None if extraction was needed) Raises: - LLMException: If the error is not a recognized type with failed generation data + LLMException: If the error is not a recognized type with failed generation data """ # Handle OpenAI BadRequestError with failed_generation if ( OpenAIBadRequestError and isinstance(e, OpenAIBadRequestError) and hasattr(e, 'body') - and e.body - and 'failed_generation' in e.body + and e.body # type: ignore[attr-defined] + and 'failed_generation' in e.body # type: ignore[operator] ): - raw = e.body['failed_generation'] + raw = e.body['failed_generation'] # type: ignore[index] response = {'raw': raw, 'parsed': None} parsed = None logger.debug(f'Failed to do tool call, trying to parse raw response: {raw}') @@ -565,14 +569,16 @@ def handle_llm_error(e: Exception) -> tuple[dict[str, Any], Any | None]: # Handle Groq BadRequestError with failed_generation if ( - GroqBadRequestError + GroqBadRequestError is not None and isinstance(e, GroqBadRequestError) and hasattr(e, 'body') - and e.body - and 'error' in e.body - and 'failed_generation' in e.body['error'] + and e.body # type: ignore[attr-defined] + and isinstance(e.body, dict) # type: ignore[attr-defined] + and 'error' in e.body # type: ignore[attr-defined] + and isinstance(e.body['error'], dict) # type: ignore[attr-defined,index] + and 'failed_generation' in e.body['error'] # type: ignore[attr-defined,index] ): - raw = e.body['error']['failed_generation'] # type: ignore + raw = e.body['error']['failed_generation'] # type: ignore[attr-defined,index] response = {'raw': raw, 'parsed': None} parsed = None logger.debug(f'Failed to do tool call, trying to parse raw response: {raw}') @@ -599,7 +605,7 @@ def get_browser_use_version() -> str: match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', content) if match: version = f'{match.group(1)}' - os.environ['LIBRARY_VERSION'] = version + os.environ['LIBRARY_VERSION'] = 
version # used by bubus event_schema so all Event schemas include versioning return version # If pyproject.toml doesn't exist, try getting version from pip diff --git a/docs/customize/browser-settings.mdx b/docs/customize/browser-settings.mdx index f2d8762f0..05d6fa29f 100644 --- a/docs/customize/browser-settings.mdx +++ b/docs/customize/browser-settings.mdx @@ -30,8 +30,8 @@ agent = Agent('fill out the form on this page', browser_session=browser_session) ## `BrowserSession` -- 🎭 `BrowserSession(**params)` is Browser Use's object that tracks a playwright connection to a running browser. It sets up: - - the `playwright` library, `browser` and/or `browser_context`, and `page` objects and tracks which tabs the agent & human are focused on +- `BrowserSession(**params)` is Browser Use's object that tracks a connection to a running browser. It sets up: + - the `playwright`, `browser`, `browser_context`, and `page` objects and tracks which tabs the agent/human are focused on - methods to interact with the browser window, apply config needed by the Agent, and run the `DOMService` for element detection - it can take a `browser_profile=BrowserProfile(...)` template containing some config defaults, and `**kwargs` session-specific config overrides @@ -271,28 +271,38 @@ Glob patterns are supported: disable_security: bool = False ``` -Completely disables all basic browser security features. Allows interacting across cross-site iFrames boundaries, but - -This option is very INSECURE and is only for niche use cases. DO NOT LET YOUR AGENT visit untrusted URLs or give it real cookies when `disable_security=True`. -Visiting a single malicious site in this mode can trivially compromise *all* the cookies in the browser profile in under 1 second. +⚠️ Setting this to `True` is NOT RECOMMENDED. +It completely disables all basic browser security features. +This option is for debugging and interacting across cross-origin iFrames when there are no cookies or sensitive data in use. +It's very INSECURE, under no circumstances should you enable this while using real cookies or sensitive data, visiting a single untrusted URL in this mode can immediately compromise all the profile cookies instantly. Consider a less nuclear option like `bypass_csp=True` instead. + #### `deterministic_rendering` ```python deterministic_rendering: bool = False ``` -Attempt to forced more deterministic rendering for consistent screenshots across different host operating systems and hardware. - -Disables OS-specific font hints, aliasing, GPU-accelerated rendering, normalizes DPI, and sets a specific JS random seed to try to avoid nondeterministic JS. - -This flag is for niche use cases (e.g. screenshot diffing) where pixel-perfect rendering across different server operating systems is more important than stability. -It makes the agent more likely to be blocked as a bot and triggers some glitchy behavior in chrome occasionally, it's not recommended unless you know you need it. +⚠️ Setting this to `True` is NOT RECOMMENDED. +It can be glitchy & slow, and it increases chances of getting blocked by anti-bot systems. It's mostly useful for QA applications. + + +It's a shortcut for adding these launch args: + +- `--deterministic-mode` +- `--js-flags=--random-seed=1157259159` +- `--force-color-profile=srgb` +- `--font-render-hinting=none` +- `--force-device-scale-factor=2` +- `--enable-webgl` + +With these options fonts will look slightly worse than macOS and slightly than Windows, but rendering will be more consistent between OSs and runs. 
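If you do need it, a minimal way to opt in looks like the following; this is an illustrative sketch, with the top-level imports assumed from the rest of these docs.

```python
from browser_use import BrowserProfile, BrowserSession

# Shortcut for the launch flags listed above; mainly useful for screenshot-diffing / QA runs.
profile = BrowserProfile(deterministic_rendering=True)
browser_session = BrowserSession(browser_profile=profile)
```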
The cost is performance and stability. Software rendering is slower, easier to fingerprint as a bot, and sometimes glitchy. You likely *don't need this option* unless you're trying to do screenshot diffing. + #### `highlight_elements` ```python @@ -388,7 +398,23 @@ No need to set this unless you have multiple profiles set up in a single `user_d window_position: dict | None = {"width": 0, "height": 0} ``` -Window position from top-left. +Window position from top-left corner. + +#### `save_recording_path` + +```python +save_recording_path: str | None = None +``` + +Directory path for saving video recordings. + +#### `trace_path` + +```python +trace_path: str | None = None +``` + +Directory path for saving Agent trace files. Files are automatically named as `{trace_path}/{context_id}.zip`. --- @@ -550,7 +576,7 @@ These control how the browser waits for CDP API calls to complete and pages to l default_timeout: float | None = None ``` -Default timeout for Playwright operations in milliseconds. +Default timeout for Playwright operations in milliseconds (e.g. `10000` if you want 10s). #### `default_navigation_timeout` @@ -558,7 +584,7 @@ Default timeout for Playwright operations in milliseconds. default_navigation_timeout: float | None = None ``` -Default timeout for page navigation in milliseconds. +Default timeout for page navigation in milliseconds (e.g. `30000` if you want 30s). ### Playwright Viewport Options @@ -571,7 +597,7 @@ Configure browser window size, viewport, and display properties: user_agent: str | None = None ``` -Specific user agent to use in this context. +Specific user agent to use in this context. See [`playwright.devices`](https://playwright.dev/python/docs/emulation). #### `is_mobile` @@ -603,7 +629,7 @@ Geolocation coordinates. Example: `{"latitude": 59.95, "longitude": 30.31667}` locale: str | None = None ``` -Specify user locale, for example en-GB, de-DE, etc. Locale will affect the navigator.language value, Accept-Language request header value as well as number and date formatting rules. +Specify user locale, for example `en-GB`, `de-DE`, etc. Locale will affect the `navigator.language` value, `Accept-Language` request header value as well as number and date formatting rules. #### `timezone_id` @@ -611,7 +637,7 @@ Specify user locale, for example en-GB, de-DE, etc. Locale will affect the navig timezone_id: str | None = None ``` -Timezone identifier (e.g., 'America/New_York'). +Timezone identifier (e.g. `'America/New_York'` or `'UTC'`). #### `window_size` @@ -646,7 +672,7 @@ A viewport is *always* used in headless mode regardless of this setting, and is device_scale_factor: float | None = None ``` -Device scale factor (DPI). Useful for high-resolution screenshots (set it to 2). +Device scale factor (DPI). Useful for high-resolution screenshots (set it to 2 or 3). #### `screen` @@ -743,7 +769,11 @@ Whether to ignore HTTPS errors when sending network requests. bypass_csp: bool = False ``` -Toggles bypassing Content-Security-Policy. + +Enabling this can increase security risk and makes the bot very easy to fingerprint. (Cloudflare, Datadome, etc. will block you) + + +Toggles bypassing Content-Security-Policy. Enabling reduces some CSP-related errors that can arise from automation scripts injected into pages with strict policies that forbid inline scripts. #### `java_script_enabled` @@ -751,6 +781,10 @@ Toggles bypassing Content-Security-Policy. java_script_enabled: bool = True ``` + +Not recommended, untested with Browser Use and likely breaks things. 
+ + Whether or not to enable JavaScript in the context. #### `service_workers` diff --git a/eval/judge_system.py b/eval/judge_system.py new file mode 100644 index 000000000..8a3157b5b --- /dev/null +++ b/eval/judge_system.py @@ -0,0 +1,564 @@ +""" +@file purpose: Comprehensive judge system for evaluating browser-use agent runs with detailed structured feedback. + +This system provides multi-dimensional evaluation of agent performance including: +- Task analysis and categorization +- Trajectory quality assessment +- Tool usage effectiveness +- Agent reasoning quality +- Browser handling capabilities +- Structured error categorization +- Actionable improvement suggestions + +The judge uses vision-language models to analyze agent execution history, screenshots, +and final results to provide detailed structured JSON feedback for developers. +""" + +import asyncio +import base64 +import io +import json +import logging +from dataclasses import asdict, dataclass +from datetime import datetime +from enum import Enum +from pathlib import Path +from typing import Any + +from langchain_core.language_models.chat_models import BaseChatModel +from PIL import Image + +logger = logging.getLogger(__name__) + + +class ErrorCategory(Enum): + # Access & Authentication + BLOCKED_ACCESS = 'blocked_access' + CAPTCHA_CHALLENGE = 'captcha_challenge' + LOGIN_REQUIRED = 'login_required' + RATE_LIMITED = 'rate_limited' + + # Tool & Action Failures + TOOL_MISUSE = 'tool_misuse' + INVALID_PARAMETERS = 'invalid_parameters' + ACTION_SEQUENCE_ERROR = 'action_sequence_error' + + # Agent Behavior Issues + INFINITE_LOOP = 'infinite_loop' + STUCK_PATTERN = 'stuck_pattern' + POOR_PLANNING = 'poor_planning' + CONTEXT_LOSS = 'context_loss' + + # Browser & Technical + ELEMENT_NOT_FOUND = 'element_not_found' + CLICK_FAILURE = 'click_failure' + LOAD_TIMEOUT = 'load_timeout' + JAVASCRIPT_ERROR = 'javascript_error' + + # Content & Understanding + MISUNDERSTOOD_TASK = 'misunderstood_task' + FORMAT_ERROR = 'format_error' + CONTENT_PARSING_ERROR = 'content_parsing_error' + + # Enhanced Detection Categories + NAVIGATION_CONFUSION = 'navigation_confusion' + FORM_FILLING_ERROR = 'form_filling_error' + MODAL_HANDLING = 'modal_handling' + IFRAME_ISSUES = 'iframe_issues' + BROWSER_CRASHES = 'browser_crashes' + IMPOSSIBLE_TASK = 'impossible_task' + MISSING_INFORMATION = 'missing_information' + + +class TaskCategory(Enum): + EXTRACTION = 'extraction' + INTERACTION = 'interaction' + LOGIN = 'login' + RESEARCH = 'research' + SHOPPING = 'shopping' + BOOKING = 'booking' + COMPARISON = 'comparison' + QA_TESTING = 'qa_testing' + FORM_FILLING = 'form_filling' + NAVIGATION = 'navigation' + SEARCH = 'search' + FILTERING = 'filtering' + CONTENT_CREATION = 'content_creation' + FILE_OPERATIONS = 'file_operations' + MULTI_STEP_WORKFLOW = 'multi_step_workflow' + + +@dataclass +class ScoreBreakdown: + trajectory_quality: int # How human-like is the solution path (1-100) + tool_calling_effectiveness: int # How well do tools work (1-100) + agent_reasoning: int # Quality of agent's decision making (1-100) + browser_handling: int # Browser stability and error handling (1-100) + task_satisfaction: int # Final user satisfaction (1-100) + + +@dataclass +class JudgeResult: + # Basic Information + task_summary: str # 1 sentence summary + task_clarity_score: int # How clear vs uncertain the task is (1-100) + task_categories: list[TaskCategory] # Primary task categories + + # Analysis + reasoning: str # What went well/not well analysis + error_categories: 
list[ErrorCategory] # Core error categories identified + + # Scores + scores: ScoreBreakdown + final_score: int # Overall score (1-100) + passed: bool # Whether it meets 70% threshold + + # Developer Feedback + improvement_tips: list[str] # Concrete improvement suggestions + critical_issues: list[str] # Must-fix issues + + # Metadata + evaluation_timestamp: str + + +def encode_image(image_path: str) -> str: + """Convert image file to base64 string.""" + try: + with Image.open(image_path) as image: + if image.mode == 'RGBA': + image = image.convert('RGB') + buffered = io.BytesIO() + image.save(buffered, format='JPEG') + return base64.b64encode(buffered.getvalue()).decode('utf-8') + except Exception as e: + logger.error(f'Failed to encode image {image_path}: {e}') + return '' + + +def truncate_text(text: str, max_length: int) -> str: + """Truncate text to maximum length with ellipsis.""" + if len(text) <= max_length: + return text + return text[: max_length - 3] + '...' + + +def prepare_agent_steps(complete_history: list[dict]) -> list[str]: + """Extract and format agent steps, limiting each to 2000 characters.""" + steps = [] + for i, step in enumerate(complete_history): + step_text = f'Step {i + 1}:\n' + + # Add model output if available + if step.get('model_output'): + model_output = step['model_output'] + if isinstance(model_output, dict): + # Format the model output nicely + if 'action' in model_output: + step_text += f'Actions: {json.dumps(model_output["action"], indent=1)}\n' + if 'current_state' in model_output: + step_text += f'State: {model_output["current_state"]}\n' + + # Add results if available + if step.get('result'): + for j, result in enumerate(step['result']): + if isinstance(result, dict): + if result.get('extracted_content'): + step_text += f'Result {j + 1}: {result["extracted_content"]}\n' + if result.get('error'): + step_text += f'Error {j + 1}: {result["error"]}\n' + + # Add URL info + if step.get('state', {}).get('url'): + step_text += f'URL: {step["state"]["url"]}\n' + + # Truncate to 2000 characters + steps.append(truncate_text(step_text, 2000)) + + return steps + + +async def comprehensive_judge( + task: str, + complete_history: list[dict], + final_result: str, + screenshot_paths: list[str], + model: BaseChatModel, + max_images: int = 10, +) -> JudgeResult: + """ + Comprehensive judge that evaluates browser-use agent runs with detailed structured feedback. + """ + + # Prepare inputs with length limits + task_truncated = truncate_text(task, 40000) + final_result_truncated = truncate_text(final_result or 'No final result', 40000) + agent_steps = prepare_agent_steps(complete_history) + + # Select last N images + selected_images = screenshot_paths[-max_images:] if screenshot_paths else [] + + # Encode images + encoded_images = [] + for img_path in selected_images: + if Path(img_path).exists(): + encoded_img = encode_image(img_path) + if encoded_img: + encoded_images.append( + { + 'type': 'image_url', + 'image_url': { + 'url': f'data:image/jpeg;base64,{encoded_img}', + 'detail': 'high', + }, + } + ) + + # Construct the evaluation prompt + system_prompt = """You are an expert judge evaluating browser automation agent performance. + +Your task is to comprehensively analyze the agent's execution and provide structured feedback. + +**EVALUATION CRITERIA:** + +1. **Task Analysis**: Understand what the user wanted to accomplish +2. **Trajectory Quality**: How human-like and efficient was the solution path? +3. 
**Tool Usage**: How effectively were browser automation tools used? +4. **Agent Reasoning**: Quality of decision-making and problem-solving +5. **Browser Handling**: How well were browser issues handled? +6. **Final Outcome**: Did the task satisfy the user's intent? + +**ERROR CATEGORIES TO CONSIDER:** +- Access & Authentication: blocked_access, captcha_challenge, login_required, rate_limited +- Tool & Action Failures: tool_misuse, invalid_parameters, action_sequence_error +- Agent Behavior: infinite_loop, stuck_pattern, poor_planning, context_loss +- Browser & Technical: element_not_found, click_failure, load_timeout, javascript_error +- Content & Understanding: misunderstood_task, format_error, content_parsing_error +- Enhanced: navigation_confusion, form_filling_error, modal_handling, iframe_issues, browser_crashes, impossible_task, missing_information + + +**TASK CATEGORIES TO CONSIDER:** +extraction, interaction, login, research, shopping, booking, comparison, qa_testing, form_filling, navigation, search, filtering, content_creation, file_operations, multi_step_workflow +- You can use multiple categories for the same task. +- You can also add other categories if they fit better. + +**TASK CLARITY SCORE:** +- Is the task very clear and step-by-step like a recipe (high score), or vague and uncertain (low score)? + +**IMPROVEMENT TIPS:** +- Think about how to get this task done better. Create actionable tips, but they should be understandable to a developer who does not know the task. +- These tips will be averaged across many tasks, and the most common / problematic ones will be used to improve the browser-use agent. +- In browser-use we convert websites to text so that the agent can understand them. In that text we mark interactive elements with [index]; the agent can then choose to interact with them, and we click the actual CSS selector. Sometimes this conversion is not perfect. +- After the agent takes an action it gets the new state and its previous thinking, and outputs the next action, which we then execute again. +- So we can improve the agent system prompt, input context, tool calls to interact with the browser, or our extraction layer to convert the website to text. +- Always mention the error this would fix first, and then the improvement tip.
+ +**SCORING SCALE:** +- 90-100: Excellent execution, human-like, minimal issues +- 80-89: Good execution with minor issues +- 70-79: Acceptable execution, some problems but functional +- 60-69: Poor execution with significant issues +- 1-59: Failed execution, major problems + +**PASS THRESHOLD: 70%** + +Respond with EXACTLY this JSON structure (no additional text): + +{ + "task_summary": "One sentence summary of what the task was trying to accomplish", + "task_categories": ["category1", "category2"], + "task_clarity_score": 85, + "reasoning": "Detailed analysis of what went well and what didn't, trajectory quality, planning assessment", + "error_categories": ["error1", "error2"], + "scores": { + "trajectory_quality": 75, + "tool_calling_effectiveness": 80, + "agent_reasoning": 85, + "browser_handling": 65, + "task_satisfaction": 70 + }, + "final_score": 75, + "critical_issues": [ + "Critical issue that must be fixed 1", + "Critical issue that must be fixed 2" + ], + "improvement_tips": [ + "Specific actionable improvement 1", + "Specific actionable improvement 2" + ] +}""" + + user_prompt = f"""**TASK:** {task_truncated} + +**AGENT EXECUTION STEPS:** +{chr(10).join(agent_steps)} + +**FINAL RESULT:** +{final_result_truncated} + +**TOTAL STEPS:** {len(complete_history)} +**SCREENSHOTS PROVIDED:** {len(selected_images)} + +Analyze this execution and respond with the exact JSON structure requested.""" + + # Build messages + content_parts = [{'type': 'text', 'text': user_prompt}] + content_parts.extend(encoded_images) + + messages = [ + {'role': 'system', 'content': system_prompt}, + {'role': 'user', 'content': content_parts}, + ] + + # Get structured response + try: + response = await asyncio.to_thread(model.invoke, messages) + + # Parse the JSON response + # Handle both string and list content types + if isinstance(response.content, list): + response_text = str(response.content[0]) if response.content else '' + else: + response_text = str(response.content) + response_text = response_text.strip() + + # Try to extract JSON if wrapped in markdown + if '```json' in response_text: + json_start = response_text.find('```json') + 7 + json_end = response_text.find('```', json_start) + if json_end != -1: + response_text = response_text[json_start:json_end].strip() + elif '```' in response_text: + json_start = response_text.find('```') + 3 + json_end = response_text.find('```', json_start) + if json_end != -1: + response_text = response_text[json_start:json_end].strip() + + # Parse JSON + try: + result_dict = json.loads(response_text) + except json.JSONDecodeError as e: + logger.error(f'Failed to parse JSON response: {e}') + logger.error(f'Response text: {response_text}') + # Create fallback result + return create_fallback_result(task, 'Failed to parse judge response') + + # Convert to structured result + return parse_judge_response(result_dict, task) + + except Exception as e: + logger.error(f'Judge evaluation failed: {e}') + return create_fallback_result(task, str(e)) + + +def parse_judge_response(result_dict: dict, task: str) -> JudgeResult: + """Parse the LLM response into a structured JudgeResult.""" + try: + # Parse task categories + task_categories = [] + if 'task_categories' in result_dict: + for cat in result_dict['task_categories']: + try: + task_categories.append(TaskCategory(cat)) + except ValueError: + logger.warning(f'Unknown task category: {cat}') + + # Parse error categories + error_categories = [] + if 'error_categories' in result_dict: + for err in result_dict['error_categories']: + 
try: + error_categories.append(ErrorCategory(err)) + except ValueError: + logger.warning(f'Unknown error category: {err}') + + # Parse scores + scores_dict = result_dict.get('scores', {}) + scores = ScoreBreakdown( + trajectory_quality=scores_dict.get('trajectory_quality', 50), + tool_calling_effectiveness=scores_dict.get('tool_calling_effectiveness', 50), + agent_reasoning=scores_dict.get('agent_reasoning', 50), + browser_handling=scores_dict.get('browser_handling', 50), + task_satisfaction=scores_dict.get('task_satisfaction', 50), + ) + + final_score = result_dict.get('final_score', 50) + + return JudgeResult( + task_summary=result_dict.get('task_summary', 'Task analysis unavailable'), + task_clarity_score=result_dict.get('task_clarity_score', 50), + task_categories=task_categories, + reasoning=result_dict.get('reasoning', 'Analysis unavailable'), + error_categories=error_categories, + scores=scores, + final_score=final_score, + passed=final_score >= 70, + improvement_tips=result_dict.get('improvement_tips', []), + critical_issues=result_dict.get('critical_issues', []), + evaluation_timestamp=datetime.now().isoformat(), + ) + + except Exception as e: + logger.error(f'Failed to parse judge response: {e}') + return create_fallback_result(task, 'Failed to parse structured response') + + +def create_fallback_result(task: str, error_msg: str) -> JudgeResult: + """Create a fallback result when evaluation fails.""" + return JudgeResult( + task_summary=f'Failed to analyze task: {task[:100]}...', + task_clarity_score=0, + task_categories=[TaskCategory.QA_TESTING], + reasoning=f'Evaluation failed: {error_msg}', + error_categories=[ErrorCategory.IMPOSSIBLE_TASK], + scores=ScoreBreakdown( + trajectory_quality=0, + tool_calling_effectiveness=0, + agent_reasoning=0, + browser_handling=0, + task_satisfaction=0, + ), + final_score=0, + passed=False, + improvement_tips=['Fix evaluation system'], + critical_issues=[f'Evaluation system failure: {error_msg}'], + evaluation_timestamp=datetime.now().isoformat(), + ) + + +async def judge_with_retry( + task: str, + complete_history: list[dict], + final_result: str, + screenshot_paths: list[str], + model: BaseChatModel, + max_retries: int = 3, + max_images: int = 10, +) -> JudgeResult: + """ + Judge with retry logic for robustness. + """ + for attempt in range(max_retries): + try: + return await comprehensive_judge( + task, + complete_history, + final_result, + screenshot_paths, + model, + max_images, + ) + except Exception as e: + if attempt == max_retries - 1: + logger.error(f'Judge failed after {max_retries} attempts: {e}') + return create_fallback_result(task, str(e)) + logger.warning(f'Judge attempt {attempt + 1} failed, retrying: {e}') + await asyncio.sleep(2**attempt) + + # Fallback return (should never reach here given the logic above, but ensures type safety) + return create_fallback_result(task, 'Max retries exceeded without proper error handling') + + +def get_example_json_structure() -> dict: + """Get an example of the expected JSON response structure for the LLM judge.""" + return { + 'task_summary': 'Extract product prices from an e-commerce website', + 'task_clarity_score': 85, + 'task_categories': ['extraction', 'research'], + 'reasoning': 'The agent successfully navigated to the target website and extracted most product information. However, it had difficulty with dynamic loading elements and missed some prices that loaded asynchronously. 
The overall approach was logical but could benefit from better wait strategies.', + 'error_categories': ['element_not_found', 'load_timeout'], + 'scores': { + 'trajectory_quality': 75, + 'tool_calling_effectiveness': 80, + 'agent_reasoning': 85, + 'browser_handling': 65, + 'task_satisfaction': 70, + }, + 'final_score': 75, + 'critical_issues': [ + 'Missing wait for dynamic content to load', + 'No fallback strategy when primary selectors fail', + ], + 'improvement_tips': [ + 'Browser not loaded: Implement better wait strategies for dynamic content', + 'Element not found: Add retry logic for element detection', + 'No error message: Improve error handling for the tool click element', + ], + } + + +def _read_result_file(result_file: Path) -> dict[str, Any]: + """Helper function to read result file synchronously.""" + with open(result_file) as f: + return json.load(f) + + +def _write_result_file(result_file: Path, result_data: dict[str, Any]) -> None: + """Helper function to write result file synchronously.""" + with open(result_file, 'w') as f: + f.write(json.dumps(result_data, indent=2, default=str)) + + +# Integration helper function +async def evaluate_task_with_comprehensive_judge(task_folder: Path, model: BaseChatModel, max_images: int = 10) -> dict[str, Any]: + """ + Evaluate a task result using the comprehensive judge system. + + Returns a dictionary with both the old format for compatibility + and the new comprehensive analysis. + """ + result_file = task_folder / 'result.json' + if not result_file.exists(): + return { + 'task_id': task_folder.name, + 'comprehensive_judge': None, + 'error': 'No result.json found', + } + + try: + # Load existing result using async wrapper + result_data = await asyncio.to_thread(_read_result_file, result_file) + + # Check if comprehensive judge result already exists + if result_data.get('comprehensive_judge_evaluation'): + return { + 'task_id': task_folder.name, + 'comprehensive_judge': result_data['comprehensive_judge_evaluation'], + 'error': None, + } + + # Extract data for evaluation + task = result_data.get('task', 'Unknown task') + complete_history = result_data.get('complete_history', []) + final_result = result_data.get('final_result_response', '') + screenshot_paths = result_data.get('screenshot_paths', []) + + # Run comprehensive evaluation + judge_result = await judge_with_retry( + task=task, + complete_history=complete_history, + final_result=final_result, + screenshot_paths=screenshot_paths, + model=model, + max_images=max_images, + ) + + # Convert to dict for storage + judge_dict = asdict(judge_result) + + # Save back to result file using async wrapper + result_data['comprehensive_judge_evaluation'] = judge_dict + await asyncio.to_thread(_write_result_file, result_file, result_data) + + return { + 'task_id': task_folder.name, + 'comprehensive_judge': judge_dict, + 'error': None, + } + + except Exception as e: + logger.error(f'Comprehensive judge evaluation failed for {task_folder.name}: {e}') + return { + 'task_id': task_folder.name, + 'comprehensive_judge': None, + 'error': str(e), + } diff --git a/eval/service.py b/eval/service.py index 20b9b2e50..05f155ddd 100644 --- a/eval/service.py +++ b/eval/service.py @@ -1,3 +1,4 @@ +# pyright: reportMissingImports=false # ============================================================================================================== # Documentation for this evaluation file. 
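For orientation, here is a minimal usage sketch for the new judge entry point added above. It is not part of this PR: the `saved_trajectories/example_task` path and the `gpt-4o` model choice are illustrative assumptions, and any vision-capable LangChain chat model should work. `evaluate_task_with_comprehensive_judge` reads the folder's `result.json`, runs `judge_with_retry`, and caches the verdict back into the file under `comprehensive_judge_evaluation`.

```python
# Hypothetical usage sketch (not part of this PR): run the comprehensive judge
# on one saved task folder and print the structured verdict.
import asyncio
from pathlib import Path

from langchain_openai import ChatOpenAI

from judge_system import evaluate_task_with_comprehensive_judge


async def main() -> None:
    # Folder is assumed to contain result.json with task, complete_history,
    # final_result_response and screenshot_paths, as written by eval/service.py.
    model = ChatOpenAI(model='gpt-4o')  # any vision-capable BaseChatModel
    outcome = await evaluate_task_with_comprehensive_judge(
        task_folder=Path('saved_trajectories/example_task'),  # illustrative path
        model=model,
        max_images=10,
    )
    if outcome['error']:
        print(f'Judge failed: {outcome["error"]}')
    else:
        judge = outcome['comprehensive_judge']
        print(f'Score: {judge["final_score"]}, passed: {judge["passed"]}')


if __name__ == '__main__':
    asyncio.run(main())
```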
@@ -40,23 +41,224 @@ # ============================================================================================================== import asyncio import base64 +import gc import io import logging import re -import shutil +import signal +import sys +import threading +import time +from uuid import UUID import anyio +import psutil from lmnr import AsyncLaminarClient, Laminar, observe from PIL import Image MAX_IMAGE = 5 -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s: %(message)s') logger = logging.getLogger(__name__) Laminar.initialize() laminar_client = AsyncLaminarClient() +# Global variables for resource monitoring +_resource_monitor_task = None +_resource_monitor_stop_event = None +_graceful_shutdown_initiated = False + + +def get_system_resources(): + """Get current system resource usage""" + try: + # Memory usage + memory = psutil.virtual_memory() + memory_percent = memory.percent + memory_available_gb = memory.available / (1024**3) + + # CPU usage + cpu_percent = psutil.cpu_percent(interval=1) + + # Load average (Unix only) + try: + load_avg = psutil.getloadavg() + load_1min = load_avg[0] + except (AttributeError, OSError): + load_1min = 0.0 + + # Process count + process_count = len(psutil.pids()) + + # Chrome/Browser processes + chrome_processes = [] + python_processes = [] + for proc in psutil.process_iter(['pid', 'name', 'memory_percent', 'cpu_percent']): + try: + name = proc.info['name'].lower() + if 'chrome' in name or 'chromium' in name: + chrome_processes.append(proc.info) + elif 'python' in name: + python_processes.append(proc.info) + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + + return { + 'memory_percent': memory_percent, + 'memory_available_gb': memory_available_gb, + 'cpu_percent': cpu_percent, + 'load_1min': load_1min, + 'process_count': process_count, + 'chrome_process_count': len(chrome_processes), + 'python_process_count': len(python_processes), + 'chrome_processes': chrome_processes[:5], # Top 5 chrome processes + 'python_processes': python_processes[:5], # Top 5 python processes + } + except Exception as e: + logger.warning(f'Failed to get system resources: {type(e).__name__}: {e}') + return { + 'memory_percent': 0, + 'memory_available_gb': 0, + 'cpu_percent': 0, + 'load_1min': 0, + 'process_count': 0, + 'chrome_process_count': 0, + 'python_process_count': 0, + 'chrome_processes': [], + 'python_processes': [], + } + + +def log_system_resources(context: str = ''): + """Log current system resource usage""" + resources = get_system_resources() + logger.info(f'=== SYSTEM RESOURCES {context} ===') + logger.info(f'Memory: {resources["memory_percent"]:.1f}% used, {resources["memory_available_gb"]:.2f}GB available') + logger.info(f'CPU: {resources["cpu_percent"]:.1f}%, Load: {resources["load_1min"]:.2f}') + logger.info( + f'Processes: {resources["process_count"]} total, {resources["chrome_process_count"]} Chrome, {resources["python_process_count"]} Python' + ) + + if resources['chrome_processes']: + logger.info('Top Chrome processes:') + for proc in resources['chrome_processes']: + logger.info( + f' PID {proc["pid"]}: {proc["name"]} - CPU: {proc["cpu_percent"]:.1f}%, Memory: {proc["memory_percent"]:.1f}%' + ) + + logger.info('=' * (20 + len(context))) + + +async def start_resource_monitoring(interval: int = 30): + """Start background resource monitoring""" + global _resource_monitor_task, _resource_monitor_stop_event + 
+ if _resource_monitor_task is not None: + logger.warning('Resource monitoring is already running') + return + + _resource_monitor_stop_event = asyncio.Event() + + async def monitor_loop(): + """Background monitoring loop""" + logger.info(f'Starting resource monitoring (interval: {interval}s)') + try: + while _resource_monitor_stop_event is not None and not _resource_monitor_stop_event.is_set(): + try: + log_system_resources('MONITOR') + + # Check for concerning resource levels + resources = get_system_resources() + if resources['memory_percent'] > 85: + logger.warning(f'⚠️ HIGH MEMORY USAGE: {resources["memory_percent"]:.1f}%') + if resources['cpu_percent'] > 90: + logger.warning(f'⚠️ HIGH CPU USAGE: {resources["cpu_percent"]:.1f}%') + if resources['chrome_process_count'] > 20: + logger.warning(f'⚠️ HIGH CHROME PROCESS COUNT: {resources["chrome_process_count"]}') + + # Force garbage collection periodically + if resources['memory_percent'] > 70: + logger.info('Running garbage collection due to high memory usage') + gc.collect() + + except Exception as e: + logger.error(f'Error in resource monitoring: {type(e).__name__}: {e}') + + try: + if _resource_monitor_stop_event is not None: + await asyncio.wait_for(_resource_monitor_stop_event.wait(), timeout=interval) + else: + await asyncio.sleep(interval) + break # Event was set, exit loop + except TimeoutError: + continue # Timeout reached, continue monitoring + except Exception as e: + logger.error(f'Resource monitoring loop crashed: {type(e).__name__}: {e}') + finally: + logger.info('Resource monitoring stopped') + + _resource_monitor_task = asyncio.create_task(monitor_loop()) + + +async def stop_resource_monitoring(): + """Stop background resource monitoring""" + global _resource_monitor_task, _resource_monitor_stop_event + + if _resource_monitor_stop_event is not None: + _resource_monitor_stop_event.set() + + if _resource_monitor_task is not None: + try: + await asyncio.wait_for(_resource_monitor_task, timeout=5.0) + except TimeoutError: + logger.warning('Resource monitoring task did not stop gracefully') + _resource_monitor_task.cancel() + try: + await _resource_monitor_task + except asyncio.CancelledError: + pass + + _resource_monitor_task = None + _resource_monitor_stop_event = None + + +def setup_signal_handlers(): + """Setup signal handlers for graceful shutdown""" + global _graceful_shutdown_initiated + + def signal_handler(signum, frame): + global _graceful_shutdown_initiated + if _graceful_shutdown_initiated: + logger.critical('🔥 FORCE EXIT: Second signal received, terminating immediately') + sys.exit(1) + + _graceful_shutdown_initiated = True + logger.warning(f'⚠️ GRACEFUL SHUTDOWN: Received signal {signum}, initiating graceful shutdown...') + log_system_resources('SHUTDOWN') + + # Try to stop resource monitoring + try: + loop = asyncio.get_event_loop() + if loop.is_running(): + loop.create_task(stop_resource_monitoring()) + except Exception as e: + logger.error(f'Failed to stop resource monitoring during shutdown: {e}') + + # Give some time for cleanup, then force exit + def force_exit(): + time.sleep(10) + if _graceful_shutdown_initiated: + logger.critical('🔥 FORCE EXIT: Graceful shutdown timeout, terminating') + sys.exit(1) + + threading.Thread(target=force_exit, daemon=True).start() + + # Register signal handlers + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + def encode_image(image): """Convert a PIL image to base64 string.""" @@ -279,19 +481,183 @@ import http.client import json 
import os import subprocess -import time -from datetime import datetime +from dataclasses import dataclass, field + +# Define Stage enum and related classes for the pipeline +from enum import Enum from pathlib import Path +from typing import Any import requests from dotenv import load_dotenv + +# Import the new comprehensive judge system (conditional import for backwards compatibility) +try: + from judge_system import evaluate_task_with_comprehensive_judge + + COMPREHENSIVE_JUDGE_AVAILABLE = True +except ImportError: + logger.warning('Comprehensive judge system not available. Only Mind2Web judge will be available.') + COMPREHENSIVE_JUDGE_AVAILABLE = False + + async def evaluate_task_with_comprehensive_judge(*args, **kwargs) -> dict[str, Any]: + """Fallback function when comprehensive judge system is not available""" + raise ImportError('Comprehensive judge system not available') + + +class Stage(Enum): + SETUP_BROWSER = 'setup_browser' + RUN_AGENT = 'run_agent' + FORMAT_HISTORY = 'format_history' + EVALUATE = 'evaluate' + SAVE_SERVER = 'save_server' + + +@dataclass +class StageError: + stage: Stage + error_type: str + message: str + + +@dataclass +class TaskResult: + task_id: str + run_id: str + confirmed_task: str + task: Any + max_steps: int + laminar_link: str | None = None + completed_stages: set[Stage] = field(default_factory=set) + stage_data: dict[Stage, Any] = field(default_factory=dict) + errors: list = field(default_factory=list) + cancelled: bool = False + critical_error: str | None = None + server_save_failed: bool = False + + def stage_completed(self, stage: Stage, data: Any = None): + self.completed_stages.add(stage) + if data is not None: + self.stage_data[stage] = data + + def stage_failed(self, stage: Stage, error: StageError): + self.errors.append(error) + + def mark_cancelled(self): + self.cancelled = True + + def mark_critical_error(self, error: str): + self.critical_error = error + + def mark_server_save_failed(self, error: str): + self.server_save_failed = True + self.errors.append(StageError(Stage.SAVE_SERVER, 'server_save', error)) + + def has_execution_data(self) -> bool: + return Stage.RUN_AGENT in self.completed_stages or Stage.FORMAT_HISTORY in self.completed_stages + + @property + def server_payload(self) -> dict[str, Any]: + """Generate payload for server submission""" + payload = { + 'taskId': self.task_id, + 'runId': self.run_id, + 'task': self.confirmed_task, + 'completed_stages': [stage.value for stage in self.completed_stages], + 'has_errors': len(self.errors) > 0, + 'cancelled': self.cancelled, + 'critical_error': self.critical_error, + 'server_save_failed': self.server_save_failed, + 'laminarTaskLink': self.laminar_link, + } + + # Add task execution data if available + if Stage.FORMAT_HISTORY in self.completed_stages: + format_data = self.stage_data.get(Stage.FORMAT_HISTORY, {}) + payload.update( + { + 'actionHistory': format_data.get('action_history', []), + 'finalResultResponse': format_data.get('final_result_response', ''), + 'selfReportCompleted': format_data.get('self_report_completed', False), + 'selfReportSuccess': format_data.get('self_report_success', False), + 'taskDuration': format_data.get('task_duration'), + 'steps': format_data.get('steps'), + 'maxSteps': self.max_steps, + 'tokensUsed': format_data.get('tokensUsed'), + 'completeHistory': format_data.get('complete_history', []), # Add complete step history + } + ) + + # Add evaluation data if available + if Stage.EVALUATE in self.completed_stages: + eval_data = 
self.stage_data.get(Stage.EVALUATE, {}) + + # Handle comprehensive judge evaluation + comp_eval = eval_data.get('comprehensive_evaluation') or eval_data.get('comprehensive_judge') + if comp_eval: + # Convert enum lists to string lists for database storage + task_categories = comp_eval.get('task_categories', []) + if task_categories and hasattr(task_categories[0], 'value'): + task_categories = [cat.value for cat in task_categories] + + error_categories = comp_eval.get('error_categories', []) + if error_categories and hasattr(error_categories[0], 'value'): + error_categories = [err.value for err in error_categories] + + payload.update( + { + 'comprehensiveJudgeEvaluationSummary': comp_eval.get('task_summary'), + 'comprehensiveJudgeEvaluationReasoning': comp_eval.get('reasoning'), + 'comprehensiveJudgeEvaluationPassed': comp_eval.get('passed'), + 'comprehensiveJudgeEvaluationScore': comp_eval.get('final_score'), + 'comprehensiveJudgeEvaluationCategories': task_categories, + 'comprehensiveJudgeEvaluationErrors': error_categories, + 'comprehensiveJudgeEvaluationTips': comp_eval.get('improvement_tips', []), + 'comprehensiveJudgeEvaluationCriticalIssues': comp_eval.get('critical_issues', []), + 'comprehensiveJudgeEvaluationScores': comp_eval.get('scores'), + 'comprehensiveJudgeEvaluationFull': comp_eval, # Include full comprehensive eval data + } + ) + + # Handle legacy Mind2Web evaluation (for compatibility) + payload.update( + { + 'onlineMind2WebEvaluationJudgement': eval_data.get('judgement'), + 'onlineMind2WebEvaluationError': eval_data.get('error'), + 'onlineMind2WebEvaluationSuccess': eval_data.get('success', False), + 'onlineMind2WebEvaluationScore': eval_data.get('score', 0.0), + } + ) + + # Ensure all data in payload is JSON serializable + serialized_payload = make_json_serializable(payload) + # Type assertion since we know payload is a dict and make_json_serializable preserves dict structure + assert isinstance(serialized_payload, dict), 'Payload serialization should preserve dict structure' + return serialized_payload + + def get_local_status(self) -> dict[str, Any]: + """Get local status summary""" + success = ( + Stage.EVALUATE in self.completed_stages + and not self.cancelled + and self.critical_error is None + and len([e for e in self.errors if e.error_type == 'exception']) == 0 + ) + return { + 'task_id': self.task_id, + 'success': success, + 'error': self.critical_error or (self.errors[0].message if self.errors else None), + 'completed_stages': [stage.value for stage in self.completed_stages], + } + + from langchain_anthropic import ChatAnthropic from langchain_core.language_models.chat_models import BaseChatModel from langchain_google_genai import ChatGoogleGenerativeAI from langchain_openai import ChatOpenAI from pydantic.types import SecretStr -from browser_use import ActionResult, Agent, BrowserSession, Controller +from browser_use import ActionResult, Agent, BrowserProfile, BrowserSession, Controller from browser_use.agent.memory import MemoryConfig from browser_use.agent.views import AgentHistoryList @@ -537,6 +903,27 @@ def clean_action_dict(action_dict: dict) -> dict: return {k: clean_action_dict(v) if isinstance(v, dict) else v for k, v in action_dict.items() if v is not None} +def make_json_serializable(obj: Any) -> Any: + """ + Convert objects to JSON-serializable types. + Handles common non-serializable types like enums, custom objects, etc. 
+ """ + if obj is None: + return None + elif isinstance(obj, (str, int, float, bool)): + return obj + elif isinstance(obj, dict): + return {k: make_json_serializable(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + return [make_json_serializable(item) for item in obj] + elif hasattr(obj, 'value'): # Handle enums + return obj.value + elif hasattr(obj, '__dict__'): # Handle custom objects + return str(obj) + else: + return str(obj) + + async def reformat_agent_history( agent_history: AgentHistoryList, task_id: str, @@ -704,12 +1091,16 @@ class Task: return self.__str__() -async def judge_task_result(model, task_folder: Path, score_threshold: float = 3) -> dict: +async def judge_task_result(model, task_folder: Path, score_threshold: float = 3, use_mind2web: bool = False) -> dict: """ - Judge a single task result based on the success value of the final action. + Judge a single task result using the comprehensive judge system by default, + with optional fallback to the original Online_Mind2Web evaluation. Args: + model: The model to use for evaluation task_folder: Path to the task result folder + score_threshold: Score threshold for image filtering (used only for Mind2Web) + use_mind2web: If True, use the original Online_Mind2Web evaluation instead Returns: Dictionary containing judgment results @@ -722,51 +1113,132 @@ async def judge_task_result(model, task_folder: Path, score_threshold: float = 3 async with await anyio.open_file(result_file) as f: result = json.loads(await f.read()) - # If a Online_Mind2Web_evaluation is already saved, we can skip the eval - if result.get('Online_Mind2Web_evaluation'): - return result.get('Online_Mind2Web_evaluation') + # Check if we should use the original Mind2Web evaluation + if use_mind2web: + logger.info(f'Task {task_folder.name}: Using original Online_Mind2Web evaluation') - # Get the screenshot paths, task description, and action history - screenshot_paths = result.get('screenshot_paths', []) - task_description = result.get('task') - action_history = result.get('action_history', []) + # If a Online_Mind2Web_evaluation is already saved, we can skip the eval + if result.get('Online_Mind2Web_evaluation'): + return result.get('Online_Mind2Web_evaluation') - # Use the retry wrapper for evaluation - try: - # Await the async function directly instead of using asyncio.run() - eval_result = await Online_Mind2Web_eval_with_retry( - task_description, action_history, screenshot_paths, model, score_threshold - ) + # Get the screenshot paths, task description, and action history + screenshot_paths = result.get('screenshot_paths', []) + task_description = result.get('task') + action_history = result.get('action_history', []) - if eval_result is None: - raise Exception('Evaluation failed after all retries') + # Use the retry wrapper for evaluation + try: + # Await the async function directly instead of using asyncio.run() + eval_result = await Online_Mind2Web_eval_with_retry( + task_description, action_history, screenshot_paths, model, score_threshold + ) - messages, text, system_msg, record, key_points = eval_result + if eval_result is None: + raise Exception('Evaluation failed after all retries') - # Final steps to get judgement - run invoke in a thread - judgement_msg = await asyncio.to_thread(model.invoke, messages) - judgement = judgement_msg.content + messages, text, system_msg, record, key_points = eval_result - if 'success' in judgement.lower().split('status:')[1]: # This is the official criteria for success - evaluation = {'task_id': 
task_folder.name, 'judgement': judgement, 'success': True, 'error': None, 'score': 1.0} - else: # This is the official criteria for failure - evaluation = {'task_id': task_folder.name, 'judgement': judgement, 'success': False, 'error': None, 'score': 0.0} + # Final steps to get judgement - run invoke in a thread + judgement_msg = await asyncio.to_thread(model.invoke, messages) + judgement = judgement_msg.content - # Save the Online_Mind2Web_evaluation into the result.json file - result['Online_Mind2Web_evaluation'] = evaluation - async with await anyio.open_file(result_file, 'w') as f: - await f.write(json.dumps(result, indent=2)) + if 'success' in judgement.lower().split('status:')[1]: # This is the official criteria for success + evaluation = { + 'task_id': task_folder.name, + 'judgement': judgement, + 'success': True, + 'error': None, + 'score': 1.0, + } + else: # This is the official criteria for failure + evaluation = { + 'task_id': task_folder.name, + 'judgement': judgement, + 'success': False, + 'error': None, + 'score': 0.0, + } - return evaluation + # Save the Online_Mind2Web_evaluation into the result.json file + result['Online_Mind2Web_evaluation'] = evaluation + async with await anyio.open_file(result_file, 'w') as f: + await f.write(json.dumps(result, indent=2)) - except Exception as err: - return { - 'task_id': task_folder.name, - 'judgement': None, - 'success': False, - 'error': f'{type(err).__name__}: {err}', - 'score': 0.0, - } + return evaluation + + except Exception as err: + return { + 'task_id': task_folder.name, + 'judgement': None, + 'success': False, + 'error': f'{type(err).__name__}: {err}', + 'score': 0.0, + } + + else: + # Use the new comprehensive judge system (default) + logger.info(f'Task {task_folder.name}: Using comprehensive judge evaluation') + + # Check if comprehensive judge is available + if not COMPREHENSIVE_JUDGE_AVAILABLE: + logger.warning(f'Task {task_folder.name}: Comprehensive judge not available, falling back to Mind2Web') + return await judge_task_result(model, task_folder, score_threshold, use_mind2web=True) + + # Check if comprehensive judge result already exists + if result.get('comprehensive_judge_evaluation'): + existing_eval = result['comprehensive_judge_evaluation'] + return { + 'task_id': task_folder.name, + 'judgement': existing_eval.get('reasoning', 'Comprehensive evaluation completed'), + 'success': existing_eval.get('passed', False), + 'error': None, + 'score': existing_eval.get('final_score', 0) / 100.0, # Convert to 0-1 scale + 'comprehensive_evaluation': existing_eval, + } + + try: + # Run comprehensive judge evaluation + comprehensive_result = await evaluate_task_with_comprehensive_judge( + task_folder=task_folder, model=model, max_images=10 + ) + + if comprehensive_result.get('error'): + return { + 'task_id': task_folder.name, + 'judgement': None, + 'success': False, + 'error': comprehensive_result['error'], + 'score': 0.0, + } + + comp_eval = comprehensive_result.get('comprehensive_judge') + if comp_eval: + return { + 'task_id': task_folder.name, + 'judgement': comp_eval.get('reasoning', 'Comprehensive evaluation completed'), + 'success': comp_eval.get('passed', False), + 'error': None, + 'score': comp_eval.get('final_score', 0) / 100.0, # Convert to 0-1 scale + 'comprehensive_evaluation': comp_eval, + } + else: + return { + 'task_id': task_folder.name, + 'judgement': None, + 'success': False, + 'error': 'Comprehensive judge failed to return results', + 'score': 0.0, + } + + except Exception as err: + 
logger.error(f'Comprehensive judge evaluation failed for {task_folder.name}: {err}') + return { + 'task_id': task_folder.name, + 'judgement': None, + 'success': False, + 'error': f'Comprehensive judge error: {type(err).__name__}: {err}', + 'score': 0.0, + } except Exception as err: return { @@ -778,275 +1250,6 @@ async def judge_task_result(model, task_folder: Path, score_threshold: float = 3 } -def calculate_local_summary(results_dir: str | None = None) -> dict: - """ - Calculates a summary of task results by reading the saved result.json files. - Does not make any network requests. - """ - if results_dir is None: - results_dir = 'saved_trajectories' - - path = Path(results_dir) - if not path.is_dir(): - logger.warning(f'Results directory {results_dir} does not exist') - return { - 'timestamp': datetime.now().isoformat(), - 'total_tasks': 0, - 'successful_tasks': 0, - 'failed_tasks': 0, - 'success_rate': 0, - 'average_score': 0, - } - - # Collect all task folders - task_folders = [f for f in path.iterdir() if f.is_dir()] - total_tasks = len(task_folders) - successful_tasks = 0 - total_score = 0.0 - results_with_score = 0 - - for folder in task_folders: - result_file = folder / 'result.json' - if result_file.exists(): - try: - with open(result_file) as f: - result_data = json.load(f) - - # Look for evaluation data - evaluation = result_data.get('Online_Mind2Web_evaluation', {}) - if evaluation: - if evaluation.get('success', False): - successful_tasks += 1 - - score = evaluation.get('score', 0.0) - if score > 0: - total_score += score - results_with_score += 1 - except Exception as e: - logger.error(f'Error reading result file {result_file}: {type(e).__name__}: {e}') - - # Calculate statistics - failed_tasks = total_tasks - successful_tasks - success_rate = successful_tasks / total_tasks if total_tasks > 0 else 0 - average_score = total_score / results_with_score if results_with_score > 0 else 0 - - return { - 'timestamp': datetime.now().isoformat(), - 'total_tasks': total_tasks, - 'successful_tasks': successful_tasks, - 'failed_tasks': failed_tasks, - 'success_rate': success_rate, - 'average_score': average_score, - } - - -from dataclasses import dataclass -from enum import Enum -from typing import Any - - -class Stage(Enum): - LOAD_EXISTING = 'load_existing' - SETUP_BROWSER = 'setup_browser' - RUN_AGENT = 'run_agent' - FORMAT_HISTORY = 'format_history' - EVALUATE = 'evaluate' - SAVE_SERVER = 'save_server' - - -@dataclass -class StageError(Exception): - stage: Stage - error_type: str # "timeout", "cancelled", "exception" - message: str - - -class TaskResult: - """Simplified task state tracker with auto-updating server payload""" - - def __init__( - self, - task_id: str, - run_id: str, - task_description: str, - task: Task, - max_steps: int, - laminar_task_link: str | None = None, - ): - self.task_id = task_id - self.completed_stages = set() - self.stage_data = {} # Store actual results from each stage - self.failed_stages = {} # Store errors from failed stages - self.local_error = None - - # Initialize server payload with defaults - self.server_payload = { - 'runId': run_id, - 'taskId': task_id, - 'task': task_description, - 'taskWebsite': task.website, - 'taskReferenceLength': task.reference_length, - 'taskLevel': task.level, - 'taskClusterId': task.cluster_id, - 'actionHistory': [], - 'finalResultResponse': 'None', - 'selfReportCompleted': False, - 'selfReportSuccess': None, - 'browserCrash': False, - 'browserCrashReason': None, - 'onlineMind2WebEvaluationJudgement': 'Not Attempted', - 
'onlineMind2WebEvaluationError': None, - 'onlineMind2WebEvaluationSuccess': False, - 'onlineMind2WebEvaluationScore': 0.0, - 'completeHistory': [], - 'maxSteps': max_steps, - 'tokensUsed': 0, - 'taskDuration': None, - 'steps': 0, - 'laminarTaskLink': laminar_task_link, # Add field for task-specific Laminar link - } - - def stage_completed(self, stage: Stage, data: Any = None): - """Mark stage as completed and update server payload""" - self.completed_stages.add(stage) - if data is not None: - self.stage_data[stage] = data - self._auto_update_payload() - - def stage_failed(self, stage: Stage, error: StageError): - """Mark stage as failed and update server payload""" - self.failed_stages[stage] = error - self._auto_update_payload() - - def has_execution_data(self) -> bool: - """Check if we have execution data from either loading existing or completing execution""" - return Stage.LOAD_EXISTING in self.completed_stages or Stage.FORMAT_HISTORY in self.completed_stages - - def execution_succeeded(self) -> bool: - """Check if execution pipeline succeeded""" - return ( - Stage.LOAD_EXISTING in self.completed_stages or Stage.FORMAT_HISTORY in self.completed_stages - ) and not self._has_execution_failures() - - def _has_execution_failures(self) -> bool: - """Check if any execution-related stages failed""" - execution_stages = {Stage.SETUP_BROWSER, Stage.RUN_AGENT, Stage.FORMAT_HISTORY} - return any(stage in self.failed_stages for stage in execution_stages) - - def _auto_update_payload(self): - """Automatically update server_payload based on current state""" - # Update execution data if available - if Stage.LOAD_EXISTING in self.completed_stages: - existing_data = self.stage_data[Stage.LOAD_EXISTING] - self.server_payload.update( - { - 'actionHistory': existing_data.get('action_history', []), - 'finalResultResponse': existing_data.get('final_result_response', 'None'), - 'selfReportCompleted': existing_data.get('self_report_completed', False), - 'selfReportSuccess': existing_data.get('self_report_success', None), - 'completeHistory': existing_data.get('complete_history', []), - 'taskDuration': existing_data.get('task_duration'), - 'steps': existing_data.get('steps', 0), - 'tokensUsed': existing_data.get('tokensUsed', 0), - } - ) - elif Stage.FORMAT_HISTORY in self.completed_stages: - formatted_data = self.stage_data[Stage.FORMAT_HISTORY] - self.server_payload.update( - { - 'actionHistory': formatted_data.get('action_history', []), - 'finalResultResponse': formatted_data.get('final_result_response', 'None'), - 'selfReportCompleted': formatted_data.get('self_report_completed', False), - 'selfReportSuccess': formatted_data.get('self_report_success', None), - 'completeHistory': formatted_data.get('complete_history', []), - 'taskDuration': formatted_data.get('task_duration'), - 'steps': formatted_data.get('steps', 0), - 'tokensUsed': formatted_data.get('tokensUsed', 0), - } - ) - - # Update evaluation data if available - if Stage.EVALUATE in self.completed_stages: - eval_data = self.stage_data[Stage.EVALUATE] - judgement = eval_data.get('judgement') - self.server_payload.update( - { - 'onlineMind2WebEvaluationJudgement': judgement if judgement is not None else 'None', - 'onlineMind2WebEvaluationError': eval_data.get('error'), - 'onlineMind2WebEvaluationSuccess': eval_data.get('success', False), - 'onlineMind2WebEvaluationScore': eval_data.get('score', 0.0), - } - ) - - # Update failure states - self._update_failure_states() - - def _update_failure_states(self): - """Update server payload based on failed 
stages""" - # Check for browser/execution failures - for stage, error in self.failed_stages.items(): - if stage in {Stage.SETUP_BROWSER, Stage.RUN_AGENT}: - self.server_payload['browserCrash'] = True - if error.error_type == 'timeout': - self.server_payload['browserCrashReason'] = f'{stage.value} timed out: {error.message}' - elif error.error_type == 'cancelled': - self.server_payload['browserCrashReason'] = f'{stage.value} was cancelled: {error.message}' - else: - self.server_payload['browserCrashReason'] = f'{stage.value} failed: {error.message}' - - # Update evaluation failures - elif stage == Stage.EVALUATE: - if error.error_type == 'timeout': - self.server_payload['onlineMind2WebEvaluationJudgement'] = 'Evaluation Timed Out' - self.server_payload['onlineMind2WebEvaluationError'] = 'Evaluation process timed out' - elif error.error_type == 'cancelled': - self.server_payload['onlineMind2WebEvaluationJudgement'] = 'Evaluation Cancelled' - self.server_payload['onlineMind2WebEvaluationError'] = 'Evaluation was cancelled' - else: - self.server_payload['onlineMind2WebEvaluationJudgement'] = 'Evaluation Process Error' - self.server_payload['onlineMind2WebEvaluationError'] = f'Evaluation Error: {error.message}' - - def mark_cancelled(self): - """Mark task as cancelled""" - self.server_payload.update( - { - 'finalResultResponse': 'Task was cancelled', - 'onlineMind2WebEvaluationJudgement': 'Task Cancelled', - 'onlineMind2WebEvaluationError': 'Task was cancelled', - 'onlineMind2WebEvaluationSuccess': False, - 'onlineMind2WebEvaluationScore': 0.0, - } - ) - self.local_error = 'Task cancelled' - - def mark_critical_error(self, error_msg: str): - """Mark task as having critical error""" - self.server_payload.update( - { - 'finalResultResponse': f'Critical Error: {error_msg}', - 'onlineMind2WebEvaluationJudgement': 'Critical System Error', - 'onlineMind2WebEvaluationError': f'Critical flow error: {error_msg}', - 'onlineMind2WebEvaluationSuccess': False, - 'onlineMind2WebEvaluationScore': 0.0, - } - ) - self.local_error = f'Critical flow error: {error_msg}' - - def mark_server_save_failed(self, error_msg: str): - """Mark server save as failed""" - if self.local_error: - self.local_error += f'; Server save failed: {error_msg}' - else: - self.local_error = f'Server save failed: {error_msg}' - - def get_local_status(self) -> dict: - """Return local processing status""" - success = self.execution_succeeded() and ( - Stage.EVALUATE in self.completed_stages or not self.has_execution_data() or Stage.EVALUATE in self.failed_stages - ) - return {'task_id': self.task_id, 'success': success and not self.local_error, 'error': self.local_error} - - async def run_stage(stage: Stage, stage_func, timeout: int | None = None): """Generic stage runner with timeout""" if timeout: @@ -1054,35 +1257,17 @@ async def run_stage(stage: Stage, stage_func, timeout: int | None = None): return await stage_func() -async def load_existing_result(task_folder: Path) -> dict: - """Load existing result if available""" - result_file = task_folder / 'result.json' - if not result_file.exists(): - raise FileNotFoundError('No existing result found') - - async with await anyio.open_file(result_file) as f: - existing_result = json.loads(await f.read()) - - # Check if evaluation is also present - existing_eval = existing_result.get('Online_Mind2Web_evaluation') - if existing_eval: - existing_result['has_evaluation'] = True - existing_result['evaluation_data'] = existing_eval - else: - existing_result['has_evaluation'] = False - - return 
existing_result - - -async def setup_browser_session(task: Task, headless: bool) -> BrowserSession: +async def setup_browser_session(task: Task, headless: bool, highlight_elements: bool = True) -> BrowserSession: """Setup browser session for the task""" logger.debug(f'Browser setup: Initializing BrowserSession for task {task.task_id}') # Use incognito mode (user_data_dir=None) for evaluations to avoid state pollution - browser_session = BrowserSession( + profile = BrowserProfile( user_data_dir=None, # Incognito mode - no persistent state headless=headless, chromium_sandbox=False, # running in docker + highlight_elements=highlight_elements, # Control element highlighting (passed to profile) + keep_alive=True, # higher timeouts = higher success rates on long tail of slow sites or if on a slow CI server # timeout=60_000, # default_timeout=60_000, @@ -1093,6 +1278,8 @@ async def setup_browser_session(task: Task, headless: bool) -> BrowserSession: # ignore_https_errors=True, # some eval tasks have http:// or broken https sites in them ) + browser_session = BrowserSession(browser_profile=profile) + # Start browser session logger.debug(f'Browser setup: Starting browser session for task {task.task_id}') await browser_session.start() @@ -1107,7 +1294,7 @@ async def setup_browser_session(task: Task, headless: bool) -> BrowserSession: return browser_session -@observe(name='executor', span_type='EXECUTOR') +@observe(name='executor', span_type='EXECUTOR') # type: ignore[arg-type] async def run_agent_with_browser( browser_session: BrowserSession, task: Task, @@ -1150,10 +1337,10 @@ async def run_agent_with_browser( return agent.state.history -@observe(name='evaluate_task_result', span_type='EVALUATOR') -async def evaluate_task_result(eval_model: BaseChatModel, task_folder: Path) -> dict: +@observe(name='evaluate_task_result', span_type='EVALUATOR') # type: ignore[arg-type] +async def evaluate_task_result(eval_model: BaseChatModel, task_folder: Path, use_mind2web: bool = False) -> dict: """Evaluate the task result""" - return await judge_task_result(eval_model, task_folder, score_threshold=3) + return await judge_task_result(eval_model, task_folder, score_threshold=3, use_mind2web=use_mind2web) def save_result_to_server(convex_url: str, secret_key: str, payload: dict) -> bool: @@ -1165,7 +1352,7 @@ async def cleanup_browser_safe(browser_session: BrowserSession): """Safe browser cleanup with timeout""" try: logger.debug('Browser cleanup: Starting close operation for session') - await asyncio.wait_for(browser_session.close(), timeout=30) + await asyncio.wait_for(browser_session.kill(), timeout=30) logger.debug('Browser cleanup: Close operation completed successfully') except TimeoutError: logger.warning('Browser cleanup: Timed out after 30 seconds') @@ -1185,18 +1372,16 @@ def determine_current_stage(completed_stages: set) -> Stage: return Stage.RUN_AGENT elif Stage.SETUP_BROWSER in completed_stages: return Stage.SETUP_BROWSER - elif Stage.LOAD_EXISTING in completed_stages: - return Stage.LOAD_EXISTING else: - return Stage.LOAD_EXISTING # Default starting stage + return Stage.SETUP_BROWSER # Default starting stage -@observe(name='evaluation', span_type='EVALUATION') +@observe(name='evaluation', span_type='EVALUATION') # type: ignore[arg-type] async def run_task_with_semaphore( task: Task, run_id: str, - lmnr_run_id: str, - laminar_eval_link: str, + lmnr_run_id: str | None, + laminar_eval_link: str | None, convex_url: str, secret_key: str, eval_model: BaseChatModel, @@ -1205,7 +1390,6 @@ async def 
run_task_with_semaphore( headless: bool, use_vision: bool, semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument - fresh_start: bool = True, use_serp: bool = False, enable_memory: bool = False, memory_interval: int = 10, @@ -1214,11 +1398,24 @@ async def run_task_with_semaphore( planner_llm: BaseChatModel | None = None, planner_interval: int = 1, include_result: bool = False, + highlight_elements: bool = True, + use_mind2web_judge: bool = False, ) -> dict: """Clean pipeline approach for running tasks""" - logger.info(f'Task {task.task_id}: Waiting to acquire semaphore (current value: ~{semaphore_runs._value})') + task_start_time = time.time() + logger.info(f'🚀 Task {task.task_id}: Starting execution pipeline') + logger.info(f'📊 Task {task.task_id}: Waiting to acquire semaphore (current available: ~{semaphore_runs._value})') + log_system_resources(f'TASK_START_{task.task_id}') + + semaphore_acquired_time = None async with semaphore_runs: - logger.info(f'Task {task.task_id}: Semaphore acquired (remaining slots: ~{semaphore_runs._value})') + semaphore_acquired_time = time.time() + wait_time = semaphore_acquired_time - task_start_time + logger.info( + f'✅ Task {task.task_id}: Semaphore acquired after {wait_time:.2f}s (remaining slots: ~{semaphore_runs._value})' + ) + log_system_resources(f'SEMAPHORE_ACQUIRED_{task.task_id}') + task_result = None browser_session = None laminar_task_link = None @@ -1228,7 +1425,7 @@ async def run_task_with_semaphore( if lmnr_run_id: try: datapoint_id = await laminar_client.evals.create_datapoint( - eval_id=lmnr_run_id, + eval_id=UUID(lmnr_run_id), data={ 'task_id': task.task_id, 'confirmed_task': task.confirmed_task, @@ -1269,100 +1466,85 @@ async def run_task_with_semaphore( logger.info(f'Task {task.task_id}: Starting execution pipeline.') try: - # Stage 1: Try to load existing result + agent_history = None # Initialize to track agent execution + + # Stage 1: Setup browser try: - existing_data = await run_stage(Stage.LOAD_EXISTING, lambda: load_existing_result(task_folder)) - task_result.stage_completed(Stage.LOAD_EXISTING, existing_data) + logger.info(f'Task {task.task_id}: Browser setup starting.') + browser_session = await run_stage( + Stage.SETUP_BROWSER, lambda: setup_browser_session(task, headless, highlight_elements), timeout=120 + ) + task_result.stage_completed(Stage.SETUP_BROWSER) + logger.info(f'Task {task.task_id}: Browser session started successfully.') + except Exception as e: + error = StageError(Stage.SETUP_BROWSER, 'exception', str(e)) + task_result.stage_failed(Stage.SETUP_BROWSER, error) + logger.error(f'Task {task.task_id}: Browser setup failed: {str(e)}') + # Continue to server save instead of early return - # If evaluation is also present, mark it as completed - if existing_data.get('has_evaluation'): - task_result.stage_completed(Stage.EVALUATE, existing_data['evaluation_data']) - - logger.info(f'Task {task.task_id}: Successfully loaded existing result. Skipping execution.') - - except Exception: - # No existing result, need to execute full pipeline - logger.info(f'Task {task.task_id}: No existing result found. 
Starting execution pipeline.') - - agent_history = None # Initialize to track agent execution - - # Stage 2: Setup browser + # Stage 2: Run agent + if browser_session: # Only run agent if browser setup succeeded try: - logger.info(f'Task {task.task_id}: Browser setup starting.') - browser_session = await run_stage( - Stage.SETUP_BROWSER, lambda: setup_browser_session(task, headless), timeout=120 + logger.info(f'Task {task.task_id}: Agent run starting.') + + agent_history = await run_stage( + Stage.RUN_AGENT, + lambda: run_agent_with_browser( + browser_session, + task, + llm, + max_steps_per_task, + use_vision, + use_serp, + enable_memory, + memory_interval, + max_actions_per_step, + validate_output, + planner_llm, + planner_interval, + ), + timeout=1000, ) - task_result.stage_completed(Stage.SETUP_BROWSER) - logger.info(f'Task {task.task_id}: Browser session started successfully.') + + task_result.stage_completed(Stage.RUN_AGENT) + logger.info(f'Task {task.task_id}: Agent run completed.') except Exception as e: - error = StageError(Stage.SETUP_BROWSER, 'exception', str(e)) - task_result.stage_failed(Stage.SETUP_BROWSER, error) - logger.error(f'Task {task.task_id}: Browser setup failed: {str(e)}') + error = StageError(Stage.RUN_AGENT, 'exception', str(e)) + task_result.stage_failed(Stage.RUN_AGENT, error) + logger.error(f'Task {task.task_id}: Agent run failed: {str(e)}') # Continue to server save instead of early return - # Stage 3: Run agent - if browser_session: # Only run agent if browser setup succeeded - try: - logger.info(f'Task {task.task_id}: Agent run starting.') + # Stage 3: Format history + if agent_history is not None: # Only format if agent ran successfully + try: + logger.info(f'Task {task.task_id}: History formatting starting.') + formatted_data = await run_stage( + Stage.FORMAT_HISTORY, + lambda: reformat_agent_history( + agent_history, task.task_id, run_id, task.confirmed_task, include_result=include_result + ), + ) + task_result.stage_completed(Stage.FORMAT_HISTORY, formatted_data) + logger.info(f'Task {task.task_id}: Agent history formatted.') + except Exception as e: + error = StageError(Stage.FORMAT_HISTORY, 'exception', str(e)) + task_result.stage_failed(Stage.FORMAT_HISTORY, error) + logger.error(f'Task {task.task_id}: History formatting failed: {str(e)}') + # Continue to server save instead of early return - agent_history = await run_stage( - Stage.RUN_AGENT, - lambda: run_agent_with_browser( - browser_session, - task, - llm, - max_steps_per_task, - use_vision, - use_serp, - enable_memory, - memory_interval, - max_actions_per_step, - validate_output, - planner_llm, - planner_interval, - ), - timeout=600, - ) - - task_result.stage_completed(Stage.RUN_AGENT) - logger.info(f'Task {task.task_id}: Agent run completed.') - except Exception as e: - error = StageError(Stage.RUN_AGENT, 'exception', str(e)) - task_result.stage_failed(Stage.RUN_AGENT, error) - logger.error(f'Task {task.task_id}: Agent run failed: {str(e)}') - # Continue to server save instead of early return - - # Stage 4: Format history - if agent_history is not None: # Only format if agent ran successfully - try: - logger.info(f'Task {task.task_id}: History formatting starting.') - formatted_data = await run_stage( - Stage.FORMAT_HISTORY, - lambda: reformat_agent_history( - agent_history, task.task_id, run_id, task.confirmed_task, include_result=include_result - ), - ) - task_result.stage_completed(Stage.FORMAT_HISTORY, formatted_data) - logger.info(f'Task {task.task_id}: Agent history formatted.') - except 
Exception as e: - error = StageError(Stage.FORMAT_HISTORY, 'exception', str(e)) - task_result.stage_failed(Stage.FORMAT_HISTORY, error) - logger.error(f'Task {task.task_id}: History formatting failed: {str(e)}') - # Continue to server save instead of early return - - # Stage 5: Evaluate (if we have execution data and no existing evaluation) + # Stage 4: Evaluate (if we have execution data and no existing evaluation) if task_result.has_execution_data() and Stage.EVALUATE not in task_result.completed_stages: try: logger.info(f'Task {task.task_id}: Evaluation starting.') evaluation = await run_stage( - Stage.EVALUATE, lambda: evaluate_task_result(eval_model, task_folder), timeout=300 + Stage.EVALUATE, lambda: evaluate_task_result(eval_model, task_folder, use_mind2web_judge), timeout=300 ) task_result.stage_completed(Stage.EVALUATE, evaluation) logger.info(f'Task {task.task_id}: Evaluation completed.') if lmnr_run_id and datapoint_id: await laminar_client.evals.update_datapoint( - eval_id=lmnr_run_id, + eval_id=UUID(lmnr_run_id), datapoint_id=datapoint_id, scores={ 'accuracy': evaluation['score'], @@ -1373,12 +1555,14 @@ async def run_task_with_semaphore( task_result.stage_failed(Stage.EVALUATE, error) logger.error(f'Task {task.task_id}: Evaluation failed: {str(e)}') - # Stage 6: Save to server (always attempt) + # Stage 5: Save to server (always attempt) try: logger.info(f'Task {task.task_id}: Saving result to server.') await run_stage( Stage.SAVE_SERVER, - lambda: asyncio.to_thread(save_result_to_server, convex_url, secret_key, task_result.server_payload), + lambda: asyncio.to_thread( + save_result_to_server, convex_url, secret_key, task_result.server_payload if task_result else {} + ), timeout=60, ) task_result.stage_completed(Stage.SAVE_SERVER) @@ -1400,7 +1584,9 @@ async def run_task_with_semaphore( logger.info(f'Task {task.task_id}: Attempting server save after timeout.') await run_stage( Stage.SAVE_SERVER, - lambda: asyncio.to_thread(save_result_to_server, convex_url, secret_key, task_result.server_payload), + lambda: asyncio.to_thread( + save_result_to_server, convex_url, secret_key, task_result.server_payload if task_result else {} + ), timeout=30, # Shorter timeout for emergency save ) task_result.stage_completed(Stage.SAVE_SERVER) @@ -1417,7 +1603,9 @@ async def run_task_with_semaphore( logger.info(f'Task {task.task_id}: Attempting server save after cancellation.') await run_stage( Stage.SAVE_SERVER, - lambda: asyncio.to_thread(save_result_to_server, convex_url, secret_key, task_result.server_payload), + lambda: asyncio.to_thread( + save_result_to_server, convex_url, secret_key, task_result.server_payload if task_result else {} + ), timeout=30, # Shorter timeout for emergency save ) task_result.stage_completed(Stage.SAVE_SERVER) @@ -1434,7 +1622,9 @@ async def run_task_with_semaphore( logger.info(f'Task {task.task_id}: Attempting server save after critical error.') await run_stage( Stage.SAVE_SERVER, - lambda: asyncio.to_thread(save_result_to_server, convex_url, secret_key, task_result.server_payload), + lambda: asyncio.to_thread( + save_result_to_server, convex_url, secret_key, task_result.server_payload if task_result else {} + ), timeout=30, # Shorter timeout for emergency save ) task_result.stage_completed(Stage.SAVE_SERVER) @@ -1464,7 +1654,9 @@ async def run_task_with_semaphore( # Try emergency server save try: logger.info(f'Task {task.task_id}: Attempting emergency server save after initialization error.') - await asyncio.to_thread(save_result_to_server, convex_url, 
secret_key, task_result.server_payload) + await asyncio.to_thread( + save_result_to_server, convex_url, secret_key, task_result.server_payload if task_result else {} + ) except Exception as save_e: logger.error(f'Task {task.task_id}: Emergency server save after initialization error failed: {str(save_e)}') @@ -1477,20 +1669,34 @@ async def run_task_with_semaphore( else: logger.info(f'Task {task.task_id}: No browser to cleanup') - logger.info(f'Task {task.task_id}: About to release semaphore (remaining slots: ~{semaphore_runs._value})') - return ( + task_end_time = time.time() + total_task_time = task_end_time - task_start_time + semaphore_hold_time = task_end_time - (semaphore_acquired_time or task_start_time) + + logger.info( + f'🏁 Task {task.task_id}: Completed in {total_task_time:.2f}s (semaphore held for {semaphore_hold_time:.2f}s)' + ) + logger.info(f'📊 Task {task.task_id}: About to release semaphore (remaining slots will be: ~{semaphore_runs._value + 1})') + log_system_resources(f'TASK_END_{task.task_id}') + + final_result = ( task_result.get_local_status() if task_result else {'task_id': task.task_id, 'success': False, 'error': 'Task result not available'} ) + logger.info( + f'🎯 Task {task.task_id}: Final status - Success: {final_result.get("success", False)}, Error: {final_result.get("error", "None")}' + ) + return final_result + async def run_multiple_tasks( tasks: list[Task], llm: BaseChatModel, run_id: str, - lmnr_run_id: str, - laminar_eval_link: str, + lmnr_run_id: str | None, + laminar_eval_link: str | None, convex_url: str, secret_key: str, eval_model: BaseChatModel, @@ -1500,7 +1706,6 @@ async def run_multiple_tasks( end_index: int | None = None, headless: bool = False, use_vision: bool = True, - fresh_start: bool = True, use_serp: bool = False, enable_memory: bool = False, memory_interval: int = 10, @@ -1509,46 +1714,117 @@ async def run_multiple_tasks( planner_llm: BaseChatModel | None = None, planner_interval: int = 1, include_result: bool = False, + highlight_elements: bool = True, + use_mind2web_judge: bool = False, ) -> dict: """ Run multiple tasks in parallel and evaluate results. 
""" - logger.info(f'Creating semaphore with max_parallel_runs={max_parallel_runs}') + batch_start_time = time.time() + logger.info(f'🚀 BATCH START: Creating semaphore with max_parallel_runs={max_parallel_runs}') + log_system_resources('BATCH_START') + semaphore_runs = asyncio.Semaphore(max_parallel_runs) tasks_to_run = tasks[start_index:end_index] if end_index else tasks[start_index:] - logger.info(f'Starting {len(tasks_to_run)} tasks with parallel limit of {max_parallel_runs}') + logger.info(f'📊 Starting {len(tasks_to_run)} tasks with parallel limit of {max_parallel_runs}') + logger.info(f'📋 Task range: {start_index} to {end_index or len(tasks)} (total tasks available: {len(tasks)})') - # Run all tasks in parallel with additional parameters - task_results = await asyncio.gather( - *( - run_task_with_semaphore( - task=task, - run_id=run_id, - lmnr_run_id=lmnr_run_id, - laminar_eval_link=laminar_eval_link, - convex_url=convex_url, - secret_key=secret_key, - eval_model=eval_model, - llm=llm, # Pass the agent LLM - max_steps_per_task=max_steps_per_task, - headless=headless, - use_vision=use_vision, - semaphore_runs=semaphore_runs, # Pass the semaphore - fresh_start=fresh_start, - use_serp=use_serp, - enable_memory=enable_memory, - memory_interval=memory_interval, - max_actions_per_step=max_actions_per_step, - validate_output=validate_output, - planner_llm=planner_llm, - planner_interval=planner_interval, - include_result=include_result, - ) - for task in tasks_to_run - ), - return_exceptions=True, # Prevent task cancellation cascade - ) + # Start resource monitoring + await start_resource_monitoring(interval=30) + + # Setup signal handlers for graceful shutdown + setup_signal_handlers() + + # Create a heartbeat task for long-running operations + heartbeat_task = None + heartbeat_stop_event = asyncio.Event() + + async def heartbeat_logger(): + """Log periodic heartbeat to show the process is alive""" + heartbeat_count = 0 + while not heartbeat_stop_event.is_set(): + try: + await asyncio.wait_for(heartbeat_stop_event.wait(), timeout=60.0) # 1-minute heartbeat + break # Event was set, exit + except TimeoutError: + heartbeat_count += 1 + elapsed = time.time() - batch_start_time + logger.info(f'💓 HEARTBEAT {heartbeat_count}: Batch still running after {elapsed:.1f}s') + log_system_resources('HEARTBEAT') + + # Check for potential issues + resources = get_system_resources() + if resources['memory_percent'] > 90: + logger.critical(f'🚨 CRITICAL: Memory usage at {resources["memory_percent"]:.1f}% - potential OOM risk!') + if resources['chrome_process_count'] > 50: + logger.warning(f'⚠️ HIGH BROWSER PROCESS COUNT: {resources["chrome_process_count"]} Chrome processes') + + try: + # Start heartbeat logging + heartbeat_task = asyncio.create_task(heartbeat_logger()) + logger.info('💓 Heartbeat monitoring started') + + # Run all tasks in parallel with additional parameters + logger.info(f'🚀 Launching {len(tasks_to_run)} parallel task executions...') + + task_results = await asyncio.gather( + *( + run_task_with_semaphore( + task=task, + run_id=run_id, + lmnr_run_id=lmnr_run_id, + laminar_eval_link=laminar_eval_link, + convex_url=convex_url, + secret_key=secret_key, + eval_model=eval_model, + llm=llm, # Pass the agent LLM + max_steps_per_task=max_steps_per_task, + headless=headless, + use_vision=use_vision, + semaphore_runs=semaphore_runs, # Pass the semaphore + use_serp=use_serp, + enable_memory=enable_memory, + memory_interval=memory_interval, + max_actions_per_step=max_actions_per_step, + 
validate_output=validate_output, + planner_llm=planner_llm, + planner_interval=planner_interval, + include_result=include_result, + highlight_elements=highlight_elements, + use_mind2web_judge=use_mind2web_judge, + ) + for task in tasks_to_run + ), + return_exceptions=True, # Prevent task cancellation cascade + ) + + logger.info(f'✅ All {len(tasks_to_run)} parallel task executions completed') + + except Exception as e: + logger.critical(f'🚨 CRITICAL ERROR in batch execution: {type(e).__name__}: {e}', exc_info=True) + log_system_resources('BATCH_ERROR') + # Create error results for all tasks + task_results = [ + {'task_id': task.task_id, 'success': False, 'error': f'Batch execution failed: {str(e)}'} for task in tasks_to_run + ] + + finally: + # Cleanup: Stop heartbeat and resource monitoring + batch_end_time = time.time() + total_batch_time = batch_end_time - batch_start_time + logger.info(f'🏁 BATCH END: Total execution time {total_batch_time:.2f}s') + + if heartbeat_task and not heartbeat_task.done(): + heartbeat_stop_event.set() + try: + await asyncio.wait_for(heartbeat_task, timeout=5.0) + except TimeoutError: + logger.warning('Heartbeat task did not stop gracefully') + heartbeat_task.cancel() + + await stop_resource_monitoring() + log_system_resources('BATCH_CLEANUP') # Process task results and handle any exceptions returned by gather processed_results = [] @@ -1557,28 +1833,23 @@ async def run_multiple_tasks( for i, result in enumerate(task_results): if isinstance(result, Exception): - logger.error(f'Task {i} failed with exception: {type(result).__name__}: {result}') - processed_results.append({'task_id': f'task_{i}', 'success': False, 'error': str(result)}) + logger.error(f'❌ Task {i} failed with exception: {type(result).__name__}: {result}') + task_id = tasks_to_run[i].task_id if i < len(tasks_to_run) else f'unknown_task_{i}' + processed_results.append({'task_id': task_id, 'success': False, 'error': str(result)}) failed_tasks += 1 else: processed_results.append(result) - if result.get('success', False): + if isinstance(result, dict) and result.get('success', False): successful_tasks += 1 else: failed_tasks += 1 - logger.info(f'All {len(tasks_to_run)} tasks completed. Success: {successful_tasks}, Failed: {failed_tasks}') + logger.info(f'📊 FINAL RESULTS: {len(tasks_to_run)} tasks completed. Success: {successful_tasks}, Failed: {failed_tasks}') + logger.info(f'📈 Success rate: {successful_tasks / len(tasks_to_run) * 100:.1f}%') - # After all tasks are complete, calculate a local summary - logger.info('All tasks completed. 
Calculating result summary...') - summary = calculate_local_summary() + logger.info('📋 All tasks completed.') - # Log the summary statistics - logger.info(f'Completed {summary["total_tasks"]} tasks') - logger.info(f'Success rate: {summary["success_rate"]:.2%}') - logger.info(f'Average score: {summary["average_score"]:.2f}') - - return {'task_results': processed_results, 'summary': summary} + return {'task_results': processed_results} # Helper function to fetch tasks from the server @@ -1662,7 +1933,7 @@ def get_git_info(): # Helper function to start a new run on the server -def start_new_run(convex_url: str, secret_key: str, run_details: dict, existing_run_id: str = None): +def start_new_run(convex_url: str, secret_key: str, run_details: dict, existing_run_id: str | None = None): """Sends a request to start a new evaluation run and returns the run ID.""" if not convex_url or not secret_key: logger.error('Error: Convex URL or Secret Key not provided for starting run.') @@ -1779,7 +2050,6 @@ async def run_evaluation_pipeline( end_index: int | None = None, headless: bool = False, use_vision: bool = True, - fresh_start: bool = True, use_serp: bool = False, enable_memory: bool = False, memory_interval: int = 10, @@ -1788,30 +2058,31 @@ async def run_evaluation_pipeline( planner_llm: BaseChatModel | None = None, planner_interval: int = 1, include_result: bool = False, + laminar_eval_id: str | None = None, + highlight_elements: bool = True, + use_mind2web_judge: bool = False, ) -> dict: """ Complete evaluation pipeline that handles Laminar setup and task execution in the same event loop """ - # --- Create Laminar Evaluation --- - logger.info('Creating Laminar evaluation...') + # --- Use provided Laminar Evaluation ID or skip tracking --- lmnr_run_id = None laminar_eval_link = None - try: - lmnr_run_id = await laminar_client.evals.create_evaluation( - group_name=test_case, # Dataset name - name=user_message if user_message else f'{test_case} Evaluation', # Eval name (dev message) - ) + + if laminar_eval_id: + # Use existing evaluation ID provided from frontend + lmnr_run_id = laminar_eval_id project_id = 'f07da4a9-b7de-488a-91e3-e17c5f6d676a' laminar_eval_link = f'https://www.lmnr.ai/project/{project_id}/evaluations/{lmnr_run_id}' - logger.info(f'📊 Laminar evaluation created: {laminar_eval_link}') - - except Exception as e: - logger.error(f'Failed to create Laminar evaluation: {type(e).__name__}: {e}') - logger.warning('⚠️ Continuing without Laminar evaluation tracking...') + logger.info(f'📊 Using provided Laminar evaluation ID: {lmnr_run_id}') + logger.info(f'📊 Laminar evaluation link: {laminar_eval_link}') + else: + # No Laminar evaluation ID provided, skip tracking + logger.info('📊 No Laminar evaluation ID provided, skipping Laminar tracking') # ------------------------- # Update run data with Laminar link - run_data_update = {'laminarEvalLink': laminar_eval_link} + # run_data_update = {'laminarEvalLink': laminar_eval_link} # TODO: Update the run data on the server with the Laminar link if needed # Run the tasks @@ -1830,7 +2101,6 @@ async def run_evaluation_pipeline( end_index=end_index, headless=headless, use_vision=use_vision, - fresh_start=fresh_start, use_serp=use_serp, enable_memory=enable_memory, memory_interval=memory_interval, @@ -1839,6 +2109,8 @@ async def run_evaluation_pipeline( planner_llm=planner_llm, planner_interval=planner_interval, include_result=include_result, + highlight_elements=highlight_elements, + use_mind2web_judge=use_mind2web_judge, ) @@ -1849,7 +2121,7 @@ if 
__name__ == '__main__': parser.add_argument('--start', type=int, default=0, help='Start index') parser.add_argument('--end', type=int, default=None, help='End index (exclusive)') parser.add_argument('--headless', action='store_true', help='Run in headless mode') - parser.add_argument('--evaluate-only', action='store_true', help='Only evaluate existing results without running new tasks') + parser.add_argument( '--model', type=str, default='gpt-4o', choices=list(SUPPORTED_MODELS.keys()), help='Model to use for the agent' ) @@ -1857,12 +2129,7 @@ if __name__ == '__main__': '--eval-model', type=str, default='gpt-4o', choices=list(SUPPORTED_MODELS.keys()), help='Model to use for evaluation' ) parser.add_argument('--no-vision', action='store_true', help='Disable vision capabilities in the agent') - parser.add_argument( - '--fresh-start', - type=lambda x: (str(x).lower() == 'true'), - default=True, - help='Clear saved_trajectories before starting. Set to False to keep existing trajectories (default: True)', - ) + parser.add_argument('--user-message', type=str, default='', help='User message to include in the run') parser.add_argument('--eval-group', type=str, default='', help='Evaluation group to include in the run') parser.add_argument('--developer-id', type=str, default=None, help='Name of the developer starting the run') @@ -1893,201 +2160,178 @@ if __name__ == '__main__': action='store_true', help='Include result flag (functionality to be implemented)', ) + parser.add_argument( + '--no-highlight-elements', + action='store_false', + dest='highlight_elements', + default=True, + help='Disable highlighting of interactive elements on the page (highlighting is enabled by default)', + ) + parser.add_argument( + '--laminar-eval-id', + type=str, + default=None, + help='Existing Laminar evaluation ID to use (if not provided, Laminar tracking will be skipped)', + ) + parser.add_argument('--use-mind2web-judge', action='store_true', help='Use the original Mind2Web judge for evaluation') + args = parser.parse_args() # Set up logging - Make sure logger is configured before use in fetch function logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Define logger for the module - if args.evaluate_only: - # Just evaluate existing results - logger.info('Evaluating existing results...') - summary = calculate_local_summary() + logger.info('Running tasks...') + # Run tasks and evaluate + load_dotenv() - # Save evaluation results - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - eval_file = f'saved_trajectories/evaluation_summary_{timestamp}.json' - with open(eval_file, 'w') as f: - json.dump(summary, f, indent=2) + # --- Fetch Tasks from Server --- + CONVEX_URL = os.getenv('EVALUATION_TOOL_URL') + SECRET_KEY = os.getenv('EVALUATION_TOOL_SECRET_KEY') - logger.info(f'Evaluation complete. Success rate: {summary["success_rate"]:.2%}') - logger.info(f'Average score: {summary["average_score"]:.2f}') - logger.info(f'Full results saved to {eval_file}') + if not CONVEX_URL or not SECRET_KEY: + logger.error('Error: EVALUATION_TOOL_URL or EVALUATION_TOOL_SECRET_KEY environment variables not set.') + exit(1) # Exit if config is missing + logger.info(f"Attempting to fetch task list '{args.test_case}' from server...") + fetched_task_data = fetch_tasks_from_server(CONVEX_URL, SECRET_KEY, args.test_case) + + if fetched_task_data is None: + logger.error('Failed to fetch tasks from the server. 
Exiting.') + exit(1) # Exit if fetch fails + + try: + tasks = [Task(**task_data) for task_data in fetched_task_data] + logger.info(f'Successfully loaded {len(tasks)} tasks from the server.') + except (TypeError, ValueError) as e: + logger.error( + f'Error creating Task objects from fetched data. Ensure the data structure includes required fields (task_id, confirmed_task). Known optional fields: website, reference_length, level, cluster_id, login_cookie, login_type, category. Any additional fields will be accepted dynamically. Error: {type(e).__name__}: {e}' + ) + logger.error(f'First item in fetched data: {fetched_task_data[0] if fetched_task_data else "None"}') + exit(1) + # ----------------------------- + + # --- Start Run on Server (with optional existing Run ID) --- + if args.run_id: + logger.info(f'Initializing existing run ID: {args.run_id} with git info...') else: - logger.info('Running tasks...') - # Run tasks and evaluate - load_dotenv() + logger.info('Attempting to start a new run on the server...') - # --- Clear trajectories if fresh_start is True --- - results_dir_path = Path('saved_trajectories') - if args.fresh_start: - logger.info(f'--fresh-start is True. Clearing {results_dir_path}...') - if results_dir_path.exists(): - try: - shutil.rmtree(results_dir_path) - logger.info(f'Successfully removed {results_dir_path}.') - except OSError as e: - logger.error(f'Error removing directory {results_dir_path}: {type(e).__name__}: {e}') - # Decide if you want to exit or continue - # exit(1) # Uncomment to exit on error - else: - logger.info(f'{results_dir_path} does not exist, no need to clear.') + git_info = get_git_info() - # Recreate the directory - try: - results_dir_path.mkdir(parents=True, exist_ok=True) - logger.info(f'Recreated directory {results_dir_path}.') - except OSError as e: - logger.error(f'Error creating directory {results_dir_path}: {type(e).__name__}: {e}') - # exit(1) # Uncomment to exit on error + # Collect additional data from args to store with the run + additional_run_data = { + 'max_steps': args.max_steps, + 'parallel_runs': args.parallel_runs, + 'start_index': args.start, + 'end_index': args.end, + 'headless': args.headless, + 'use_vision': not args.no_vision, + 'task_source': args.test_case, + 'llm_judge': args.eval_model, + 'use_serp': args.use_serp, + 'enable_memory': args.enable_memory, + 'memory_interval': args.memory_interval, + 'max_actions_per_step': args.max_actions_per_step, + 'validate_output': args.validate_output, + 'planner_model': args.planner_model, + 'planner_interval': args.planner_interval, + 'include_result': args.include_result, + } + + run_data = { + 'model': args.model, + 'gitBranch': git_info['branch'], + 'gitCommitHash': git_info['hash'], + 'gitCommitTimestamp': git_info['timestamp'], + 'gitRepo': git_info['repo'], + 'userMessage': args.user_message, + 'evalGroup': args.eval_group, + 'developerId': args.developer_id, + 'totalTasks': len(tasks) - args.start if args.end is None else args.end - args.start, + 'testCaseName': args.test_case, + 'additionalData': additional_run_data, + 'laminarEvalLink': None, # Will be updated after evaluation creation + } + + run_id = start_new_run(CONVEX_URL, SECRET_KEY, run_data, existing_run_id=args.run_id) + + if not run_id: + logger.error('Failed to start/initialize run on the server. Exiting.') + exit(1) + + logger.info(f'Successfully obtained run ID: {run_id}. 
Proceeding with tasks...') + + # Log search mode being used + if args.use_serp: + if SERPER_API_KEY: + logger.info('🔍 Using SERP search (Serper API) instead of Google search') else: - logger.info('--fresh-start is False. Existing trajectories in saved_trajectories will be kept.') - # ------------------------------------------------- + logger.warning('⚠️ --use-serp flag provided but SERPER_API_KEY not set. Search will fail!') + else: + logger.info('🔍 Using default Google search') - # --- Fetch Tasks from Server --- - CONVEX_URL = os.getenv('EVALUATION_TOOL_URL') - SECRET_KEY = os.getenv('EVALUATION_TOOL_SECRET_KEY') + # Log memory configuration + if args.enable_memory: + logger.info(f'🧠 Memory enabled: mem0 system with interval={args.memory_interval} steps') + else: + logger.info('🧠 Memory disabled') - if not CONVEX_URL or not SECRET_KEY: - logger.error('Error: EVALUATION_TOOL_URL or EVALUATION_TOOL_SECRET_KEY environment variables not set.') - exit(1) # Exit if config is missing + # Log other agent configuration + logger.info(f'🎯 Max actions per step: {args.max_actions_per_step}') - logger.info(f"Attempting to fetch task list '{args.test_case}' from server...") - fetched_task_data = fetch_tasks_from_server(CONVEX_URL, SECRET_KEY, args.test_case) + if args.validate_output: + logger.info('✅ Output validation enabled') + else: + logger.info('✅ Output validation disabled') - if fetched_task_data is None: - logger.error('Failed to fetch tasks from the server. Exiting.') - exit(1) # Exit if fetch fails + if args.planner_model: + logger.info(f'🗺️ Planner enabled: {args.planner_model} (interval={args.planner_interval} steps)') + else: + logger.info('🗺️ Planner disabled') + # ------------------------- + # --- Get LLMs --- + logger.info(f'Instantiating agent LLM: {args.model}') + try: + # Get the selected LLM for the agent + llm = get_llm(args.model) + logger.info('Agent LLM instantiated successfully.') + except Exception as e: + logger.error(f'Failed to instantiate agent LLM ({args.model}): {type(e).__name__}: {e}', exc_info=True) + exit(1) + + logger.info(f'Instantiating evaluation LLM: {args.eval_model}') + try: + eval_model = get_llm(args.eval_model) + logger.info(f'Evaluation LLM ({args.eval_model}) instantiated successfully.') + except Exception as e: + logger.error( + f'Failed to instantiate evaluation LLM ({args.eval_model}): {type(e).__name__}: {e}. Make sure required API keys are set.', + exc_info=True, + ) + exit(1) + + # Get planner LLM if specified + planner_llm = None + if args.planner_model: + logger.info(f'Instantiating planner LLM: {args.planner_model}') try: - tasks = [Task(**task_data) for task_data in fetched_task_data] - logger.info(f'Successfully loaded {len(tasks)} tasks from the server.') - except (TypeError, ValueError) as e: - logger.error( - f'Error creating Task objects from fetched data. Ensure the data structure includes required fields (task_id, confirmed_task). Known optional fields: website, reference_length, level, cluster_id, login_cookie, login_type, category. Any additional fields will be accepted dynamically. 
Error: {type(e).__name__}: {e}' - ) - logger.error(f'First item in fetched data: {fetched_task_data[0] if fetched_task_data else "None"}') - exit(1) - # ----------------------------- - - # --- Start Run on Server (with optional existing Run ID) --- - if args.run_id: - logger.info(f'Initializing existing run ID: {args.run_id} with git info...') - else: - logger.info('Attempting to start a new run on the server...') - - git_info = get_git_info() - - # Collect additional data from args to store with the run - additional_run_data = { - 'max_steps': args.max_steps, - 'parallel_runs': args.parallel_runs, - 'start_index': args.start, - 'end_index': args.end, - 'headless': args.headless, - 'use_vision': not args.no_vision, - 'task_source': args.test_case, - 'llm_judge': args.eval_model, - 'fresh_start': args.fresh_start, - 'use_serp': args.use_serp, - 'enable_memory': args.enable_memory, - 'memory_interval': args.memory_interval, - 'max_actions_per_step': args.max_actions_per_step, - 'validate_output': args.validate_output, - 'planner_model': args.planner_model, - 'planner_interval': args.planner_interval, - 'include_result': args.include_result, - } - - run_data = { - 'model': args.model, - 'gitBranch': git_info['branch'], - 'gitCommitHash': git_info['hash'], - 'gitCommitTimestamp': git_info['timestamp'], - 'gitRepo': git_info['repo'], - 'userMessage': args.user_message, - 'evalGroup': args.eval_group, - 'developerId': args.developer_id, - 'totalTasks': len(tasks) - args.start if args.end is None else args.end - args.start, - 'testCaseName': args.test_case, - 'additionalData': additional_run_data, - 'laminarEvalLink': None, # Will be updated after evaluation creation - } - - run_id = start_new_run(CONVEX_URL, SECRET_KEY, run_data, existing_run_id=args.run_id) - - if not run_id: - logger.error('Failed to start/initialize run on the server. Exiting.') - exit(1) - - logger.info(f'Successfully obtained run ID: {run_id}. Proceeding with tasks...') - - # Log search mode being used - if args.use_serp: - if SERPER_API_KEY: - logger.info('🔍 Using SERP search (Serper API) instead of Google search') - else: - logger.warning('⚠️ --use-serp flag provided but SERPER_API_KEY not set. 
Search will fail!') - else: - logger.info('🔍 Using default Google search') - - # Log memory configuration - if args.enable_memory: - logger.info(f'🧠 Memory enabled: mem0 system with interval={args.memory_interval} steps') - else: - logger.info('🧠 Memory disabled') - - # Log other agent configuration - logger.info(f'🎯 Max actions per step: {args.max_actions_per_step}') - - if args.validate_output: - logger.info('✅ Output validation enabled') - else: - logger.info('✅ Output validation disabled') - - if args.planner_model: - logger.info(f'🗺️ Planner enabled: {args.planner_model} (interval={args.planner_interval} steps)') - else: - logger.info('🗺️ Planner disabled') - # ------------------------- - - # --- Get LLMs --- - logger.info(f'Instantiating agent LLM: {args.model}') - try: - # Get the selected LLM for the agent - llm = get_llm(args.model) - logger.info('Agent LLM instantiated successfully.') - except Exception as e: - logger.error(f'Failed to instantiate agent LLM ({args.model}): {type(e).__name__}: {e}', exc_info=True) - exit(1) - - logger.info(f'Instantiating evaluation LLM: {args.eval_model}') - try: - eval_model = get_llm(args.eval_model) - logger.info(f'Evaluation LLM ({args.eval_model}) instantiated successfully.') + planner_llm = get_llm(args.planner_model) + logger.info(f'Planner LLM ({args.planner_model}) instantiated successfully.') except Exception as e: logger.error( - f'Failed to instantiate evaluation LLM ({args.eval_model}): {type(e).__name__}: {e}. Make sure required API keys are set.', + f'Failed to instantiate planner LLM ({args.planner_model}): {type(e).__name__}: {e}. Make sure required API keys are set.', exc_info=True, ) exit(1) + # ----------------- - # Get planner LLM if specified - planner_llm = None - if args.planner_model: - logger.info(f'Instantiating planner LLM: {args.planner_model}') - try: - planner_llm = get_llm(args.planner_model) - logger.info(f'Planner LLM ({args.planner_model}) instantiated successfully.') - except Exception as e: - logger.error( - f'Failed to instantiate planner LLM ({args.planner_model}): {type(e).__name__}: {e}. Make sure required API keys are set.', - exc_info=True, - ) - exit(1) - # ----------------- + # Log initial system state + logger.info('🔧 EVALUATION STARTUP') + log_system_resources('STARTUP') + try: results = asyncio.run( run_evaluation_pipeline( tasks=tasks, @@ -2104,7 +2348,6 @@ if __name__ == '__main__': end_index=args.end, headless=args.headless, use_vision=not args.no_vision, - fresh_start=args.fresh_start, use_serp=args.use_serp, enable_memory=args.enable_memory, memory_interval=args.memory_interval, @@ -2113,23 +2356,22 @@ if __name__ == '__main__': planner_llm=planner_llm, planner_interval=args.planner_interval, include_result=args.include_result, + laminar_eval_id=args.laminar_eval_id, + highlight_elements=args.highlight_elements, + use_mind2web_judge=args.use_mind2web_judge, ) ) - logger.info('Task completed. 
Saving results...') - # Save results - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - results_file = f'saved_trajectories/eval_results_{timestamp}.json' + logger.info('✅ EVALUATION COMPLETED SUCCESSFULLY') + log_system_resources('SUCCESS_COMPLETION') - # Convert results to JSON-serializable format - serializable_results = {'summary': results['summary']} + except KeyboardInterrupt: + logger.warning('⚠️ EVALUATION INTERRUPTED by user (Ctrl+C)') + log_system_resources('INTERRUPTED') + raise + except Exception as e: + logger.critical(f'🚨 EVALUATION FAILED: {type(e).__name__}: {e}', exc_info=True) + log_system_resources('FAILED_COMPLETION') + raise - with open(results_file, 'w') as f: - json.dump(serializable_results, f, indent=2) - - # Print summary - summary = results['summary'] - logger.info(f'Completed {summary["total_tasks"]} tasks.') - logger.info(f'Success rate: {summary["success_rate"]:.2%}') - logger.info(f'Average score: {summary["average_score"]:.2f}') - logger.info(f'Results saved to {results_file}') + logger.info('✅ All tasks completed successfully.') diff --git a/examples/browser/multiple_agents_same_browser.py b/examples/browser/multiple_agents_same_browser.py index eaec0454b..9cea98329 100644 --- a/examples/browser/multiple_agents_same_browser.py +++ b/examples/browser/multiple_agents_same_browser.py @@ -12,14 +12,17 @@ load_dotenv() from langchain_openai import ChatOpenAI from browser_use import Agent +from browser_use.browser.profile import BrowserProfile from browser_use.browser.session import BrowserSession async def main(): browser_session = BrowserSession( - keep_alive=True, - user_data_dir=None, - headless=False, + browser_profile=BrowserProfile( + keep_alive=True, + user_data_dir=None, + headless=False, + ) ) await browser_session.start() diff --git a/examples/browser/stealth.py b/examples/browser/stealth.py index a33fe8221..7b57a5186 100644 --- a/examples/browser/stealth.py +++ b/examples/browser/stealth.py @@ -1,3 +1,4 @@ +# pyright: reportMissingImports=false import asyncio import os import shutil @@ -14,6 +15,7 @@ from imgcat import imgcat from langchain_openai import ChatOpenAI from browser_use.browser import BrowserSession +from browser_use.browser.profile import BrowserProfile from browser_use.browser.types import async_patchright llm = ChatOpenAI(model='gpt-4o') @@ -28,11 +30,13 @@ async def main(): # Default Playwright Chromium Browser normal_browser_session = BrowserSession( # executable_path=, - user_data_dir=None, - headless=False, - stealth=False, - # deterministic_rendering=False, - # disable_security=False, + browser_profile=BrowserProfile( + user_data_dir=None, + headless=False, + stealth=False, + # deterministic_rendering=False, + # disable_security=False, + ) ) await normal_browser_session.start() await normal_browser_session.create_new_tab('https://abrahamjuliot.github.io/creepjs/') @@ -45,11 +49,13 @@ async def main(): patchright_browser_session = BrowserSession( # cdp_url='wss://browser.zenrows.com?apikey=your-api-key-here&proxy_region=na', # or try anchor browser, browserless, steel.dev, browserbase, oxylabs, brightdata, etc. 
- user_data_dir='~/.config/browseruse/profiles/stealth', - stealth=True, - headless=False, - disable_security=False, - deterministic_rendering=False, + browser_profile=BrowserProfile( + user_data_dir='~/.config/browseruse/profiles/stealth', + stealth=True, + headless=False, + disable_security=False, + deterministic_rendering=False, + ) ) await patchright_browser_session.start() await patchright_browser_session.create_new_tab('https://abrahamjuliot.github.io/creepjs/') @@ -62,11 +68,13 @@ async def main(): if Path('/Applications/Brave Browser.app/Contents/MacOS/Brave Browser').is_file(): print('\n\nBRAVE BROWSER:') brave_browser_session = BrowserSession( - executable_path='/Applications/Brave Browser.app/Contents/MacOS/Brave Browser', - headless=False, - disable_security=False, - user_data_dir='~/.config/browseruse/profiles/brave', - deterministic_rendering=False, + browser_profile=BrowserProfile( + executable_path='/Applications/Brave Browser.app/Contents/MacOS/Brave Browser', + headless=False, + disable_security=False, + user_data_dir='~/.config/browseruse/profiles/brave', + deterministic_rendering=False, + ) ) await brave_browser_session.start() await brave_browser_session.create_new_tab('https://abrahamjuliot.github.io/creepjs/') @@ -78,12 +86,14 @@ async def main(): if Path('/Applications/Brave Browser.app/Contents/MacOS/Brave Browser').is_file(): print('\n\nBRAVE + PATCHRIGHT STEALTH BROWSER:') brave_patchright_browser_session = BrowserSession( - executable_path='/Applications/Brave Browser.app/Contents/MacOS/Brave Browser', playwright=patchright, - headless=False, - disable_security=False, - user_data_dir=None, - deterministic_rendering=False, + browser_profile=BrowserProfile( + executable_path='/Applications/Brave Browser.app/Contents/MacOS/Brave Browser', + headless=False, + disable_security=False, + user_data_dir=None, + deterministic_rendering=False, + ), # **patchright.devices['iPhone 13'], # emulate other devices: https://playwright.dev/python/docs/emulation ) await brave_patchright_browser_session.start() diff --git a/examples/browser/using_cdp.py b/examples/browser/using_cdp.py index f6e72cf86..6168f4204 100644 --- a/examples/browser/using_cdp.py +++ b/examples/browser/using_cdp.py @@ -25,14 +25,16 @@ from langchain_google_genai import ChatGoogleGenerativeAI from pydantic import SecretStr from browser_use import Agent, Controller -from browser_use.browser import BrowserSession +from browser_use.browser import BrowserProfile, BrowserSession api_key = os.getenv('GOOGLE_API_KEY') if not api_key: raise ValueError('GOOGLE_API_KEY is not set') browser_session = BrowserSession( - headless=False, + browser_profile=BrowserProfile( + headless=False, + ), cdp_url='http://localhost:9222', ) controller = Controller() @@ -41,6 +43,8 @@ controller = Controller() async def main(): task = 'In docs.google.com write my Papa a quick thank you for everything letter \n - Magnus' task += ' and save the document as pdf' + # Assert api_key is not None to satisfy type checker + assert api_key is not None, 'GOOGLE_API_KEY must be set' model = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key)) agent = Agent( task=task, diff --git a/examples/browser/window_sizing.py b/examples/browser/window_sizing.py index 518c7196a..3b48c9f5e 100644 --- a/examples/browser/window_sizing.py +++ b/examples/browser/window_sizing.py @@ -53,12 +53,15 @@ async def example_custom_window_size(): actual_content_size = await page.evaluate("""() => ({width: window.innerWidth, height: 
window.innerHeight})""") if profile.viewport: - expected_page_size = profile.viewport + expected_page_size = dict(profile.viewport) elif profile.window_size: expected_page_size = { 'width': profile.window_size['width'], 'height': profile.window_size['height'] - 87, } # 87px is the height of the navbar, title, rim ish + else: + # Default expected size if neither viewport nor window_size is set + expected_page_size = {'width': 800, 'height': 600} _log_size = lambda size: f'{size["width"]}x{size["height"]}px' print(f'Expected {_log_size(expected_page_size)} vs actual {_log_size(actual_content_size)}') @@ -95,7 +98,10 @@ async def example_no_viewport_option(): # Get viewport size (inner dimensions) viewport = await page.evaluate('() => ({width: window.innerWidth, height: window.innerHeight})') - print(f'Configured size: width={profile.window_size["width"]}, height={profile.window_size["height"]}') + if profile.window_size: + print(f'Configured size: width={profile.window_size["width"]}, height={profile.window_size["height"]}') + else: + print('No window size configured') print(f'Actual viewport size: {viewport}') # Get the actual window size (outer dimensions) @@ -118,7 +124,11 @@ async def example_no_viewport_option(): def validate_window_size(configured: dict[str, Any], actual: dict[str, Any]) -> None: - """Compare configured window size with actual size and report differences""" + """Compare configured window size with actual size and report differences. + + Raises: + Exception: If the window size difference exceeds tolerance + """ # Allow for small differences due to browser chrome, scrollbars, etc. width_diff = abs(configured['width'] - actual['width']) height_diff = abs(configured['height'] - actual['height']) @@ -133,6 +143,8 @@ def validate_window_size(configured: dict[str, Any], actual: dict[str, Any]) -> else: print('✅ Window size validation passed: actual size matches configured size within tolerance') + return None + async def main(): """Run all window sizing examples""" diff --git a/examples/custom-functions/2fa.py b/examples/custom-functions/2fa.py index 766dcca4a..b5a31cd3c 100644 --- a/examples/custom-functions/2fa.py +++ b/examples/custom-functions/2fa.py @@ -9,7 +9,7 @@ from dotenv import load_dotenv load_dotenv() -import pyotp +import pyotp # type: ignore from langchain_openai import ChatOpenAI from browser_use import ActionResult, Agent, Controller diff --git a/examples/custom-functions/advanced_search.py b/examples/custom-functions/advanced_search.py index d22d9c95f..cf13cdbfa 100644 --- a/examples/custom-functions/advanced_search.py +++ b/examples/custom-functions/advanced_search.py @@ -1,5 +1,5 @@ import asyncio -import http +import http.client import json import os import sys diff --git a/examples/custom-functions/custom_hooks_before_after_step.py b/examples/custom-functions/custom_hooks_before_after_step.py index 9503c533e..98bacc80f 100644 --- a/examples/custom-functions/custom_hooks_before_after_step.py +++ b/examples/custom-functions/custom_hooks_before_after_step.py @@ -46,7 +46,7 @@ def b64_to_png(b64_string: str, output_file): import json from pathlib import Path -import prettyprinter +import prettyprinter # type: ignore from fastapi import FastAPI, Request prettyprinter.install_extras() @@ -124,7 +124,7 @@ load_dotenv() import requests from langchain_openai import ChatOpenAI -from pyobjtojson import obj_to_json +from pyobjtojson import obj_to_json # type: ignore from browser_use import Agent @@ -148,14 +148,13 @@ async def record_activity(agent_obj): 
extracted_content_json_last_elem = None print('--- ON_STEP_START HOOK ---') - website_html: str = await agent_obj.browser_context.get_page_html() - website_screenshot: str = await agent_obj.browser_context.take_screenshot() + website_html = await agent_obj.browser_context.get_page_html() + website_screenshot = await agent_obj.browser_context.take_screenshot() print('--> History:') - if hasattr(agent_obj, 'state'): - history = agent_obj.state.history - else: - history = None + # Assert agent has state to satisfy type checker + assert hasattr(agent_obj, 'state'), 'Agent must have state attribute' + history = agent_obj.state.history model_thoughts = obj_to_json(obj=history.model_thoughts(), check_circular=False) diff --git a/examples/custom-functions/extract_pdf_content.py b/examples/custom-functions/extract_pdf_content.py new file mode 100755 index 000000000..20ab2de47 --- /dev/null +++ b/examples/custom-functions/extract_pdf_content.py @@ -0,0 +1,90 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.11" +# dependencies = ["browser-use", "mistralai"] +# /// + +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from dotenv import load_dotenv + +load_dotenv() + +import asyncio +import logging + +from langchain_openai import ChatOpenAI +from mistralai import Mistral # type: ignore +from pydantic import BaseModel, Field + +from browser_use import Agent, Controller +from browser_use.agent.views import ActionResult +from browser_use.browser.context import BrowserContext + +if not os.getenv('OPENAI_API_KEY'): + raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') + +if not os.getenv('MISTRAL_API_KEY'): + raise ValueError('MISTRAL_API_KEY is not set. Please add it to your environment variables.') + +logger = logging.getLogger(__name__) + +controller = Controller() + + +class PdfExtractParams(BaseModel): + url: str = Field(description='URL to a PDF document') + + +@controller.registry.action( + 'Extract PDF Text', + param_model=PdfExtractParams, +) +def extract_mistral_ocr(params: PdfExtractParams, browser: BrowserContext) -> ActionResult: + """ + Process a PDF URL using Mistral OCR API and return the OCR response. + + Args: + url: URL to a PDF document + + Returns: + OCR response object from Mistral API + """ + api_key = os.getenv('MISTRAL_API_KEY') + client = Mistral(api_key=api_key) + + response = client.ocr.process( + model='mistral-ocr-latest', + document={ + 'type': 'document_url', + 'document_url': params.url, + }, + include_image_base64=False, + ) + + markdown = '\n\n'.join(f'### Page {i + 1}\n{response.pages[i].markdown}' for i in range(len(response.pages))) + return ActionResult( + extracted_content=markdown, + include_in_memory=False, ## PDF content can be very large, so we don't include it in memory + ) + + +async def main(): + agent = Agent( + task=""" + Objective: Navigate to the following URL, extract its contents using the Extract PDF Text action, and explain its historical significance. 
+ + URL: https://docs.house.gov/meetings/GO/GO00/20220929/115171/HHRG-117-GO00-20220929-SD010.pdf + """, + llm=ChatOpenAI(model='gpt-4o'), + controller=controller, + ) + result = await agent.run() + logger.info(result) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/custom-functions/file_upload.py b/examples/custom-functions/file_upload.py index 6f0d628ff..aee4b6b60 100644 --- a/examples/custom-functions/file_upload.py +++ b/examples/custom-functions/file_upload.py @@ -10,7 +10,13 @@ from dotenv import load_dotenv load_dotenv() -import anyio +from lmnr import Laminar + +try: + Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) +except Exception: + pass + from langchain_openai import ChatOpenAI from browser_use import Agent, Controller @@ -22,9 +28,7 @@ logger = logging.getLogger(__name__) controller = Controller() -@controller.action( - 'Upload file to interactive element with file path ', -) +@controller.action('Upload file to interactive element with file path') async def upload_file(index: int, path: str, browser_session: BrowserSession, available_file_paths: list[str]): if path not in available_file_paths: return ActionResult(error=f'File path {path} is not available') @@ -57,18 +61,6 @@ async def upload_file(index: int, path: str, browser_session: BrowserSession, av return ActionResult(error=msg) -@controller.action('Read the file content of a file given a path') -async def read_file(path: str, available_file_paths: list[str]): - if path not in available_file_paths: - return ActionResult(error=f'File path {path} is not available') - - async with await anyio.open_file(path, 'r') as f: - content = await f.read() - msg = f'File content: {content}' - logger.info(msg) - return ActionResult(extracted_content=msg, include_in_memory=True) - - def create_file(file_type: str = 'txt'): with open(f'tmp.{file_type}', 'w') as f: f.write('test') @@ -79,11 +71,10 @@ def create_file(file_type: str = 'txt'): async def main(): task = 'Go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and - read the file content and upload them to fields' - task = 'Go to https://www.freepdfconvert.com/, upload the file tmp.pdf into the field choose a file - dont click the fileupload button' - + task = 'Go to https://www.freepdfconvert.com/, upload the file tmp.pdf into the field choose a file - dont click the fileupload button' available_file_paths = [create_file('txt'), create_file('pdf'), create_file('csv')] - model = ChatOpenAI(model='gpt-4o') + model = ChatOpenAI(model='gpt-4.1-mini') agent = Agent( task=task, llm=model, diff --git a/examples/custom-functions/notification.py b/examples/custom-functions/notification.py index a8c6a425d..31dde9266 100644 --- a/examples/custom-functions/notification.py +++ b/examples/custom-functions/notification.py @@ -17,7 +17,7 @@ controller = Controller() @controller.registry.action('Done with task ') async def done(text: str): - import yagmail + import yagmail # type: ignore # To send emails use # STEP 1: go to https://support.google.com/accounts/answer/185833 diff --git a/examples/custom-functions/onepassword_2fa.py b/examples/custom-functions/onepassword_2fa.py index cc58c2cce..f98f3767a 100644 --- a/examples/custom-functions/onepassword_2fa.py +++ b/examples/custom-functions/onepassword_2fa.py @@ -10,7 +10,7 @@ from dotenv import load_dotenv load_dotenv() from langchain_openai import ChatOpenAI -from onepassword.client import Client # pip install onepassword-sdk +from onepassword.client import Client # type: ignore # pip 
install onepassword-sdk from browser_use import ActionResult, Agent, Controller diff --git a/examples/custom-functions/solve_amazon_captcha.py b/examples/custom-functions/solve_amazon_captcha.py index 397b60936..7c8d3f756 100644 --- a/examples/custom-functions/solve_amazon_captcha.py +++ b/examples/custom-functions/solve_amazon_captcha.py @@ -8,7 +8,7 @@ from dotenv import load_dotenv load_dotenv() -from amazoncaptcha import AmazonCaptcha +from amazoncaptcha import AmazonCaptcha # type: ignore from langchain_openai import ChatOpenAI from browser_use import ActionResult diff --git a/examples/features/custom_vector_store.py b/examples/features/custom_vector_store.py index ec42b3151..77088bbfe 100644 --- a/examples/features/custom_vector_store.py +++ b/examples/features/custom_vector_store.py @@ -59,19 +59,22 @@ async def run_agent_with_memory_config( # Let's refine how to access summaries. The summary is added as a 'memory' type message. summaries_created = [] - for step_messages in agent.message_manager.state.history.get_messages(): - if isinstance(step_messages, list): - for msg in step_messages: - if ( - hasattr(msg, 'additional_kwargs') - and msg.additional_kwargs.get('metadata', {}).get('message_type') == 'memory' - ): - summaries_created.append(msg.content) - elif ( - hasattr(step_messages, 'additional_kwargs') - and step_messages.additional_kwargs.get('metadata', {}).get('message_type') == 'memory' - ): # if it's a list of messages - summaries_created.append(step_messages.content) + for item in agent.message_manager.state.history.get_messages(): + # get_messages() returns tuples of (step_number, messages) + if isinstance(item, tuple) and len(item) == 2: + step_number, step_messages = item + if isinstance(step_messages, list): + for msg in step_messages: + if ( + hasattr(msg, 'additional_kwargs') + and msg.additional_kwargs.get('metadata', {}).get('message_type') == 'memory' + ): + summaries_created.append(msg.content) + elif ( + hasattr(step_messages, 'additional_kwargs') + and step_messages.additional_kwargs.get('metadata', {}).get('message_type') == 'memory' + ): + summaries_created.append(step_messages.content) if summaries_created: print('\nProcedural Summaries Created during run:') @@ -169,5 +172,9 @@ if __name__ == '__main__': import sys if sys.platform.startswith('win'): - asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) + # WindowsProactorEventLoopPolicy is only available on Windows + try: + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) # type: ignore + except AttributeError: + pass # Not on Windows, ignore asyncio.run(main()) diff --git a/examples/features/download_file.py b/examples/features/download_file.py index 912c55486..1c2b03c86 100644 --- a/examples/features/download_file.py +++ b/examples/features/download_file.py @@ -18,11 +18,16 @@ api_key = os.getenv('GOOGLE_API_KEY') if not api_key: raise ValueError('GOOGLE_API_KEY is not set') +assert api_key is not None, 'GOOGLE_API_KEY must be set' llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key)) +from browser_use.browser import BrowserProfile + browser_session = BrowserSession( - downloads_path='~/Downloads', - user_data_dir='~/.config/browseruse/profiles/default', + browser_profile=BrowserProfile( + downloads_path='~/Downloads', + user_data_dir='~/.config/browseruse/profiles/default', + ) ) diff --git a/examples/features/parallel_agents.py b/examples/features/parallel_agents.py index 17b379465..0912a30fa 100644 --- 
a/examples/features/parallel_agents.py +++ b/examples/features/parallel_agents.py @@ -11,13 +11,15 @@ load_dotenv() from langchain_openai import ChatOpenAI from browser_use.agent.service import Agent -from browser_use.browser import BrowserSession +from browser_use.browser import BrowserProfile, BrowserSession browser_session = BrowserSession( - keep_alive=True, - headless=False, - save_recording_path='./tmp/recordings', - user_data_dir='~/.config/browseruse/profiles/default', + browser_profile=BrowserProfile( + keep_alive=True, + headless=False, + record_video_dir='./tmp/recordings', + user_data_dir='~/.config/browseruse/profiles/default', + ) ) llm = ChatOpenAI(model='gpt-4o') diff --git a/examples/features/result_processing.py b/examples/features/result_processing.py index b65212afd..38c12847b 100644 --- a/examples/features/result_processing.py +++ b/examples/features/result_processing.py @@ -22,7 +22,7 @@ async def main(): async with BrowserSession( browser_profile=BrowserProfile( headless=False, - trace_path='./tmp/result_processing', + traces_dir='./tmp/result_processing', window_size={'width': 1280, 'height': 1000}, user_data_dir='~/.config/browseruse/profiles/default', ) diff --git a/examples/features/save_trace.py b/examples/features/save_trace.py index 65e1c0803..b0701fd07 100644 --- a/examples/features/save_trace.py +++ b/examples/features/save_trace.py @@ -19,7 +19,7 @@ llm = ChatOpenAI(model='gpt-4o', temperature=0.0) async def main(): browser_session = BrowserSession( browser_profile=BrowserProfile( - trace_path='./tmp/traces/', + traces_dir='./tmp/traces/', user_data_dir='~/.config/browseruse/profiles/default', ) ) diff --git a/examples/features/sensitive_data.py b/examples/features/sensitive_data.py index 3924a612b..c38be553c 100644 --- a/examples/features/sensitive_data.py +++ b/examples/features/sensitive_data.py @@ -11,6 +11,7 @@ load_dotenv() from langchain_openai import ChatOpenAI from browser_use import Agent +from browser_use.browser import BrowserProfile # Initialize the model llm = ChatOpenAI( @@ -25,7 +26,8 @@ llm = ChatOpenAI( company_credentials = {'company_username': 'user@example.com', 'company_password': 'securePassword123'} # Map the same credentials to multiple domains for secure access control -sensitive_data = { +# Type annotation to satisfy pyright +sensitive_data: dict[str, str | dict[str, str]] = { 'https://example.com': company_credentials, 'https://admin.example.com': company_credentials, 'https://*.example-staging.com': company_credentials, @@ -40,8 +42,10 @@ task = 'Go to example.com and login with company_username and company_password' from browser_use.browser.session import BrowserSession browser_session = BrowserSession( - allowed_domains=list(sensitive_data.keys()) - + ['https://*.trusted-partner.com'] # Domain patterns from sensitive_data + additional allowed domains + browser_profile=BrowserProfile( + allowed_domains=list(sensitive_data.keys()) + + ['https://*.trusted-partner.com'] # Domain patterns from sensitive_data + additional allowed domains + ) ) agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data, browser_session=browser_session) diff --git a/examples/integrations/browserbase_stagehand.py b/examples/integrations/browserbase_stagehand.py index e05a12636..73764368d 100644 --- a/examples/integrations/browserbase_stagehand.py +++ b/examples/integrations/browserbase_stagehand.py @@ -1,3 +1,16 @@ +""" +EXPERIMENTAL: Integration example with Stagehand (browserbase) + +This example shows how to combine browser-use with Stagehand 
for advanced browser automation. +Note: This requires the stagehand-py library to be installed separately: + pip install stagehand-py + +The exact API may vary depending on the stagehand-py version. +Please refer to the official Stagehand documentation for the latest usage: + https://pypi.org/project/stagehand-py/ + https://github.com/browserbase/stagehand-python-examples/ +""" + import asyncio import os @@ -5,7 +18,7 @@ from dotenv import load_dotenv load_dotenv() -from stagehand import Stagehand, StagehandConfig +from stagehand import Stagehand, StagehandConfig # type: ignore from browser_use.agent.service import Agent @@ -14,18 +27,14 @@ async def main(): # Configure Stagehand # https://pypi.org/project/stagehand-py/ # https://github.com/browserbase/stagehand-python-examples/blob/main/agent_example.py - config = StagehandConfig( - env='BROWSERBASE', - api_key=os.getenv('BROWSERBASE_API_KEY'), - project_id=os.getenv('BROWSERBASE_PROJECT_ID'), - headless=False, - dom_settle_timeout_ms=3000, - model_name='gpt-4o', - self_heal=True, - wait_for_captcha_solves=True, - system_prompt='You are a browser automation assistant that helps users navigate websites effectively.', - model_client_options={'model_api_key': os.getenv('OPENAI_API_KEY')}, - verbose=2, + # Note: This example requires the stagehand-py library to be installed + # pip install stagehand-py + + # Create StagehandConfig with correct parameters + # The exact parameters depend on the stagehand-py version + config = StagehandConfig( # type: ignore + apiKey=os.getenv('BROWSERBASE_API_KEY'), + projectId=os.getenv('BROWSERBASE_PROJECT_ID'), ) # Create a Stagehand client using the configuration object. @@ -40,18 +49,21 @@ async def main(): print(f'\nCreated new session: {stagehand.session_id}') print(f'🌐 View your live browser: https://www.browserbase.com/sessions/{stagehand.session_id}') - await stagehand.page.goto('https://google.com/') - - await stagehand.page.act('search for openai') + # Check if stagehand has a page attribute + if hasattr(stagehand, 'page') and stagehand.page: + await stagehand.page.goto('https://google.com/') + await stagehand.page.act('search for openai') + else: + print('Warning: Stagehand page not available') # Combine with Browser Use - agent = Agent(task='click the first result', page=stagehand.page) + agent = Agent(task='click the first result', page=stagehand.page) # type: ignore await agent.run() # go back and forth - await stagehand.page.act('open the 3 first links on the page in new tabs') + await stagehand.page.act('open the 3 first links on the page in new tabs') # type: ignore - await Agent(task='click the first result', page=stagehand.page).run() + await Agent(task='click the first result', page=stagehand.page).run() # type: ignore if __name__ == '__main__': diff --git a/examples/integrations/discord/discord_api.py b/examples/integrations/discord/discord_api.py index 676014c12..ba9484a2d 100644 --- a/examples/integrations/discord/discord_api.py +++ b/examples/integrations/discord/discord_api.py @@ -7,8 +7,8 @@ from dotenv import load_dotenv load_dotenv() -import discord -from discord.ext import commands +import discord # type: ignore +from discord.ext import commands # type: ignore from langchain_core.language_models.chat_models import BaseChatModel from browser_use.agent.service import Agent @@ -56,7 +56,7 @@ class DiscordBot(commands.Bot): self.browser_profile = browser_profile # Define intents. 
- intents = discord.Intents.default() + intents = discord.Intents.default() # type: ignore intents.message_content = True # Enable message content intent intents.members = True # Enable members intent for user info diff --git a/examples/integrations/slack/slack_api.py b/examples/integrations/slack/slack_api.py index 06d045cc1..68cc1f7a4 100644 --- a/examples/integrations/slack/slack_api.py +++ b/examples/integrations/slack/slack_api.py @@ -11,9 +11,9 @@ load_dotenv() from fastapi import Depends, FastAPI, HTTPException, Request from langchain_core.language_models.chat_models import BaseChatModel -from slack_sdk.errors import SlackApiError -from slack_sdk.signature import SignatureVerifier -from slack_sdk.web.async_client import AsyncWebClient +from slack_sdk.errors import SlackApiError # type: ignore +from slack_sdk.signature import SignatureVerifier # type: ignore +from slack_sdk.web.async_client import AsyncWebClient # type: ignore from browser_use.agent.service import Agent from browser_use.browser import BrowserProfile, BrowserSession diff --git a/examples/models/azure_openai.py b/examples/models/azure_openai.py index 1e8da90b9..cb260997f 100644 --- a/examples/models/azure_openai.py +++ b/examples/models/azure_openai.py @@ -15,6 +15,7 @@ from dotenv import load_dotenv load_dotenv() from langchain_openai import AzureChatOpenAI +from pydantic import SecretStr from browser_use import Agent @@ -28,7 +29,7 @@ if not azure_openai_api_key or not azure_openai_endpoint: # Initialize the Azure OpenAI client llm = AzureChatOpenAI( model='gpt-4o', - api_key=azure_openai_api_key, + api_key=SecretStr(azure_openai_api_key) if azure_openai_api_key else None, azure_endpoint=azure_openai_endpoint, # Corrected to use azure_endpoint instead of openai_api_base api_version='2024-08-01-preview', # Explicitly set the API version here ) diff --git a/examples/models/bedrock_claude.py b/examples/models/bedrock_claude.py index 468e60108..2e5ef44f7 100644 --- a/examples/models/bedrock_claude.py +++ b/examples/models/bedrock_claude.py @@ -1,3 +1,4 @@ +# pyright: reportMissingImports=false """ Automated news analysis and sentiment scoring using Bedrock. 
@@ -17,9 +18,9 @@ from dotenv import load_dotenv load_dotenv() -import boto3 +import boto3 # type: ignore from botocore.config import Config -from langchain_aws import ChatBedrockConverse +from langchain_aws import ChatBedrockConverse # type: ignore from browser_use import Agent from browser_use.browser import BrowserProfile, BrowserSession diff --git a/examples/models/llama4-groq.py b/examples/models/llama4-groq.py index dfcfc2691..a6bbe6095 100644 --- a/examples/models/llama4-groq.py +++ b/examples/models/llama4-groq.py @@ -9,13 +9,15 @@ from dotenv import load_dotenv load_dotenv() from langchain_openai import ChatOpenAI +from pydantic import SecretStr from browser_use import Agent +groq_api_key = os.environ.get('GROQ_API_KEY') llm = ChatOpenAI( model='meta-llama/llama-4-maverick-17b-128e-instruct', base_url='https://api.groq.com/openai/v1', - api_key=os.environ.get('GROQ_API_KEY'), + api_key=SecretStr(groq_api_key) if groq_api_key else None, temperature=0.0, ) diff --git a/examples/ui/gradio_demo.py b/examples/ui/gradio_demo.py index 9b4a86a76..635f3f4ee 100644 --- a/examples/ui/gradio_demo.py +++ b/examples/ui/gradio_demo.py @@ -1,3 +1,4 @@ +# pyright: reportMissingImports=false import asyncio import os import sys @@ -10,7 +11,7 @@ from dotenv import load_dotenv load_dotenv() # Third-party imports -import gradio as gr +import gradio as gr # type: ignore from langchain_openai import ChatOpenAI from rich.console import Console from rich.panel import Panel @@ -52,6 +53,8 @@ def parse_agent_history(history_str: str) -> None: console.print(panel) console.print() + return None + async def run_browser_task( task: str, @@ -70,8 +73,8 @@ async def run_browser_task( llm=ChatOpenAI(model='gpt-4o'), ) result = await agent.run() - # TODO: The result cloud be parsed better - return result + # TODO: The result could be parsed better + return str(result) except Exception as e: return f'Error: {str(e)}' diff --git a/examples/ui/streamlit_demo.py b/examples/ui/streamlit_demo.py index d18e11a11..aa0316a20 100644 --- a/examples/ui/streamlit_demo.py +++ b/examples/ui/streamlit_demo.py @@ -15,7 +15,7 @@ from dotenv import load_dotenv load_dotenv() -import streamlit as st +import streamlit as st # type: ignore from browser_use import Agent from browser_use.browser import BrowserSession @@ -48,6 +48,7 @@ def get_llm(provider: str): else: st.error(f'Unsupported provider: {provider}') st.stop() + return None # Never reached, but helps with type checking # Function to initialize the agent @@ -58,7 +59,7 @@ def initialize_agent(query: str, provider: str): return Agent( task=query, - llm=llm, + llm=llm, # type: ignore controller=controller, browser_session=browser_session, use_vision=True, diff --git a/examples/use-cases/find_and_apply_to_jobs.py b/examples/use-cases/find_and_apply_to_jobs.py index c6a01b270..c63102f7e 100644 --- a/examples/use-cases/find_and_apply_to_jobs.py +++ b/examples/use-cases/find_and_apply_to_jobs.py @@ -20,7 +20,7 @@ load_dotenv() from langchain_openai import AzureChatOpenAI from pydantic import BaseModel, SecretStr -from PyPDF2 import PdfReader +from PyPDF2 import PdfReader # type: ignore from browser_use import ActionResult, Agent, Controller from browser_use.browser import BrowserProfile, BrowserSession diff --git a/examples/use-cases/google_sheets.py b/examples/use-cases/google_sheets.py index ac42d2031..5adf496c6 100644 --- a/examples/use-cases/google_sheets.py +++ b/examples/use-cases/google_sheets.py @@ -39,8 +39,8 @@ async def main(): browser_profile=BrowserProfile( 
executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', user_data_dir='~/.config/browseruse/profiles/default', + keep_alive=True, ), - keep_alive=True, ) async with browser_session: diff --git a/examples/use-cases/play_chess.py b/examples/use-cases/play_chess.py index 5294369f3..1e917e86c 100755 --- a/examples/use-cases/play_chess.py +++ b/examples/use-cases/play_chess.py @@ -16,7 +16,7 @@ load_dotenv() import asyncio import logging -import chess +import chess # type: ignore from bs4 import BeautifulSoup from langchain_openai import ChatOpenAI from pydantic import BaseModel, Field @@ -64,7 +64,7 @@ def parse_transform(style: str) -> tuple[float, float] | None: return x_px_str, y_px_str except Exception as e: logger.error(f'Error parsing transform style: {e}') - return None, None + return None def algebraic_to_pixels(square: str, square_size: float) -> tuple[str, str]: @@ -107,9 +107,12 @@ async def calculate_square_size(page) -> float | None: raise ValueError('No pieces found.') x_coords: set[float] = set() for piece in pieces: - style = piece.get('style') + if hasattr(piece, 'get'): + style = piece.get('style') # type: ignore + else: + continue if style: - coords = parse_transform(style) + coords = parse_transform(style) # type: ignore if coords: x_coords.add(coords[0]) @@ -151,7 +154,7 @@ def create_fen_board(board_state: dict) -> str: return fen -async def get_current_board_info(page) -> tuple[str | None, float]: +async def get_current_board_info(page) -> tuple[str | None, float | None]: """Reads the current board HTML and returns FEN string and square size.""" board_state = {} board_html = '' @@ -172,16 +175,18 @@ async def get_current_board_info(page) -> tuple[str | None, float]: soup = BeautifulSoup(board_html, 'html.parser') pieces = soup.find_all('piece') for piece in pieces: - style = piece.get('style') - class_ = piece.get('class') + if not hasattr(piece, 'get'): + continue + style = piece.get('style') # type: ignore + class_ = piece.get('class') # type: ignore if style and class_: - coords = parse_transform(style) + coords = parse_transform(style) # type: ignore if coords: x_px, y_px = coords try: square = pixels_to_algebraic(x_px, y_px, square_size) - board_state[square] = get_piece_symbol(class_) + board_state[square] = get_piece_symbol(class_) # type: ignore except ValueError as ve: logger.error(f'Error: {ve}') @@ -257,7 +262,7 @@ async def play_move(params: PlayMoveParams, browser: BrowserContext): try: current_fen, square_size = await get_current_board_info(page) - if not current_fen or not square_size: + if not current_fen or square_size is None: return ActionResult(extracted_content='Failed to get current FEN or square size to play move.') board = chess.Board(current_fen) diff --git a/examples/use-cases/post-twitter.py b/examples/use-cases/post-twitter.py index 3472d4712..c57a55556 100644 --- a/examples/use-cases/post-twitter.py +++ b/examples/use-cases/post-twitter.py @@ -55,8 +55,11 @@ class TwitterConfig: # Customize these settings +openai_key = os.getenv('OPENAI_API_KEY') +assert openai_key is not None, 'OPENAI_API_KEY must be set' + config = TwitterConfig( - openai_api_key=os.getenv('OPENAI_API_KEY'), + openai_api_key=openai_key, chrome_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', # This is for MacOS (Chrome) target_user='XXXXX', message='XXXXX', @@ -66,7 +69,9 @@ config = TwitterConfig( def create_twitter_agent(config: TwitterConfig) -> Agent: - llm = ChatOpenAI(model=config.model, api_key=config.openai_api_key) + from 
pydantic import SecretStr + + llm = ChatOpenAI(model=config.model, api_key=SecretStr(config.openai_api_key)) browser_profile = BrowserProfile( headless=config.headless, @@ -80,7 +85,7 @@ def create_twitter_agent(config: TwitterConfig) -> Agent: full_message = f'@{config.target_user} {config.message}' # Create the agent with detailed instructions - return Agent( + agent = Agent( task=f"""Navigate to Twitter and create a post and reply to a tweet. Here are the specific steps: @@ -106,12 +111,12 @@ def create_twitter_agent(config: TwitterConfig) -> Agent: controller=controller, browser_session=browser_session, ) + return agent async def post_tweet(agent: Agent): try: await agent.run(max_steps=100) - agent.create_history_gif() print('Tweet posted successfully!') except Exception as e: print(f'Error posting tweet: {str(e)}') diff --git a/pyproject.toml b/pyproject.toml index e37c2ff2f..7873a4ce7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "browser-use" description = "Make websites accessible for AI agents" authors = [{ name = "Gregor Zunic" }] -version = "0.3.1" +version = "0.3.2" readme = "README.md" requires-python = ">=3.11,<4.0" classifiers = [ @@ -13,8 +13,7 @@ dependencies = [ "aiofiles>=24.1.0", "anyio>=4.9.0", - "bubus>=1.1.0", - "faiss-cpu>=1.11.0", + "bubus>=1.1.2", "google-api-core>=2.25.0", "httpx>=0.28.1", "langchain-anthropic==0.3.15", @@ -53,6 +52,8 @@ dependencies = [ [project.optional-dependencies] memory = [ # sentence-transformers: depends on pytorch, which does not support python 3.13 yet + # faiss-cpu: >= 1.11.0 breaks on some macOS hosts, make sure to test before upgrading + "faiss-cpu>=1.10.0", "sentence-transformers>=4.0.2", ] cli = [ @@ -63,7 +64,7 @@ cli = [ examples = [ # botocore: only needed for Bedrock Claude boto3 examples/models/bedrock_claude.py "botocore>=1.37.23", - "langchain-aws>=0.2.24", + # "langchain-aws>=0.2.24", # depends on version of numpy that doesn't have python 3.12 wheels yet, breaks CI "imgcat>=0.6.0", "stagehand-py>=0.3.6", "browserbase>=0.4.0", @@ -114,7 +115,9 @@ docstring-code-line-length = 140 skip-magic-trailing-comma = false [tool.pyright] -typeCheckingMode = "off" +typeCheckingMode = "basic" +exclude = ["tests/old/", ".venv/", ".git/", "__pycache__/"] + [tool.hatch.build] include = [ @@ -123,6 +126,7 @@ include = [ "!browser_use/**/tests.py", "browser_use/agent/system_prompt.md", "browser_use/dom/buildDomTree.js", + "!tests/**/*.py", ] [tool.pytest.ini_options] @@ -153,6 +157,15 @@ log_level = "DEBUG" allow-direct-references = true [tool.uv] +# required-environments = [ +# "sys_platform == 'darwin' and platform_machine == 'arm64'", +# "sys_platform == 'darwin' and platform_machine == 'x86_64'", +# "sys_platform == 'linux' and platform_machine == 'x86_64'", +# "sys_platform == 'linux' and platform_machine == 'aarch64'", +# # "sys_platform == 'linux' and platform_machine == 'arm64'", # no pytorch wheels available yet +# "sys_platform == 'win32' and platform_machine == 'x86_64'", +# # "sys_platform == 'win32' and platform_machine == 'arm64'", # no pytorch wheels available yet +# ] dev-dependencies = [ "ruff>=0.11.2", "tokencost>=0.1.16", @@ -169,6 +182,6 @@ dev-dependencies = [ "codespell>=2.4.1", "pyright>=1.1.399", "ty>=0.0.1a1", - "pytest-xdist>=3.7.0" + "pytest-xdist>=3.7.0", # "pytest-playwright-asyncio>=0.7.0", # not actually needed I think ] diff --git a/tests/ci/conftest.py b/tests/ci/conftest.py new file mode 100644 index 000000000..040853396 --- /dev/null +++ b/tests/ci/conftest.py @@ -0,0 
+1,226 @@ +""" +Pytest configuration for browser-use CI tests. + +Sets up environment variables to ensure tests never connect to production services. +""" + +import os +import tempfile +from unittest.mock import AsyncMock, MagicMock + +import pytest +from dotenv import load_dotenv +from langchain_core.language_models import BaseChatModel +from langchain_core.messages import AIMessage +from pytest_httpserver import HTTPServer + +# Load environment variables before any imports +load_dotenv() + + +# Skip LLM API key verification for tests +os.environ['SKIP_LLM_API_KEY_VERIFICATION'] = 'true' + +from bubus import BaseEvent + +from browser_use import Agent +from browser_use.browser import BrowserProfile, BrowserSession +from browser_use.sync.service import CloudSync + + +@pytest.fixture(autouse=True) +def setup_test_environment(): + """ + Automatically set up test environment for all tests. + """ + + # Create a temporary directory for test config + config_dir = tempfile.mkdtemp(prefix='browseruse_tests_') + + original_env = {} + test_env_vars = { + 'SKIP_LLM_API_KEY_VERIFICATION': 'true', + 'ANONYMIZED_TELEMETRY': 'false', + 'BROWSER_USE_CLOUD_SYNC': 'true', + 'BROWSER_USE_CLOUD_API_URL': 'http://placeholder-will-be-replaced-by-specific-test-fixtures', + 'BROWSER_USE_CLOUD_UI_URL': 'http://placeholder-will-be-replaced-by-specific-test-fixtures', + 'BROWSER_USE_CONFIG_DIR': config_dir, + } + + for key, value in test_env_vars.items(): + original_env[key] = os.environ.get(key) + os.environ[key] = value + + yield + + # Restore original environment + for key, value in original_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + + +# not a fixture, mock_llm() provides this in a fixture below, this is a helper so that it can accept args +def create_mock_llm(actions=None): + """Create a mock LLM that returns specified actions or a default done action. + + Args: + actions: Optional list of JSON strings representing actions to return in sequence. + If not provided, returns a single done action. + After all actions are exhausted, returns a done action. + + Returns: + Mock LLM that will return the actions in order, or just a done action if no actions provided. 
+ """ + llm = AsyncMock(spec=BaseChatModel) + llm.model_name = 'mock-llm' + llm._verified_api_keys = True + llm._verified_tool_calling_method = 'raw' + # llm._verified_tool_calling_method = 'function_calling' + + # Default done action + default_done_action = """ + { + "thinking": "null", + "evaluation_previous_goal": "Successfully completed the task", + "memory": "Task completed", + "next_goal": "Task completed", + "action": [ + { + "done": { + "text": "Task completed successfully", + "success": true + } + } + ] + } + """ + + if actions is None: + # No actions provided, just return done action + async def async_invoke(*args, **kwargs): + return AIMessage(content=default_done_action) + + llm.invoke.return_value = AIMessage(content=default_done_action) + llm.ainvoke.side_effect = async_invoke + else: + # Actions provided, return them in sequence with structured output support + action_index = 0 + + def get_next_action(): + nonlocal action_index + if action_index < len(actions): + action = actions[action_index] + action_index += 1 + return action + else: + return default_done_action + + async def mock_ainvoke(*args, **kwargs): + return AIMessage(content=get_next_action()) + + def mock_invoke(*args, **kwargs): + return AIMessage(content=get_next_action()) + + llm.invoke.side_effect = mock_invoke + llm.ainvoke.side_effect = mock_ainvoke + + # Mock the with_structured_output method to return parsed objects + structured_llm = MagicMock() + + async def mock_structured_ainvoke(*args, **kwargs): + # The agent will create its own AgentOutput and ActionModel classes + # We return the raw response and let the agent parse it + return { + 'raw': AIMessage(content=get_next_action()), + 'parsed': None, # Let the agent parse it from the raw JSON + } + + structured_llm.ainvoke = AsyncMock(side_effect=mock_structured_ainvoke) + llm.with_structured_output = lambda *args, **kwargs: structured_llm + + return llm + + +@pytest.fixture(scope='module') +async def browser_session(): + """Create a real browser session for testing""" + session = BrowserSession( + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, # Use temporary directory + keep_alive=True, + ) + ) + await session.start() + yield session + await session.stop() + + +@pytest.fixture(scope='function') +def cloud_sync(httpserver: HTTPServer): + """ + Create a CloudSync instance configured for testing. + + This fixture creates a real CloudSync instance and sets up the test environment + to use the httpserver URLs. 
+ """ + + # Set up test environment + test_http_server_url = httpserver.url_for('') + os.environ['BROWSER_USE_CLOUD_API_URL'] = test_http_server_url + os.environ['BROWSER_USE_CLOUD_UI_URL'] = test_http_server_url + os.environ['BROWSER_USE_CLOUD_SYNC'] = 'true' + + # Create CloudSync with test server URL + cloud_sync = CloudSync( + base_url=test_http_server_url, + enable_auth=False, # Disable auth for most tests, they can override this if needed + ) + + return cloud_sync + + +@pytest.fixture(scope='function') +def mock_llm(): + """Create a mock LLM that just returns the done action if queried""" + return create_mock_llm(actions=None) + + +@pytest.fixture(scope='function') +def agent_with_cloud(browser_session, mock_llm, cloud_sync): + """Create agent with cloud sync enabled (using real CloudSync).""" + agent = Agent( + task='Test task', + llm=mock_llm, + browser_session=browser_session, + cloud_sync=cloud_sync, + ) + return agent + + +@pytest.fixture(scope='function') +def event_collector(): + """Helper to collect all events emitted during tests""" + events = [] + event_order = [] + + class EventCollector: + def __init__(self): + self.events = events + self.event_order = event_order + + async def collect_event(self, event: BaseEvent): + self.events.append(event) + self.event_order.append(event.event_type) + return 'collected' + + def get_events_by_type(self, event_type: str) -> list[BaseEvent]: + return [e for e in self.events if e.event_type == event_type] + + def clear(self): + self.events.clear() + self.event_order.clear() + + return EventCollector() diff --git a/tests/ci/evaluate_tasks.py b/tests/ci/evaluate_tasks.py index 53c4e46fd..24f70ddbe 100644 --- a/tests/ci/evaluate_tasks.py +++ b/tests/ci/evaluate_tasks.py @@ -141,7 +141,7 @@ Reply in JSON with keys: success (true/false), explanation (string). If the agent provided no output, explain what might have gone wrong. """ structured_llm = judge_llm.with_structured_output(JudgeResponse) - judge_response = await structured_llm.ainvoke(judge_prompt) + judge_response: JudgeResponse = await structured_llm.ainvoke(judge_prompt) # type: ignore[assignment] result = { 'file': os.path.basename(task_file), diff --git a/tests/ci/mocks.py b/tests/ci/mocks.py deleted file mode 100644 index 7399a135e..000000000 --- a/tests/ci/mocks.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Mock utilities for testing browser-use.""" - -from unittest.mock import AsyncMock - -from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.messages import AIMessage - - -def create_mock_llm(actions=None): - """Create a mock LLM that returns specified actions or a default done action. - - Args: - actions: Optional list of JSON strings representing actions to return in sequence. - If not provided, returns a single done action. - After all actions are exhausted, returns a done action. - - Returns: - Mock LLM that will return the actions in order, or just a done action if no actions provided. 
- """ - mock = AsyncMock(spec=BaseChatModel) - mock._verified_api_keys = True - mock._verified_tool_calling_method = 'raw' - mock.model_name = 'mock-llm' - - # Default done action - default_done_action = """ - { - "thinking": "null", - "evaluation_previous_goal": "Successfully completed the task", - "memory": "Task completed", - "next_goal": "Task completed", - "action": [ - { - "done": { - "text": "Task completed successfully", - "success": true - } - } - ] - } - """ - - if actions is None: - # No actions provided, just return done action - mock.invoke.return_value = AIMessage(content=default_done_action) - - async def async_invoke(*args, **kwargs): - return AIMessage(content=default_done_action) - - mock.ainvoke.side_effect = async_invoke - else: - # Actions provided, return them in sequence - action_index = 0 - - def get_next_action(): - nonlocal action_index - if action_index < len(actions): - action = actions[action_index] - action_index += 1 - return action - else: - return default_done_action - - # Mock the invoke method - def mock_invoke(*args, **kwargs): - return AIMessage(content=get_next_action()) - - mock.invoke.side_effect = mock_invoke - - # Create an async version - async def mock_ainvoke(*args, **kwargs): - return AIMessage(content=get_next_action()) - - mock.ainvoke.side_effect = mock_ainvoke - - return mock diff --git a/tests/ci/test_action_parameter_injection.py b/tests/ci/test_action_parameter_injection.py index 945a07579..b67a57a6c 100644 --- a/tests/ci/test_action_parameter_injection.py +++ b/tests/ci/test_action_parameter_injection.py @@ -58,9 +58,10 @@ class TestBrowserContext: async def browser_session(self): """Create and provide a BrowserSession instance with security disabled.""" browser_session = BrowserSession( - # browser_profile=BrowserProfile(...), - headless=True, - user_data_dir=None, + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, + ) ) await browser_session.start() yield browser_session @@ -356,7 +357,7 @@ class TestBrowserContext: assert 'simple_action' in action_model.model_fields # Create an instance with the simple_action - action_instance = action_model(simple_action={}) + action_instance = action_model(simple_action={}) # type: ignore[call-arg] # Test that model_dump works correctly dumped = action_instance.model_dump(exclude_unset=True) diff --git a/tests/ci/test_agent_multiprocessing.py b/tests/ci/test_agent_multiprocessing.py index 9d3acffa2..71b08ca40 100644 --- a/tests/ci/test_agent_multiprocessing.py +++ b/tests/ci/test_agent_multiprocessing.py @@ -23,7 +23,7 @@ from langchain_core.messages import AIMessage from browser_use import Agent, setup_logging from browser_use.browser import BrowserProfile, BrowserSession from browser_use.browser.types import async_playwright -from tests.ci.mocks import create_mock_llm +from tests.ci.conftest import create_mock_llm # Set up test logging setup_logging() @@ -147,9 +147,11 @@ class TestParallelism: # Create a shared browser session browser_session = BrowserSession( - headless=True, - user_data_dir=None, # Use temp directory - keep_alive=True, + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, # Use temp directory + keep_alive=True, + ) ) try: @@ -197,9 +199,11 @@ class TestParallelism: # Create a shared browser session browser_session = BrowserSession( - headless=True, - user_data_dir=None, # Use temp directory - keep_alive=True, + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, # Use temp directory + keep_alive=True, + ) ) try: @@ 
-400,9 +404,11 @@ class TestParallelism: # Create shared browser session shared_session = BrowserSession( - headless=True, - user_data_dir=None, - keep_alive=True, + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, + keep_alive=True, + ) ) try: @@ -459,9 +465,11 @@ class TestParallelism: # Create a session with keep_alive session = BrowserSession( - headless=True, - user_data_dir=None, - keep_alive=True, + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, + keep_alive=True, + ) ) try: @@ -514,13 +522,15 @@ class TestParallelism: # Create session with existing playwright objects browser_session = BrowserSession( - page=page, + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, + keep_alive=False, + ), + agent_current_page=page, browser_context=context, browser=browser, playwright=playwright, - headless=True, - user_data_dir=None, - keep_alive=False, ) # Create mock LLM diff --git a/tests/ci/test_agent_sensitive_data.py b/tests/ci/test_agent_sensitive_data.py index d2b8f664a..4edada7aa 100644 --- a/tests/ci/test_agent_sensitive_data.py +++ b/tests/ci/test_agent_sensitive_data.py @@ -42,7 +42,12 @@ def test_replace_sensitive_data_with_missing_keys(registry, caplog): # Set log level to capture warnings import logging - caplog.set_level(logging.WARNING) + # Temporarily enable propagation for browser_use logger to capture logs + browser_use_logger = logging.getLogger('browser_use') + original_propagate = browser_use_logger.propagate + browser_use_logger.propagate = True + + caplog.set_level(logging.WARNING, logger='browser_use.controller.registry.service') # Create a simple Pydantic model with sensitive data placeholders params = SensitiveParams(text='Please enter username and password') @@ -83,13 +88,21 @@ def test_replace_sensitive_data_with_missing_keys(registry, caplog): assert 'password' in caplog.text caplog.clear() + # Restore original propagate setting + browser_use_logger.propagate = original_propagate + def test_simple_domain_specific_sensitive_data(registry, caplog): """Test the basic functionality of domain-specific sensitive data replacement""" # Set log level to capture warnings import logging - caplog.set_level(logging.WARNING) + # Temporarily enable propagation for browser_use logger to capture logs + browser_use_logger = logging.getLogger('browser_use') + original_propagate = browser_use_logger.propagate + browser_use_logger.propagate = True + + caplog.set_level(logging.WARNING, logger='browser_use.controller.registry.service') # Create a simple Pydantic model with sensitive data placeholders params = SensitiveParams(text='Please enter username and password') @@ -115,6 +128,9 @@ def test_simple_domain_specific_sensitive_data(registry, caplog): assert 'password' in caplog.text # Only password should be logged as missing caplog.clear() + # Restore original propagate setting + browser_use_logger.propagate = original_propagate + def test_match_url_with_domain_pattern(): """Test that the domain pattern matching utility works correctly""" diff --git a/tests/ci/test_browser_session_cookies.py b/tests/ci/test_browser_session_cookies.py index d3da458e9..664fd9f65 100644 --- a/tests/ci/test_browser_session_cookies.py +++ b/tests/ci/test_browser_session_cookies.py @@ -62,7 +62,7 @@ class TestBrowserSessionCookies: @pytest.fixture async def browser_profile_with_cookies(self, temp_cookies_file): """Create a BrowserProfile with cookies_file set.""" - profile = BrowserProfile(headless=True, user_data_dir=None, 
cookies_file=str(temp_cookies_file)) + profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file=temp_cookies_file) yield profile @pytest.fixture @@ -158,7 +158,7 @@ class TestBrowserSessionCookies: async def test_nonexistent_cookies_file(self): """Test that browser starts normally when cookies_file doesn't exist.""" # Use a non-existent file path - profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file='/tmp/nonexistent_cookies.json') + profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file=Path('/tmp/nonexistent_cookies.json')) session = BrowserSession(browser_profile=profile) # Should start without errors @@ -176,7 +176,7 @@ class TestBrowserSessionCookies: invalid_file = tmp_path / 'invalid_cookies.json' invalid_file.write_text('not valid json') - profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file=str(invalid_file)) + profile = BrowserProfile(headless=True, user_data_dir=None, cookies_file=invalid_file) session = BrowserSession(browser_profile=profile) # Should start without errors (warning logged) @@ -194,7 +194,7 @@ class TestBrowserSessionCookies: profile = BrowserProfile( headless=True, user_data_dir=None, - cookies_file='./test_cookies.json', # Relative path + cookies_file=Path('./test_cookies.json'), # Relative path downloads_path=browser_profile_with_cookies.downloads_path, ) diff --git a/tests/ci/test_browser_session_downloads.py b/tests/ci/test_browser_session_downloads.py index a4fcf90b7..38125cac8 100644 --- a/tests/ci/test_browser_session_downloads.py +++ b/tests/ci/test_browser_session_downloads.py @@ -6,6 +6,7 @@ import time import pytest from browser_use.browser import BrowserSession +from browser_use.browser.profile import BrowserProfile @pytest.fixture(scope='function') @@ -37,9 +38,11 @@ async def test_download_detection_timing(test_server, tmp_path): # Test 1: With downloads_dir set (default behavior) browser_with_downloads = BrowserSession( - headless=True, - downloads_dir=str(tmp_path / 'downloads'), - user_data_dir=None, + browser_profile=BrowserProfile( + headless=True, + downloads_path=str(tmp_path / 'downloads'), + user_data_dir=None, + ) ) await browser_with_downloads.start() @@ -72,9 +75,11 @@ async def test_download_detection_timing(test_server, tmp_path): # Test 2: With downloads_dir set to empty string (disables download detection) browser_no_downloads = BrowserSession( - headless=True, - downloads_dir=None, - user_data_dir=None, + browser_profile=BrowserProfile( + headless=True, + downloads_path=None, + user_data_dir=None, + ) ) await browser_no_downloads.start() @@ -124,9 +129,11 @@ async def test_actual_download_detection(test_server, tmp_path): downloads_path.mkdir() browser_session = BrowserSession( - headless=True, - downloads_path=str(downloads_path), - user_data_dir=None, + browser_profile=BrowserProfile( + headless=True, + downloads_path=str(downloads_path), + user_data_dir=None, + ) ) await browser_session.start() diff --git a/tests/ci/test_browser_session_element_cache.py b/tests/ci/test_browser_session_element_cache.py index d0201f958..2e26f7bdb 100644 --- a/tests/ci/test_browser_session_element_cache.py +++ b/tests/ci/test_browser_session_element_cache.py @@ -3,11 +3,10 @@ Systematic debugging of the selector map issue. Test each assumption step by step to isolate the problem. 
""" -import os - import pytest from browser_use.browser import BrowserSession +from browser_use.browser.profile import BrowserProfile from browser_use.controller.service import Controller @@ -62,9 +61,10 @@ def httpserver(make_httpserver): async def browser_session(): """Create a real browser session for testing.""" session = BrowserSession( - executable_path=os.getenv('BROWSER_PATH'), - user_data_dir=None, # Use temporary profile - headless=True, + browser_profile=BrowserProfile( + user_data_dir=None, # Use temporary profile + headless=True, + ) ) async with session: yield session @@ -356,7 +356,9 @@ async def test_assumption_9_pydantic_private_attrs(browser_session, controller, # Check the browser_session that comes out of the model extracted_browser_session = special_params.browser_session print(f'5. Extracted browser_session ID: {id(extracted_browser_session)}') - print(f'6. Extracted browser_session cache: {extracted_browser_session._cached_browser_state_summary is not None}') + print( + f'6. Extracted browser_session cache: {extracted_browser_session._cached_browser_state_summary is not None if extracted_browser_session else False}' + ) # Check if they're the same object if id(browser_session) == id(extracted_browser_session): @@ -367,10 +369,10 @@ async def test_assumption_9_pydantic_private_attrs(browser_session, controller, # Check if private attributes were preserved print(f'7. Original has _cached_browser_state_summary attr: {hasattr(browser_session, "_cached_browser_state_summary")}') print( - f'8. Extracted has _cached_browser_state_summary attr: {hasattr(extracted_browser_session, "_cached_browser_state_summary")}' + f'8. Extracted has _cached_browser_state_summary attr: {hasattr(extracted_browser_session, "_cached_browser_state_summary") if extracted_browser_session else False}' ) - if hasattr(extracted_browser_session, '_cached_browser_state_summary'): + if extracted_browser_session and hasattr(extracted_browser_session, '_cached_browser_state_summary'): print(f'9. 
Extracted _cached_browser_state_summary value: {extracted_browser_session._cached_browser_state_summary}') @@ -401,8 +403,8 @@ async def test_assumption_7_cache_gets_cleared(browser_session, controller, http from browser_use import ActionResult cache_exists = browser_session._cached_browser_state_summary is not None - if cache_exists: - cache_size = len(browser_session._cached_browser_state_summary.selector_map) + if cache_exists and browser_session._cached_browser_state_summary: + cache_size = len(browser_session._cached_browser_state_summary.selector_map) # type: ignore else: cache_size = 0 return ActionResult( @@ -415,8 +417,8 @@ async def test_assumption_7_cache_gets_cleared(browser_session, controller, http from browser_use import ActionResult cache_exists = browser_session._cached_browser_state_summary is not None - if cache_exists: - cache_size = len(browser_session._cached_browser_state_summary.selector_map) + if cache_exists and browser_session._cached_browser_state_summary: + cache_size = len(browser_session._cached_browser_state_summary.selector_map) # type: ignore else: cache_size = 0 return ActionResult( diff --git a/tests/ci/test_browser_session_file_uploads.py b/tests/ci/test_browser_session_file_uploads.py index dfafcf329..4e6e2493f 100644 --- a/tests/ci/test_browser_session_file_uploads.py +++ b/tests/ci/test_browser_session_file_uploads.py @@ -11,6 +11,7 @@ Tests cover common real-world file upload patterns: import pytest from pytest_httpserver import HTTPServer +from browser_use.browser.profile import BrowserProfile from browser_use.browser.session import BrowserSession @@ -20,7 +21,7 @@ class TestBrowserSessionFileUploads: @pytest.fixture async def browser_session(self): """Create a BrowserSession instance for testing.""" - session = BrowserSession(headless=True, user_data_dir=None, keep_alive=True) + session = BrowserSession(browser_profile=BrowserProfile(headless=True, user_data_dir=None, keep_alive=True)) yield session await session.kill() diff --git a/tests/ci/test_browser_session_output_paths.py b/tests/ci/test_browser_session_output_paths.py index e6113f921..c2854e711 100644 --- a/tests/ci/test_browser_session_output_paths.py +++ b/tests/ci/test_browser_session_output_paths.py @@ -10,7 +10,7 @@ import pytest from browser_use import Agent, AgentHistoryList from browser_use.browser import BrowserProfile, BrowserSession -from tests.ci.mocks import create_mock_llm +from tests.ci.conftest import create_mock_llm @pytest.fixture @@ -194,6 +194,7 @@ class TestAgentRecordings: for gif in gif_files: gif.unlink() else: # custom_path + assert expected_gif_path is not None, 'expected_gif_path should be set for custom_path' assert expected_gif_path.exists(), f'GIF was not created at {expected_gif_path}' finally: await browser_session.stop() @@ -216,10 +217,10 @@ class TestBrowserProfileRecordings: video_dir = test_dir / f'videos_{context_type}_{alias}' user_data_dir = None if context_type == 'incognito' else str(test_dir / 'user_data') + # Create profile with dynamic alias + profile_kwargs = {'headless': True, 'disable_security': True, 'user_data_dir': user_data_dir, alias: str(video_dir)} browser_session = BrowserSession( - browser_profile=BrowserProfile( - headless=True, disable_security=True, user_data_dir=user_data_dir, **{alias: str(video_dir)} - ) + browser_profile=BrowserProfile(**profile_kwargs) # type: ignore ) await browser_session.start() try: @@ -258,7 +259,10 @@ class TestBrowserProfileRecordings: browser_session = BrowserSession( browser_profile=BrowserProfile( - 
headless=True, disable_security=True, user_data_dir=user_data_dir, **{alias: str(har_path)} + headless=True, + disable_security=True, + user_data_dir=user_data_dir, + **{alias: str(har_path)}, # type: ignore ) ) await browser_session.start() @@ -307,7 +311,7 @@ class TestBrowserProfileRecordings: if alias == 'trace_path': browser_session.browser_profile.traces_dir = str(trace_dir) else: - setattr(browser_session.browser_profile, alias, str(trace_dir)) + setattr(browser_session.browser_profile, alias, str(trace_dir)) # type: ignore await browser_session.start() try: diff --git a/tests/ci/test_browser_session_start.py b/tests/ci/test_browser_session_start.py index bec2a5192..92da57f5f 100644 --- a/tests/ci/test_browser_session_start.py +++ b/tests/ci/test_browser_session_start.py @@ -12,18 +12,19 @@ Tests cover: import asyncio import json import logging +import tempfile from pathlib import Path import pytest from browser_use.browser.profile import ( - BROWSERUSE_CHROMIUM_USER_DATA_DIR, BROWSERUSE_DEFAULT_CHANNEL, BrowserChannel, BrowserProfile, ) from browser_use.browser.session import BrowserSession -from tests.ci.mocks import create_mock_llm +from browser_use.config import CONFIG +from tests.ci.conftest import create_mock_llm # Set up test logging logger = logging.getLogger('browser_session_start_tests') @@ -485,21 +486,30 @@ class TestBrowserSessionStart: await session.stop() # Browser should still be connected assert session.initialized is True - assert session.browser is not None - assert session.browser.is_connected() + assert session.browser_context and session.browser_context.pages[0] finally: await session.kill() async def test_user_data_dir_not_allowed_to_corrupt_default_profile(self, caplog): """Test user_data_dir handling for different browser channels and version mismatches.""" + import logging + + # Temporarily enable propagation for browser_use logger to capture logs + browser_use_logger = logging.getLogger('browser_use') + original_propagate = browser_use_logger.propagate + browser_use_logger.propagate = True + + caplog.set_level(logging.WARNING, logger='browser_use.utils') # Test 1: Chromium with default user_data_dir and default channel should work fine session = BrowserSession( - headless=True, - user_data_dir=BROWSERUSE_CHROMIUM_USER_DATA_DIR, - channel=BROWSERUSE_DEFAULT_CHANNEL, # chromium - keep_alive=False, + browser_profile=BrowserProfile( + headless=True, + user_data_dir=CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR, + channel=BROWSERUSE_DEFAULT_CHANNEL, # chromium + keep_alive=False, + ), ) try: @@ -507,21 +517,21 @@ class TestBrowserSessionStart: assert session.initialized is True assert session.browser_context is not None # Verify the user_data_dir wasn't changed - assert session.browser_profile.user_data_dir == BROWSERUSE_CHROMIUM_USER_DATA_DIR + assert session.browser_profile.user_data_dir == CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR finally: await session.kill() # Test 2: Chrome with default user_data_dir should show warning and change dir profile2 = BrowserProfile( headless=True, - user_data_dir=BROWSERUSE_CHROMIUM_USER_DATA_DIR, + user_data_dir=CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR, channel=BrowserChannel.CHROME, keep_alive=False, ) # The validator should have changed the user_data_dir - assert profile2.user_data_dir != BROWSERUSE_CHROMIUM_USER_DATA_DIR - assert profile2.user_data_dir == BROWSERUSE_CHROMIUM_USER_DATA_DIR.parent / 'default-chrome' + assert profile2.user_data_dir != CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR + assert profile2.user_data_dir == 
CONFIG.BROWSER_USE_DEFAULT_USER_DATA_DIR.parent / 'default-chrome' # Check warning was logged warning_found = any( @@ -529,6 +539,9 @@ class TestBrowserSessionStart: ) assert warning_found, 'Expected warning about changing user_data_dir was not found' + # Restore original propagate setting + browser_use_logger.propagate = original_propagate + # only run if `/Applications/Brave Browser.app` is installed @pytest.mark.skipif( not Path('~/.config/browseruse/profiles/stealth').expanduser().exists(), reason='Brave Browser not installed' @@ -546,9 +559,11 @@ class TestBrowserSessionStart: # await brave_session.stop() chromium_session = BrowserSession( - headless=True, - user_data_dir='~/.config/browseruse/profiles/stealth', - channel=BrowserChannel.CHROMIUM, # should crash when opened with chromium + browser_profile=BrowserProfile( + headless=True, + user_data_dir='~/.config/browseruse/profiles/stealth', + channel=BrowserChannel.CHROMIUM, # should crash when opened with chromium + ), ) # open chrome with corrupted user_data_dir @@ -559,53 +574,6 @@ class TestBrowserSessionStart: class TestBrowserSessionReusePatterns: """Tests for all browser re-use patterns documented in docs/customize/real-browser.mdx""" - @pytest.fixture(scope='module') - def mock_llm(self): - """Mock LLM for agent tests""" - from unittest.mock import MagicMock - - from langchain_core.language_models.chat_models import BaseChatModel - - # Create a MagicMock that supports dictionary-style access - mock = MagicMock(spec=BaseChatModel) - - # Skip verification by setting these attributes - mock._verified_api_keys = True - mock._verified_tool_calling_method = 'raw' - mock.model_name = 'mock-llm' - - # Mock the invoke method to return a proper response - def mock_invoke(*args, **kwargs): - response = MagicMock() - # Return a valid JSON response that completes the task - response.content = """ - { - "thinking": "null", - "evaluation_previous_goal": "Starting the task", - "memory": "Task started", - "next_goal": "Complete the task", - "action": [ - { - "done": { - "text": "Task completed successfully", - "success": true - } - } - ] - } - """ - return response - - mock.invoke = mock_invoke - - # Create an async version of the mock_invoke - async def mock_ainvoke(*args, **kwargs): - return mock_invoke(*args, **kwargs) - - mock.ainvoke = mock_ainvoke - - return mock - async def test_sequential_agents_same_profile_different_browser(self, mock_llm): """Test Sequential Agents, Same Profile, Different Browser pattern""" from browser_use import Agent @@ -652,9 +620,11 @@ class TestBrowserSessionReusePatterns: # Create a reusable session with keep_alive reused_session = BrowserSession( - user_data_dir=None, # Use temp dir for testing - headless=True, - keep_alive=True, # Don't close browser after agent.run() + browser_profile=BrowserProfile( + user_data_dir=None, # Use temp dir for testing + headless=True, + keep_alive=True, # Don't close browser after agent.run() + ), ) try: @@ -695,7 +665,6 @@ class TestBrowserSessionReusePatterns: async def test_parallel_agents_same_browser_multiple_tabs(self, httpserver): """Test Parallel Agents, Same Browser, Multiple Tabs pattern""" - import tempfile from browser_use import Agent, BrowserSession @@ -711,10 +680,12 @@ class TestBrowserSessionReusePatterns: storage_state_path = Path(storage_state_path) shared_browser = BrowserSession( - storage_state=storage_state_path, - user_data_dir=None, - keep_alive=True, - headless=True, + browser_profile=BrowserProfile( + storage_state=storage_state_path, + 
user_data_dir=None, + keep_alive=True, + headless=True, + ), ) try: @@ -792,7 +763,7 @@ class TestBrowserSessionReusePatterns: ) # Run all agents in parallel - results = await asyncio.gather(agent1.run(), agent2.run(), agent3.run()) + _results = await asyncio.gather(agent1.run(), agent2.run(), agent3.run()) # Verify all agents used the same browser session (using __eq__ to check browser_pid, cdp_url, wss_url) # Debug: print the browser sessions to see what's different @@ -826,9 +797,11 @@ class TestBrowserSessionReusePatterns: # Create a browser session and start it first shared_browser = BrowserSession( - user_data_dir=None, - headless=True, - keep_alive=True, # Keep the browser alive for reuse + browser_profile=BrowserProfile( + user_data_dir=None, + headless=True, + keep_alive=True, # Keep the browser alive for reuse + ), ) try: @@ -857,7 +830,7 @@ class TestBrowserSessionReusePatterns: await page.goto(httpserver.url_for('/'), wait_until='domcontentloaded') # Run agents in parallel (may interfere with each other) - results = await asyncio.gather(agent1.run(), agent2.run(), return_exceptions=True) + _results = await asyncio.gather(agent1.run(), agent2.run(), return_exceptions=True) # Verify both agents used the same browser session assert agent1.browser_session == agent2.browser_session @@ -869,7 +842,6 @@ class TestBrowserSessionReusePatterns: async def test_parallel_agents_same_profile_different_browsers(self, mock_llm): """Test Parallel Agents, Same Profile, Different Browsers pattern (recommended)""" - import tempfile from browser_use import Agent from browser_use.browser import BrowserProfile, BrowserSession @@ -907,7 +879,7 @@ class TestBrowserSessionReusePatterns: ) # Run agents in parallel - results = await asyncio.gather(agent1.run(), agent2.run()) + _results = await asyncio.gather(agent1.run(), agent2.run()) # Verify different browser sessions were used assert agent1.browser_session is not agent2.browser_session @@ -933,3 +905,121 @@ class TestBrowserSessionReusePatterns: await window1.kill() await window2.kill() auth_json_path.unlink(missing_ok=True) + + async def test_browser_shutdown_isolated(self): + """Test that browser shutdown doesn't affect other browser_sessions""" + from browser_use import BrowserSession + + browser_session1 = BrowserSession( + browser_profile=BrowserProfile( + user_data_dir=None, + headless=True, + keep_alive=True, # Keep the browser alive for reuse + ), + ) + browser_session2 = BrowserSession( + browser_profile=BrowserProfile( + user_data_dir=None, + headless=True, + keep_alive=True, # Keep the browser alive for reuse + ), + ) + await browser_session1.start() + await browser_session2.start() + + assert browser_session1.is_connected() + assert browser_session2.is_connected() + assert browser_session1.browser_context != browser_session2.browser_context + + await browser_session1.create_new_tab('chrome://version') + await browser_session2.create_new_tab('chrome://settings') + + await browser_session2.kill() + + # ensure that the browser_session1 is still connected and unaffected by the kill of browser_session2 + assert browser_session1.is_connected() + assert browser_session1.browser_context is not None + await browser_session1.create_new_tab('chrome://settings') + await browser_session1.browser_context.pages[0].evaluate('alert(1)') + + await browser_session1.kill() + + async def test_many_parallel_browser_sessions(self): + """Test spawning 20 parallel browser_sessions with different settings and ensure they all work""" + from browser_use import 
BrowserSession + + browser_sessions = [] + + for i in range(5): + browser_sessions.append( + BrowserSession( + browser_profile=BrowserProfile( + user_data_dir=None, + headless=True, + keep_alive=True, + ), + ) + ) + for i in range(5): + browser_sessions.append( + BrowserSession( + browser_profile=BrowserProfile( + user_data_dir=Path(tempfile.mkdtemp(prefix=f'browseruse-tmp-{i}')), + headless=True, + keep_alive=True, + ), + ) + ) + for i in range(5): + browser_sessions.append( + BrowserSession( + browser_profile=BrowserProfile( + user_data_dir=None, + headless=True, + keep_alive=False, + ), + ) + ) + for i in range(5): + browser_sessions.append( + BrowserSession( + browser_profile=BrowserProfile( + user_data_dir=Path(tempfile.mkdtemp(prefix=f'browseruse-tmp-{i}')), + headless=True, + keep_alive=False, + ), + ) + ) + + await asyncio.gather(*[browser_session.start() for browser_session in browser_sessions]) + + # ensure all are connected and usable + new_tab_tasks = [] + for browser_session in browser_sessions: + assert await browser_session.is_connected() + assert browser_session.browser_context is not None + new_tab_tasks.append(browser_session.create_new_tab('chrome://version')) + await asyncio.gather(*new_tab_tasks) + + # kill every 3rd browser_session + kill_tasks = [] + for i in range(0, len(browser_sessions), 3): + kill_tasks.append(browser_sessions[i].kill()) + browser_sessions[i] = None + await asyncio.gather(*kill_tasks) + + # ensure the remaining browser_sessions are still connected and usable + new_tab_tasks = [] + screenshot_tasks = [] + for browser_session in filter(bool, browser_sessions): + assert await browser_session.is_connected() + assert browser_session.browser_context is not None + new_tab_tasks.append(browser_session.create_new_tab('chrome://version')) + screenshot_tasks.append(browser_session.take_screenshot()) + await asyncio.gather(*new_tab_tasks) + await asyncio.gather(*screenshot_tasks) + + kill_tasks = [] + for browser_session in filter(bool, browser_sessions): + kill_tasks.append(browser_session.kill()) + await asyncio.gather(*kill_tasks) diff --git a/tests/ci/test_browser_session_tab_management.py b/tests/ci/test_browser_session_tab_management.py index b721e89af..40e0d20dc 100644 --- a/tests/ci/test_browser_session_tab_management.py +++ b/tests/ci/test_browser_session_tab_management.py @@ -8,6 +8,7 @@ from pytest_httpserver import HTTPServer load_dotenv() from browser_use.agent.views import ActionModel +from browser_use.browser.profile import BrowserProfile from browser_use.browser.session import BrowserSession from browser_use.controller.service import Controller @@ -50,9 +51,11 @@ def base_url(http_server): async def browser_session(base_url): """Create and provide a BrowserSession instance with a properly initialized tab.""" browser_session = BrowserSession( - user_data_dir=None, - headless=True, - keep_alive=True, + browser_profile=BrowserProfile( + user_data_dir=None, + headless=True, + keep_alive=True, + ) ) await browser_session.start() @@ -110,8 +113,9 @@ class TestTabManagement: browser_session.agent_current_page = None # close all existing tabs - for page in browser_session.browser_context.pages: - await page.close() + if browser_session.browser_context: + for page in browser_session.browser_context.pages: # type: ignore + await page.close() await asyncio.sleep(0.5) @@ -327,22 +331,22 @@ class TestTabManagement: assert browser_session.browser_context is not None assert browser_session.browser_context != original_context assert 
browser_session.initialized is True - assert browser_session.is_connected() is True + assert (await browser_session.is_connected()) is True async def test_concurrent_context_access_during_closure(self, browser_session): """Test concurrent access to browser context during closure""" # logger.info('Testing concurrent context access during closure') await browser_session.start() - assert browser_session.is_connected() is True + assert (await browser_session.is_connected()) is True # Create a barrier to synchronize operations barrier = asyncio.Barrier(3) async def close_context(): await barrier.wait() - await browser_session.browser_context.browser.close() - assert browser_session.is_connected() is False + await browser_session.browser_context.close() + assert (await browser_session.is_connected()) is False return 'closed' async def access_pages(): @@ -356,14 +360,14 @@ class TestTabManagement: async def check_connection(): await barrier.wait() await asyncio.sleep(0.01) # Small delay to let close start - connected = browser_session.is_connected() + connected = await browser_session.is_connected() return f'connected: {connected}' # Run all operations concurrently - results = await asyncio.gather(close_context(), access_pages(), check_connection(), return_exceptions=True) + results = list(await asyncio.gather(close_context(), access_pages(), check_connection(), return_exceptions=True)) # All operations should complete without crashes - assert all(not isinstance(r, Exception) for r in results) + assert results and all(not isinstance(r, Exception) for r in results) assert 'closed' in results await browser_session.kill() diff --git a/tests/ci/test_browser_session_via_cdp.py b/tests/ci/test_browser_session_via_cdp.py index 81d4ffa7f..f133a99d1 100644 --- a/tests/ci/test_browser_session_via_cdp.py +++ b/tests/ci/test_browser_session_via_cdp.py @@ -1,14 +1,17 @@ import pytest from browser_use.browser import BrowserSession +from browser_use.browser.profile import BrowserProfile from browser_use.browser.types import async_playwright async def test_connection_via_cdp(): browser_session = BrowserSession( cdp_url='http://localhost:9898', - headless=True, - keep_alive=True, + browser_profile=BrowserProfile( + headless=True, + keep_alive=True, + ), ) with pytest.raises(Exception) as e: await browser_session.start() diff --git a/tests/ci/test_browser_session_viewport_and_proxy.py b/tests/ci/test_browser_session_viewport_and_proxy.py index 8a99ba6fe..bdbfc9690 100644 --- a/tests/ci/test_browser_session_viewport_and_proxy.py +++ b/tests/ci/test_browser_session_viewport_and_proxy.py @@ -1,5 +1,5 @@ from browser_use.browser import BrowserSession -from browser_use.browser.profile import ProxySettings +from browser_use.browser.profile import BrowserProfile, ProxySettings async def test_proxy_settings_pydantic_model(): @@ -33,11 +33,13 @@ async def test_window_size_with_real_browser(): """ # Create browser profile with headless mode and specific dimensions browser_session = BrowserSession( - user_data_dir=None, - headless=True, # window size gets converted to viewport size in headless mode - window_size={'width': 999, 'height': 888}, - maximum_wait_page_load_time=2.0, - minimum_wait_page_load_time=0.2, + browser_profile=BrowserProfile( + user_data_dir=None, + headless=True, # window size gets converted to viewport size in headless mode + window_size={'width': 999, 'height': 888}, + maximum_wait_page_load_time=2.0, + minimum_wait_page_load_time=0.2, + ) ) await browser_session.start() page = await 
browser_session.get_current_page() @@ -117,9 +119,11 @@ async def test_proxy_with_real_browser(): # Create browser session browser_session = BrowserSession( - headless=True, - proxy=proxy_settings, - user_data_dir=None, + browser_profile=BrowserProfile( + headless=True, + proxy=proxy_settings, + user_data_dir=None, + ) ) await browser_session.start() # Success - the browser was initialized with our proxy settings diff --git a/tests/ci/test_config.py b/tests/ci/test_config.py new file mode 100644 index 000000000..4d8f3be11 --- /dev/null +++ b/tests/ci/test_config.py @@ -0,0 +1,120 @@ +"""Tests for lazy loading configuration system.""" + +import os + +from browser_use.config import CONFIG + + +class TestLazyConfig: + """Test lazy loading of environment variables through CONFIG object.""" + + def test_config_reads_env_vars_lazily(self): + """Test that CONFIG reads environment variables each time they're accessed.""" + # Set an env var + original_value = os.environ.get('BROWSER_USE_LOGGING_LEVEL', '') + try: + os.environ['BROWSER_USE_LOGGING_LEVEL'] = 'debug' + assert CONFIG.BROWSER_USE_LOGGING_LEVEL == 'debug' + + # Change the env var + os.environ['BROWSER_USE_LOGGING_LEVEL'] = 'info' + assert CONFIG.BROWSER_USE_LOGGING_LEVEL == 'info' + + # Delete the env var to test default + del os.environ['BROWSER_USE_LOGGING_LEVEL'] + assert CONFIG.BROWSER_USE_LOGGING_LEVEL == 'info' # default value + finally: + # Restore original value + if original_value: + os.environ['BROWSER_USE_LOGGING_LEVEL'] = original_value + else: + os.environ.pop('BROWSER_USE_LOGGING_LEVEL', None) + + def test_boolean_env_vars(self): + """Test boolean environment variables are parsed correctly.""" + original_value = os.environ.get('ANONYMIZED_TELEMETRY', '') + try: + # Test true values + for true_val in ['true', 'True', 'TRUE', 'yes', 'Yes', '1']: + os.environ['ANONYMIZED_TELEMETRY'] = true_val + assert CONFIG.ANONYMIZED_TELEMETRY is True, f'Failed for value: {true_val}' + + # Test false values + for false_val in ['false', 'False', 'FALSE', 'no', 'No', '0']: + os.environ['ANONYMIZED_TELEMETRY'] = false_val + assert CONFIG.ANONYMIZED_TELEMETRY is False, f'Failed for value: {false_val}' + finally: + if original_value: + os.environ['ANONYMIZED_TELEMETRY'] = original_value + else: + os.environ.pop('ANONYMIZED_TELEMETRY', None) + + def test_api_keys_lazy_loading(self): + """Test API keys are loaded lazily.""" + original_value = os.environ.get('OPENAI_API_KEY', '') + try: + # Test empty default + os.environ.pop('OPENAI_API_KEY', None) + assert CONFIG.OPENAI_API_KEY == '' + + # Set a value + os.environ['OPENAI_API_KEY'] = 'test-key-123' + assert CONFIG.OPENAI_API_KEY == 'test-key-123' + + # Change the value + os.environ['OPENAI_API_KEY'] = 'new-key-456' + assert CONFIG.OPENAI_API_KEY == 'new-key-456' + finally: + if original_value: + os.environ['OPENAI_API_KEY'] = original_value + else: + os.environ.pop('OPENAI_API_KEY', None) + + def test_path_configuration(self): + """Test path configuration variables.""" + original_value = os.environ.get('XDG_CACHE_HOME', '') + try: + # Test custom path + test_path = '/tmp/test-cache' + os.environ['XDG_CACHE_HOME'] = test_path + # Use Path().resolve() to handle symlinks (e.g., /tmp -> /private/tmp on macOS) + from pathlib import Path + + assert CONFIG.XDG_CACHE_HOME == Path(test_path).resolve() + + # Test default path expansion + os.environ.pop('XDG_CACHE_HOME', None) + assert '/.cache' in str(CONFIG.XDG_CACHE_HOME) + finally: + if original_value: + os.environ['XDG_CACHE_HOME'] = original_value + 
else: + os.environ.pop('XDG_CACHE_HOME', None) + + def test_cloud_sync_inherits_telemetry(self): + """Test BROWSER_USE_CLOUD_SYNC inherits from ANONYMIZED_TELEMETRY when not set.""" + telemetry_original = os.environ.get('ANONYMIZED_TELEMETRY', '') + sync_original = os.environ.get('BROWSER_USE_CLOUD_SYNC', '') + try: + # When BROWSER_USE_CLOUD_SYNC is not set, it should inherit from ANONYMIZED_TELEMETRY + os.environ['ANONYMIZED_TELEMETRY'] = 'true' + os.environ.pop('BROWSER_USE_CLOUD_SYNC', None) + assert CONFIG.BROWSER_USE_CLOUD_SYNC is True + + os.environ['ANONYMIZED_TELEMETRY'] = 'false' + os.environ.pop('BROWSER_USE_CLOUD_SYNC', None) + assert CONFIG.BROWSER_USE_CLOUD_SYNC is False + + # When explicitly set, it should use its own value + os.environ['ANONYMIZED_TELEMETRY'] = 'false' + os.environ['BROWSER_USE_CLOUD_SYNC'] = 'true' + assert CONFIG.BROWSER_USE_CLOUD_SYNC is True + finally: + if telemetry_original: + os.environ['ANONYMIZED_TELEMETRY'] = telemetry_original + else: + os.environ.pop('ANONYMIZED_TELEMETRY', None) + if sync_original: + os.environ['BROWSER_USE_CLOUD_SYNC'] = sync_original + else: + os.environ.pop('BROWSER_USE_CLOUD_SYNC', None) diff --git a/tests/ci/test_controller.py b/tests/ci/test_controller.py index d9af6ef4b..605df59a7 100644 --- a/tests/ci/test_controller.py +++ b/tests/ci/test_controller.py @@ -8,6 +8,7 @@ from pytest_httpserver import HTTPServer from browser_use.agent.views import ActionModel, ActionResult from browser_use.browser import BrowserSession +from browser_use.browser.profile import BrowserProfile from browser_use.controller.service import Controller from browser_use.controller.views import ( ClickElementAction, @@ -79,9 +80,10 @@ def base_url(http_server): async def browser_session(): """Create and provide a Browser instance with security disabled.""" browser_session = BrowserSession( - # browser_profile=BrowserProfile(), - headless=True, - user_data_dir=None, + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, + ) ) await browser_session.start() yield browser_session @@ -113,6 +115,7 @@ class TestControllerIntegration: # Verify the result assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert f'Navigated to {base_url}/page1' in result.extracted_content # Verify the current page URL @@ -140,6 +143,7 @@ class TestControllerIntegration: # Verify the result assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Scrolled down' in result.extracted_content # Create scroll up action @@ -153,6 +157,7 @@ class TestControllerIntegration: # Verify the result assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Scrolled up' in result.extracted_content async def test_registry_actions(self, controller, browser_session): @@ -208,6 +213,7 @@ class TestControllerIntegration: # Verify the result assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Custom action executed with: test_value on' in result.extracted_content assert f'{base_url}/page1' in result.extracted_content @@ -262,6 +268,7 @@ class TestControllerIntegration: result = await controller.act(InputTextActionModel(**input_action), browser_session) # If successful, verify the result assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Input' in result.extracted_content except Exception as e: # If it fails due to DOM issues, that's expected in a test environment @@ -353,6 +360,7 @@ class 
TestControllerIntegration: # Verify the result assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Navigated back' in result.extracted_content # Add another delay to allow the navigation to complete @@ -475,6 +483,7 @@ class TestControllerIntegration: # Verify the result assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Searched for "Python web automation" in Google' in result.extracted_content # For our test purposes, we just verify we're on some URL @@ -508,6 +517,7 @@ class TestControllerIntegration: # Verify the result assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert success_done_message in result.extracted_content assert result.success is True assert result.is_done is True @@ -523,6 +533,7 @@ class TestControllerIntegration: # Verify the result assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert failed_done_message in result.extracted_content assert result.success is False assert result.is_done is True @@ -718,6 +729,10 @@ class TestControllerIntegration: drag_action = { 'drag_drop': DragDropAction( # Use the coordinate-based approach + element_source=None, + element_target=None, + element_source_offset=None, + element_target_offset=None, coord_source_x=element_info['source']['x'], coord_source_y=element_info['source']['y'], coord_target_x=element_info['target']['x'], @@ -736,6 +751,7 @@ class TestControllerIntegration: # Step 5: Verify the controller action result assert result.error is None, f'Drag operation failed with error: {result.error}' assert result.is_done is False + assert result.extracted_content is not None assert '🖱️ Dragged from' in result.extracted_content # Step 6: Verify the element was moved by checking its new parent @@ -827,7 +843,8 @@ class TestControllerIntegration: # Verify navigation result assert isinstance(goto_result, ActionResult) - assert f'Navigated to {base_url}/keyboard' in goto_result.extracted_content + assert goto_result.extracted_content is not None + assert goto_result.extracted_content is not None and f'Navigated to {base_url}/keyboard' in goto_result.extracted_content assert goto_result.error is None assert goto_result.is_done is False @@ -853,7 +870,8 @@ class TestControllerIntegration: # Verify Tab action result assert isinstance(tab_result, ActionResult) - assert 'Sent keys: Tab' in tab_result.extracted_content + assert tab_result.extracted_content is not None + assert tab_result.extracted_content is not None and 'Sent keys: Tab' in tab_result.extracted_content assert tab_result.error is None assert tab_result.is_done is False @@ -873,7 +891,8 @@ class TestControllerIntegration: # Verify typing action result assert isinstance(type_result, ActionResult) - assert f'Sent keys: {test_text}' in type_result.extracted_content + assert type_result.extracted_content is not None + assert type_result.extracted_content is not None and f'Sent keys: {test_text}' in type_result.extracted_content assert type_result.error is None assert type_result.is_done is False @@ -894,7 +913,11 @@ class TestControllerIntegration: # Verify select all action result assert isinstance(select_all_result, ActionResult) - assert 'Sent keys: ControlOrMeta+a' in select_all_result.extracted_content + assert select_all_result.extracted_content is not None + assert ( + select_all_result.extracted_content is not None + and 'Sent keys: ControlOrMeta+a' in select_all_result.extracted_content + ) assert 
select_all_result.error is None # Verify selection length matches the text length @@ -915,7 +938,8 @@ class TestControllerIntegration: # Verify second Tab action result assert isinstance(tab_result2, ActionResult) - assert 'Sent keys: Tab' in tab_result2.extracted_content + assert tab_result2.extracted_content is not None + assert tab_result2.extracted_content is not None and 'Sent keys: Tab' in tab_result2.extracted_content assert tab_result2.error is None # Verify we moved to the textarea @@ -933,7 +957,10 @@ class TestControllerIntegration: # Verify textarea typing action result assert isinstance(textarea_result, ActionResult) - assert f'Sent keys: {textarea_text}' in textarea_result.extracted_content + assert textarea_result.extracted_content is not None + assert ( + textarea_result.extracted_content is not None and f'Sent keys: {textarea_text}' in textarea_result.extracted_content + ) assert textarea_result.error is None assert textarea_result.is_done is False @@ -1038,6 +1065,7 @@ class TestControllerIntegration: assert isinstance(result, ActionResult) # Core logic validation: Verify all options are returned + assert result.extracted_content is not None for option in expected_options[1:]: # Skip the placeholder option assert option['text'] in result.extracted_content, f"Option '{option['text']}' not found in result content" @@ -1135,6 +1163,7 @@ class TestControllerIntegration: assert isinstance(result, ActionResult) # Core logic validation: Verify selection was successful + assert result.extracted_content is not None assert 'selected option' in result.extracted_content.lower() assert 'Second Option' in result.extracted_content @@ -1223,26 +1252,32 @@ class TestControllerIntegration: expected_result_text = 'Button 1 clicked' # Verify the button text matches what we expect - assert expected_button_text in button_text, f"Expected button text '{expected_button_text}' not found in '{button_text}'" + assert button_text is not None and expected_button_text in button_text, ( + f"Expected button text '{expected_button_text}' not found in '{button_text}'" + ) # Create a model for the click_element_by_index action class ClickElementActionModel(ActionModel): click_element_by_index: ClickElementAction | None = None # Execute the action with the button index - result = await controller.act(ClickElementActionModel(click_element_by_index={'index': button_index}), browser_session) + result = await controller.act( + ClickElementActionModel(click_element_by_index=ClickElementAction(index=button_index)), browser_session + ) # Verify the result structure assert isinstance(result, ActionResult), 'Result should be an ActionResult instance' assert result.error is None, f'Expected no error but got: {result.error}' # Core logic validation: Verify click was successful + assert result.extracted_content is not None assert f'Clicked button with index {button_index}' in result.extracted_content, ( f'Expected click confirmation in result content, got: {result.extracted_content}' ) - assert button_text in result.extracted_content, ( - f"Button text '{button_text}' not found in result content: {result.extracted_content}" - ) + if button_text: + assert result.extracted_content is not None and button_text in result.extracted_content, ( + f"Button text '{button_text}' not found in result content: {result.extracted_content}" + ) # Verify the click actually had an effect on the page result_text = await page.evaluate("document.getElementById('result').textContent") diff --git a/tests/ci/test_registry.py 
b/tests/ci/test_registry.py index 46c9bbd2c..04fb24729 100644 --- a/tests/ci/test_registry.py +++ b/tests/ci/test_registry.py @@ -20,6 +20,7 @@ from pytest_httpserver.httpserver import HandlerType from browser_use.agent.views import ActionResult from browser_use.browser import BrowserSession +from browser_use.browser.profile import BrowserProfile from browser_use.browser.types import Page from browser_use.controller.registry.service import Registry from browser_use.controller.registry.views import ActionModel as BaseActionModel @@ -29,7 +30,7 @@ from browser_use.controller.views import ( NoParamsAction, SearchGoogleAction, ) -from tests.ci.mocks import create_mock_llm +from tests.ci.conftest import create_mock_llm # Configure logging logging.basicConfig(level=logging.DEBUG) @@ -96,8 +97,10 @@ def registry(): async def browser_session(base_url): """Create a real BrowserSession for testing""" browser_session = BrowserSession( - headless=True, - user_data_dir=None, + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, + ) ) await browser_session.start() await browser_session.create_new_tab(f'{base_url}/test') @@ -119,6 +122,7 @@ class TestActionRegistryParameterPatterns: result = await registry.execute_action('simple_action', {'text': 'hello', 'number': 42}) assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Text: hello, Number: 42' in result.extracted_content async def test_individual_parameters_with_browser(self, registry, browser_session, base_url): @@ -136,6 +140,7 @@ class TestActionRegistryParameterPatterns: result = await registry.execute_action('action_with_browser', {'text': 'hello'}, browser_session=browser_session) assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Text: hello, URL:' in result.extracted_content assert base_url in result.extracted_content @@ -154,6 +159,7 @@ class TestActionRegistryParameterPatterns: result = await registry.execute_action('action_with_page', {'text': 'hello'}, browser_session=browser_session) assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Text: hello, Page Title: Test Page' in result.extracted_content async def test_pydantic_model_with_page_parameter(self, registry, browser_session, base_url): @@ -173,6 +179,7 @@ class TestActionRegistryParameterPatterns: ) assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Text: test, Number: 100, Page Title: Test Page' in result.extracted_content async def test_pydantic_model_parameters(self, registry, browser_session, base_url): @@ -194,6 +201,7 @@ class TestActionRegistryParameterPatterns: ) assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Text: test, Number: 100, Flag: True' in result.extracted_content assert base_url in result.extracted_content @@ -229,6 +237,7 @@ class TestActionRegistryParameterPatterns: ) assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Text: hello' in result.extracted_content assert base_url in result.extracted_content # The mock LLM returns a JSON response @@ -248,6 +257,7 @@ class TestActionRegistryParameterPatterns: ) assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'No params action executed on' in result.extracted_content assert '/test' in result.extracted_content @@ -266,11 +276,13 @@ class TestActionRegistryParameterPatterns: # Test legacy browser parameter 
result1 = await registry.execute_action('legacy_browser_action', {'text': 'test1'}, browser_session=browser_session) + assert result1.extracted_content is not None assert 'Legacy browser: test1, URL:' in result1.extracted_content assert '/test' in result1.extracted_content # Test legacy browser_context parameter result2 = await registry.execute_action('legacy_context_action', {'text': 'test2'}, browser_session=browser_session) + assert result2.extracted_content is not None assert 'Legacy context: test2, URL:' in result2.extracted_content assert '/test' in result2.extracted_content @@ -296,11 +308,13 @@ class TestActionRegistryParameterPatterns: # Test direct page parameter result1 = await registry.execute_action('direct_page_action', {'text': 'optimized'}, browser_session=browser_session) + assert result1.extracted_content is not None assert 'Direct page: optimized, URL:' in result1.extracted_content assert '/test' in result1.extracted_content # Test browser_session parameter (should still work) result2 = await registry.execute_action('browser_session_action', {'text': 'legacy'}, browser_session=browser_session) + assert result2.extracted_content is not None assert 'Browser session: legacy, URL:' in result2.extracted_content assert '/test' in result2.extracted_content @@ -313,6 +327,7 @@ class TestActionRegistryParameterPatterns: return ActionResult(extracted_content=f'Pydantic page: {params.message}, URL: {page.url}') result3 = await registry.execute_action('pydantic_page_action', {'message': 'pydantic'}, browser_session=browser_session) + assert result3.extracted_content is not None assert 'Pydantic page: pydantic, URL:' in result3.extracted_content assert '/test' in result3.extracted_content @@ -346,6 +361,7 @@ class TestActionToActionCalling: result = await registry.execute_action('calling_action', {'message': 'test'}, browser_session=browser_session) assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Called result: First: Helper processed: test on' in result.extracted_content assert '/test' in result.extracted_content @@ -373,13 +389,14 @@ class TestActionToActionCalling: # Get the action's param model to call it properly action = registry.registry.actions['select_cell_or_range_fixed'] params = action.param_model(cell_or_range=range_name) - await select_cell_or_range_fixed(params=params, browser_session=browser_session) + await select_cell_or_range_fixed(cell_or_range=range_name, browser_session=browser_session) return ActionResult(extracted_content=f'Updated range {range_name} with {new_contents}') # Test the fixed version (should work) result_fixed = await registry.execute_action( 'select_cell_or_range_fixed', {'cell_or_range': 'A1:F100'}, browser_session=browser_session ) + assert result_fixed.extracted_content is not None assert 'Selected cell A1:F100 on' in result_fixed.extracted_content assert '/test' in result_fixed.extracted_content @@ -387,6 +404,7 @@ class TestActionToActionCalling: result_chain = await registry.execute_action( 'update_range_contents', {'range_name': 'B2:D4', 'new_contents': 'test data'}, browser_session=browser_session ) + assert result_chain.extracted_content is not None assert 'Updated range B2:D4 with test data' in result_chain.extracted_content # Test the problematic version (should work with enhanced registry) @@ -394,6 +412,7 @@ class TestActionToActionCalling: 'select_cell_or_range', {'cell_or_range': 'A1:F100'}, browser_session=browser_session ) # With the enhanced registry, this should succeed + assert 
result_problematic.extracted_content is not None assert 'Selected cell A1:F100 on' in result_problematic.extracted_content assert '/test' in result_problematic.extracted_content @@ -425,6 +444,7 @@ class TestActionToActionCalling: result = await registry.execute_action('top_action', {'original': 'test'}, browser_session=browser_session) assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Top: Middle: Base: processed-enhanced-test on' in result.extracted_content assert '/test' in result.extracted_content @@ -444,11 +464,12 @@ class TestRegistryEdgeCases: with pytest.raises( TypeError, match='test_action\\(\\) does not accept positional arguments, only keyword arguments are allowed' ): - await test_action(browser_session, 'A1:B2') + await test_action('A1:B2', browser_session) # Test that calling with keyword arguments works result = await test_action(browser_session=browser_session, cell_or_range='A1:B2') assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Selected cell A1:B2 on' in result.extracted_content async def test_missing_required_browser_session(self, registry): @@ -520,6 +541,7 @@ class TestRegistryEdgeCases: result = await registry.execute_action('sync_action', {'text': 'test'}, browser_session=browser_session) assert isinstance(result, ActionResult) + assert result.extracted_content is not None assert 'Sync: test' in result.extracted_content async def test_excluded_actions(self, browser_session): @@ -545,6 +567,7 @@ class TestRegistryEdgeCases: # Included action should work result = await registry_with_exclusions.execute_action('included_action', {'text': 'test'}) + assert result.extracted_content is not None assert 'Should execute: test' in result.extracted_content @@ -568,14 +591,17 @@ class TestExistingControllerActions: # Test SearchGoogleAction result1 = await registry.execute_action('test_search', {'query': 'python testing'}, browser_session=browser_session) + assert result1.extracted_content is not None assert 'Searched for: python testing' in result1.extracted_content # Test ClickElementAction result2 = await registry.execute_action('test_click', {'index': 42}, browser_session=browser_session) + assert result2.extracted_content is not None assert 'Clicked element: 42' in result2.extracted_content # Test InputTextAction result3 = await registry.execute_action('test_input', {'index': 5, 'text': 'test input'}, browser_session=browser_session) + assert result3.extracted_content is not None assert 'Input text: test input at index: 5' in result3.extracted_content async def test_pydantic_vs_individual_params_consistency(self, registry, browser_session): @@ -603,7 +629,9 @@ class TestExistingControllerActions: result2 = await registry.execute_action('pydantic_params_action', test_data, browser_session=browser_session) # Both should extract the same content (just different prefixes) + assert result1.extracted_content is not None assert 'hello-42' in result1.extracted_content + assert result2.extracted_content is not None assert 'hello-42' in result2.extracted_content assert 'Individual:' in result1.extracted_content assert 'Pydantic:' in result2.extracted_content @@ -683,7 +711,7 @@ class TestType2Pattern: registry = Registry() @registry.action('Scroll page') - async def scroll_page(direction: str = 'down', amount: int = 100, browser_session: BrowserSession = None): + async def scroll_page(direction: str = 'down', amount: int = 100, browser_session: BrowserSession = None): # type: ignore return 
ActionResult(extracted_content=f'Scrolled {direction} by {amount}') action = registry.registry.actions['scroll_page'] @@ -847,7 +875,11 @@ class TestParamsModelGeneration: @registry.action('Complex action') async def complex_action( - query: str, max_results: int, include_images: bool = True, page: Page = None, browser_session: BrowserSession = None + query: str, + max_results: int, + include_images: bool = True, + page: Page = None, # type: ignore + browser_session: BrowserSession = None, # type: ignore ): return ActionResult() @@ -869,7 +901,11 @@ class TestParamsModelGeneration: @registry.action('Typed action') async def typed_action( - count: int, rate: float, enabled: bool, name: str | None = None, browser_session: BrowserSession = None + count: int, + rate: float, + enabled: bool, + name: str | None = None, + browser_session: BrowserSession = None, # type: ignore ): return ActionResult() @@ -938,7 +974,7 @@ class TestParameterOrdering: second: int, page: Page, third: bool = True, - page_extraction_llm: BaseChatModel = None, + page_extraction_llm: BaseChatModel = None, # type: ignore ): return ActionResult() @@ -1006,6 +1042,7 @@ class TestParameterOrdering: # Should retry once and succeed result = await registry.execute_action('flaky_action', {'value': 'test'}, browser_session=browser_session) + assert result.extracted_content is not None assert 'Success on attempt 2' in result.extracted_content assert call_count == 2 @@ -1070,7 +1107,7 @@ class TestParamsModelArgsAndKwargs: # Model that includes browser_session class ModelWithBrowser(ActionModel): value: str = Field(description='Test value') - browser_session: BrowserSession = None + browser_session: BrowserSession = None # type: ignore # Create a custom param model for select_cell_or_range class CellRangeParams(ActionModel): @@ -1136,7 +1173,9 @@ class TestParamsModelArgsAndKwargs: # logger.info('\n--- Testing original problematic version ---') try: result1 = await registry.execute_action( - 'select_cell_or_range', {'cell_or_range': 'A1:F100'}, browser_session=browser_session + 'select_cell_or_range', + {'cell_or_range': 'A1:F100'}, + browser_session=browser_session, # type: ignore ) # logger.info(f'Success! Result: {result1}') except Exception as e: @@ -1146,7 +1185,9 @@ class TestParamsModelArgsAndKwargs: # logger.info('\n--- Testing fixed version (positional args) ---') try: result2 = await registry.execute_action( - 'select_cell_or_range_fixed', {'cell_or_range': 'A1:F100'}, browser_session=browser_session + 'select_cell_or_range_fixed', + {'cell_or_range': 'A1:F100'}, + browser_session=browser_session, # type: ignore ) # logger.info(f'Success! Result: {result2}') except Exception as e: @@ -1156,7 +1197,9 @@ class TestParamsModelArgsAndKwargs: # logger.info('\n--- Testing kwargs simulation version ---') try: result3 = await registry.execute_action( - 'select_with_kwargs', {'cell_or_range': 'A1:F100'}, browser_session=browser_session + 'select_with_kwargs', + {'cell_or_range': 'A1:F100'}, + browser_session=browser_session, # type: ignore ) # logger.info(f'Success! Result: {result3}') except Exception as e: diff --git a/tests/ci/test_sync_agent_events.py b/tests/ci/test_sync_agent_events.py index 2dff3c116..61790958a 100644 --- a/tests/ci/test_sync_agent_events.py +++ b/tests/ci/test_sync_agent_events.py @@ -5,20 +5,18 @@ Tests the most critical event flows without excessive duplication. 
""" import base64 -import json import os -from unittest.mock import AsyncMock, MagicMock, Mock, patch +from unittest.mock import patch from uuid import UUID import pytest from dotenv import load_dotenv -# Load environment variables before any imports load_dotenv() -from langchain_core.language_models import BaseChatModel -from langchain_core.messages import AIMessage -from pytest_httpserver import HTTPServer +from bubus import BaseEvent + +from browser_use import Agent from browser_use.agent.cloud_events import ( MAX_TASK_LENGTH, CreateAgentOutputFileEvent, @@ -27,126 +25,14 @@ from browser_use.agent.cloud_events import ( CreateAgentTaskEvent, UpdateAgentTaskEvent, ) - -# Skip LLM API key verification for tests -os.environ['SKIP_LLM_API_KEY_VERIFICATION'] = 'true' - -from bubus import BaseEvent - -from browser_use import Agent -from browser_use.browser import BrowserSession -from browser_use.sync.service import CloudSync -from tests.ci.mocks import create_mock_llm - - -@pytest.fixture -async def browser_session(): - """Create a real browser session for testing""" - session = BrowserSession( - headless=True, - user_data_dir=None, # Use temporary directory - ) - yield session - await session.stop() - - -@pytest.fixture -def mock_llm(): - """Create a mock LLM that immediately returns done action""" - llm = MagicMock(spec=BaseChatModel) - - # Create the JSON response that the agent would parse - json_response = { - 'thinking': 'null', - 'evaluation_previous_goal': 'Starting task', - 'memory': 'New task to complete', - 'next_goal': 'Complete the test task', - 'action': [{'done': {'success': True, 'text': 'Test completed successfully'}}], - } - - # Create a mock response with the JSON - mock_response = AIMessage(content=json.dumps(json_response)) - - # Make the LLM return our mock response - llm.invoke = lambda *args, **kwargs: mock_response - llm.ainvoke = AsyncMock(return_value=mock_response) - - # Mock the with_structured_output method to return parsed objects - structured_llm = MagicMock() - - async def mock_structured_ainvoke(*args, **kwargs): - # The agent will create its own AgentOutput and ActionModel classes - # We return the raw response and let the agent parse it - return { - 'raw': mock_response, - 'parsed': None, # Let the agent parse it from the raw JSON - } - - structured_llm.ainvoke = AsyncMock(side_effect=mock_structured_ainvoke) - llm.with_structured_output = lambda *args, **kwargs: structured_llm - - # Set attributes that agent checks - llm.model_name = 'gpt-4o' - llm._verified_api_keys = True - llm._verified_tool_calling_method = 'function_calling' - - return llm - - -@pytest.fixture -def event_collector(): - """Collect all events emitted during tests""" - events = [] - event_order = [] - - class EventCollector: - def __init__(self): - self.events = events - self.event_order = event_order - - async def collect_event(self, event: BaseEvent): - self.events.append(event) - self.event_order.append(event.event_type) - return 'collected' - - def get_events_by_type(self, event_type: str) -> list[BaseEvent]: - return [e for e in self.events if e.event_type == event_type] - - def clear(self): - self.events.clear() - self.event_order.clear() - - return EventCollector() - - -@pytest.fixture -def mock_cloud_sync(): - """Create mocked cloud sync service.""" - sync = Mock(spec=CloudSync) - sync.send_event = AsyncMock() - sync.authenticate = AsyncMock(return_value=True) - sync._authenticated = True - sync.handle_event = AsyncMock() - return sync - - -@pytest.fixture -def 
agent_with_cloud(browser_session, mock_cloud_sync): - """Create agent with cloud sync enabled.""" - with patch('browser_use.sync.CloudSync', return_value=mock_cloud_sync): - with patch.dict(os.environ, {'BROWSERUSE_CLOUD_SYNC': 'true'}): - agent = Agent( - task='Test task', - llm=create_mock_llm(), - browser_session=browser_session, - ) - return agent +from tests.ci.conftest import create_mock_llm class TestAgentEventLifecycle: """Test critical agent event flows with minimal duplication""" - async def test_agent_lifecycle_events(self, mock_llm, browser_session, event_collector, httpserver: HTTPServer): + @pytest.mark.usefixtures('mock_llm', 'browser_session', 'event_collector', 'httpserver') + async def test_agent_lifecycle_events(self, mock_llm, browser_session, event_collector, httpserver): """Test that all events are emitted in the correct order during agent lifecycle""" # Setup a test page @@ -155,23 +41,19 @@ class TestAgentEventLifecycle: # Navigate to test page await browser_session.navigate(httpserver.url_for('/')) - # Patch environment variables to use localhost for CloudSync - with patch.dict( - os.environ, {'BROWSER_USE_CLOUD_URL': 'http://localhost:8000', 'BROWSER_USE_CLOUD_UI_URL': 'http://localhost:3000'} - ): - # Create agent - agent = Agent( - task='Test task', - llm=mock_llm, - browser_session=browser_session, - generate_gif=False, # Don't generate GIF for faster test - ) + # Create agent (environment already set up by conftest.py) + agent = Agent( + task='Test task', + llm=mock_llm, + browser_session=browser_session, + generate_gif=False, # Don't generate GIF for faster test + ) - # Subscribe to all events - agent.eventbus.on('*', event_collector.collect_event) + # Subscribe to all events + agent.eventbus.on('*', event_collector.collect_event) - # Run the agent - history = await agent.run(max_steps=5) + # Run the agent + history = await agent.run(max_steps=5) # Verify we got a successful completion assert history.is_done() @@ -213,30 +95,28 @@ class TestAgentEventLifecycle: assert update_event.id == task_event.id assert update_event.done_output is not None - async def test_agent_with_gif_generation(self, mock_llm, browser_session, event_collector, httpserver: HTTPServer): + @pytest.mark.usefixtures('mock_llm', 'browser_session', 'event_collector', 'httpserver') + async def test_agent_with_gif_generation(self, mock_llm, browser_session, cloud_sync, event_collector, httpserver): """Test that GIF generation triggers CreateAgentOutputFileEvent""" # Setup a test page httpserver.expect_request('/').respond_with_data('
GIF Test
', content_type='text/html') await browser_session.navigate(httpserver.url_for('/')) - # Patch environment variables to use localhost for CloudSync - with patch.dict( - os.environ, {'BROWSER_USE_CLOUD_URL': 'http://localhost:8000', 'BROWSER_USE_CLOUD_UI_URL': 'http://localhost:3000'} - ): - # Create agent with GIF generation - agent = Agent( - task='Test task with GIF', - llm=mock_llm, - browser_session=browser_session, - generate_gif=True, # Enable GIF generation - ) + # Create agent with GIF generation + agent = Agent( + task='Test task with GIF', + llm=mock_llm, + browser_session=browser_session, + generate_gif=True, # Enable GIF generation + cloud_sync=cloud_sync, + ) - # Subscribe to all events - agent.eventbus.on('*', event_collector.collect_event) + # Subscribe to all events + agent.eventbus.on('*', event_collector.collect_event) - # Run the agent - history = await agent.run(max_steps=5) + # Run the agent + _history = await agent.run(max_steps=5) # Verify CreateAgentOutputFileEvent was emitted output_file_events = event_collector.get_events_by_type('CreateAgentOutputFileEvent') @@ -255,7 +135,8 @@ class TestAgentEventLifecycle: assert gif_bytes.startswith(b'GIF87a') or gif_bytes.startswith(b'GIF89a') assert len(gif_bytes) > 100 # Should be a real GIF file - async def test_step_screenshot_capture(self, mock_llm, browser_session, event_collector, httpserver: HTTPServer): + @pytest.mark.usefixtures('mock_llm', 'browser_session', 'event_collector', 'httpserver') + async def test_step_screenshot_capture(self, mock_llm, browser_session, cloud_sync, event_collector, httpserver): """Test that screenshots are captured for each step""" # Setup test page @@ -264,23 +145,20 @@ class TestAgentEventLifecycle: ) await browser_session.navigate(httpserver.url_for('/')) - # Patch environment variables to use localhost for CloudSync - with patch.dict( - os.environ, {'BROWSER_USE_CLOUD_URL': 'http://localhost:8000', 'BROWSER_USE_CLOUD_UI_URL': 'http://localhost:3000'} - ): - # Create agent - agent = Agent( - task='Test screenshot capture', - llm=mock_llm, - browser_session=browser_session, - generate_gif=False, - ) + # Create agent + agent = Agent( + task='Test screenshot capture', + llm=mock_llm, + browser_session=browser_session, + generate_gif=False, + cloud_sync=cloud_sync, + ) - # Subscribe to all events - agent.eventbus.on('*', event_collector.collect_event) + # Subscribe to all events + agent.eventbus.on('*', event_collector.collect_event) - # Run the agent - await agent.run(max_steps=3) + # Run the agent + await agent.run(max_steps=3) # Get all step events step_events = event_collector.get_events_by_type('CreateAgentStepEvent') @@ -304,75 +182,107 @@ class TestAgentEventLifecycle: class TestAgentCloudIntegration: """Test that agent properly integrates with cloud sync service""" - async def test_agent_emits_events_to_cloud(self, agent_with_cloud, mock_cloud_sync): + @pytest.mark.usefixtures('agent_with_cloud', 'event_collector', 'httpserver') + async def test_agent_emits_events_to_cloud(self, agent_with_cloud, event_collector, httpserver): """Test that agent emits all required events to cloud sync.""" + # Set up httpserver to capture events + captured_events = [] + + def capture_events(request): + data = request.get_json() + captured_events.extend(data.get('events', [])) + from werkzeug.wrappers import Response + + return Response( + '{"processed": 1, "failed": 0, "results": [{"success": true}]}', status=200, mimetype='application/json' + ) + + httpserver.expect_request('/api/v1/events', 
method='POST').respond_with_handler(capture_events) + + # Subscribe to eventbus to verify events + agent_with_cloud.eventbus.on('*', event_collector.collect_event) + # Run agent await agent_with_cloud.run() - # Check that events were sent to cloud sync - calls = mock_cloud_sync.handle_event.call_args_list - assert len(calls) >= 4 # At minimum: session, task, step, update + # Verify we have the core event types in eventbus + assert len(event_collector.event_order) >= 4 # At minimum: session, task, step, update + assert 'CreateAgentSessionEvent' in event_collector.event_order + assert 'CreateAgentTaskEvent' in event_collector.event_order + assert 'CreateAgentStepEvent' in event_collector.event_order + assert 'UpdateAgentTaskEvent' in event_collector.event_order - # Verify we have the core event types - event_types = [call.args[0].event_type for call in calls] - assert 'CreateAgentSessionEvent' in event_types - assert 'CreateAgentTaskEvent' in event_types - assert 'CreateAgentStepEvent' in event_types - assert 'UpdateAgentTaskEvent' in event_types + # Verify events were sent to cloud + assert len(captured_events) >= 4 - # Verify event content - session_events = [call for call in calls if call.args[0].event_type == 'CreateAgentSessionEvent'] - task_events = [call for call in calls if call.args[0].event_type == 'CreateAgentTaskEvent'] - step_events = [call for call in calls if call.args[0].event_type == 'CreateAgentStepEvent'] + # Verify event relationships using event_collector + session_events = event_collector.get_events_by_type('CreateAgentSessionEvent') + task_events = event_collector.get_events_by_type('CreateAgentTaskEvent') + step_events = event_collector.get_events_by_type('CreateAgentStepEvent') assert len(session_events) == 1 assert len(task_events) == 1 assert len(step_events) >= 1 # Verify event relationships - session_event = session_events[0].args[0] - task_event = task_events[0].args[0] - step_event = step_events[0].args[0] + session_event = session_events[0] + task_event = task_events[0] + step_event = step_events[0] assert task_event.agent_session_id == session_event.id assert step_event.agent_task_id == task_event.id - async def test_agent_emits_session_start_event(self, agent_with_cloud, mock_cloud_sync): + @pytest.mark.usefixtures('agent_with_cloud', 'event_collector', 'httpserver') + async def test_agent_emits_session_start_event(self, agent_with_cloud, event_collector, httpserver): """Test that agent emits session start event.""" + # Set up httpserver endpoint + httpserver.expect_request('/api/v1/events', method='POST').respond_with_json( + {'processed': 1, 'failed': 0, 'results': [{'success': True}]} + ) + + # Subscribe to events + agent_with_cloud.eventbus.on('*', event_collector.collect_event) + # Run agent await agent_with_cloud.run() # Check that session start event was sent - calls = mock_cloud_sync.handle_event.call_args_list - session_events = [call for call in calls if call.args[0].event_type == 'CreateAgentSessionEvent'] + session_events = event_collector.get_events_by_type('CreateAgentSessionEvent') assert len(session_events) == 1 - event = session_events[0].args[0] + event = session_events[0] assert hasattr(event, 'id') assert hasattr(event, 'browser_session_id') - async def test_agent_emits_task_events(self, agent_with_cloud, mock_cloud_sync): + @pytest.mark.usefixtures('agent_with_cloud', 'event_collector', 'httpserver') + async def test_agent_emits_task_events(self, agent_with_cloud, event_collector, httpserver): """Test that agent emits task events.""" + # 
Set up httpserver endpoint + httpserver.expect_request('/api/v1/events', method='POST').respond_with_json( + {'processed': 1, 'failed': 0, 'results': [{'success': True}]} + ) + + # Subscribe to events + agent_with_cloud.eventbus.on('*', event_collector.collect_event) + # Run agent await agent_with_cloud.run() # Check task events - calls = mock_cloud_sync.handle_event.call_args_list - - # Should have CreateAgentTaskEvent - create_task_events = [call for call in calls if call.args[0].event_type == 'CreateAgentTaskEvent'] + create_task_events = event_collector.get_events_by_type('CreateAgentTaskEvent') assert len(create_task_events) == 1 - create_event = create_task_events[0].args[0] + create_event = create_task_events[0] assert create_event.task == 'Test task' assert hasattr(create_event, 'agent_session_id') # Should have UpdateAgentTaskEvent when done - update_task_events = [call for call in calls if call.args[0].event_type == 'UpdateAgentTaskEvent'] + update_task_events = event_collector.get_events_by_type('UpdateAgentTaskEvent') assert len(update_task_events) >= 1 + @pytest.mark.usefixtures('browser_session') async def test_cloud_sync_disabled(self, browser_session): """Test that cloud sync can be disabled.""" - with patch.dict(os.environ, {'BROWSERUSE_CLOUD_SYNC': 'false'}): + with patch.dict(os.environ, {'BROWSER_USE_CLOUD_SYNC': 'false'}): agent = Agent( task='Test task', llm=create_mock_llm(), @@ -384,75 +294,75 @@ class TestAgentCloudIntegration: # Run agent - should work without cloud sync await agent.run() - async def test_agent_error_resilience(self, agent_with_cloud, mock_cloud_sync): + @pytest.mark.usefixtures('agent_with_cloud', 'httpserver') + async def test_agent_error_resilience(self, agent_with_cloud, httpserver): """Test that agent continues working even if cloud sync fails.""" - # Make cloud sync fail - mock_cloud_sync.handle_event.side_effect = Exception('Cloud sync error') - # Run agent - should not raise exception + # Make cloud endpoint fail + def fail_handler(request): + from werkzeug.wrappers import Response + + return Response('Server error', status=500, mimetype='text/plain') + + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(fail_handler) + + # Run agent - should not raise exception despite cloud sync failures result = await agent_with_cloud.run() # Agent should complete successfully despite sync failures assert result is not None assert result.is_done() - # Verify cloud sync was attempted - assert mock_cloud_sync.handle_event.call_count > 0 - - async def test_session_id_persistence(self, browser_session): + @pytest.mark.usefixtures('browser_session', 'cloud_sync', 'event_collector', 'httpserver') + async def test_session_id_persistence(self, browser_session, cloud_sync, event_collector, httpserver): """Test that agent session ID persists across runs.""" - mock_sync = Mock(spec=CloudSync) - mock_sync.send_event = AsyncMock() - mock_sync.handle_event = AsyncMock() - mock_sync._authenticated = True + # Set up httpserver endpoint + httpserver.expect_request('/api/v1/events', method='POST').respond_with_json( + {'processed': 1, 'failed': 0, 'results': [{'success': True}]} + ) - with patch('browser_use.sync.CloudSync', return_value=mock_sync): - with patch.dict(os.environ, {'BROWSERUSE_CLOUD_SYNC': 'true'}): - # Create first agent - agent1 = Agent( - task='First task', - llm=create_mock_llm(), - browser_session=browser_session, - ) - agent1.cloud_sync = mock_sync + # Create first agent + agent1 = Agent( + task='First task', + 
llm=create_mock_llm(), + browser_session=browser_session, + cloud_sync=cloud_sync, + ) + agent1.eventbus.on('*', event_collector.collect_event) - # Run first agent - await agent1.run() + # Run first agent + await agent1.run() - # Get session ID from first run - session_calls = [ - call for call in mock_sync.handle_event.call_args_list if call.args[0].event_type == 'CreateAgentSessionEvent' - ] - session_id_1 = session_calls[0].args[0].id + # Get session ID from first run + session_events = event_collector.get_events_by_type('CreateAgentSessionEvent') + assert len(session_events) == 1 + session_id_1 = session_events[0].id - # Create second agent (will have different session ID) - agent2 = Agent( - task='Second task', - llm=create_mock_llm(), - browser_session=browser_session, - ) - agent2.cloud_sync = mock_sync + # Clear event collector + event_collector.clear() - # Clear previous calls - mock_sync.handle_event.reset_mock() + # Create second agent (will have different session ID) + agent2 = Agent( + task='Second task', + llm=create_mock_llm(), + browser_session=browser_session, + cloud_sync=cloud_sync, + ) + agent2.eventbus.on('*', event_collector.collect_event) - # Run second agent - await agent2.run() + # Run second agent + await agent2.run() - # Should create new session for new agent - session_calls_2 = [ - call for call in mock_sync.handle_event.call_args_list if call.args[0].event_type == 'CreateAgentSessionEvent' - ] - assert len(session_calls_2) == 1 # New session created + # Should create new session for new agent + session_events_2 = event_collector.get_events_by_type('CreateAgentSessionEvent') + assert len(session_events_2) == 1 # New session created + session_id_2 = session_events_2[0].id - # Should create new task with new session ID - task_calls = [ - call for call in mock_sync.handle_event.call_args_list if call.args[0].event_type == 'CreateAgentTaskEvent' - ] - assert len(task_calls) == 1 - session_id_2 = session_calls_2[0].args[0].id - assert task_calls[0].args[0].agent_session_id == session_id_2 - assert session_id_2 != session_id_1 # Different session IDs + # Should create new task with new session ID + task_events = event_collector.get_events_by_type('CreateAgentTaskEvent') + assert len(task_events) == 1 + assert task_events[0].agent_session_id == session_id_2 + assert session_id_2 != session_id_1 # Different session IDs class TestEventValidation: @@ -475,6 +385,10 @@ class TestEventValidation: agent_session_id='0683fb03-c5da-79c9-8000-d3a39c47c651', task='test', llm_model='gpt-4o', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, ), CreateAgentStepEvent( user_id='0683fb03-c5da-79c9-8000-d3a39c47c650', @@ -484,6 +398,7 @@ class TestEventValidation: memory='mem', next_goal='next', actions=[], + screenshot_url='data:image/png;...', ), ] @@ -512,12 +427,23 @@ class TestEventValidation: agent_session_id='0683fb03-c5da-79c9-8000-d3a39c47c659', llm_model='test-model', task=long_task, + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, ) def test_event_type_assignment(self): """Test that event_type is properly set and validated""" event = CreateAgentTaskEvent( - user_id='test', agent_session_id='0683fb03-c5da-79c9-8000-d3a39c47c659', llm_model='test-model', task='test' + user_id='test', + agent_session_id='0683fb03-c5da-79c9-8000-d3a39c47c659', + llm_model='test-model', + task='test', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, ) # Event type should be automatically set 
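Note: the rewritten tests in tests/ci/test_sync_agent_events.py above import create_mock_llm from tests.ci.conftest and rely on shared mock_llm, browser_session, event_collector, cloud_sync, and agent_with_cloud fixtures, but that conftest.py is not part of this diff. As a minimal sketch (fixture names are taken from the test signatures and from the local fixtures this diff removes; the bodies are assumptions, not the actual conftest.py), the shared fixtures might look roughly like this:

# Hypothetical sketch of tests/ci/conftest.py fixtures assumed by the tests above;
# the real conftest.py is not shown in this diff.
import pytest
from bubus import BaseEvent

from browser_use.sync.service import CloudSync


@pytest.fixture
def event_collector():
	"""Collect every event emitted on the agent's eventbus, in order."""

	class EventCollector:
		def __init__(self):
			self.events: list[BaseEvent] = []
			self.event_order: list[str] = []

		async def collect_event(self, event: BaseEvent):
			self.events.append(event)
			self.event_order.append(event.event_type)
			return 'collected'

		def get_events_by_type(self, event_type: str) -> list[BaseEvent]:
			return [e for e in self.events if e.event_type == event_type]

		def clear(self):
			self.events.clear()
			self.event_order.clear()

	return EventCollector()


@pytest.fixture
def cloud_sync(httpserver):
	"""Real CloudSync instance pointed at the local pytest-httpserver, with auth disabled."""
	return CloudSync(base_url=httpserver.url_for(''), enable_auth=False)

An agent_with_cloud fixture in the same file would presumably construct an Agent with llm=create_mock_llm(), the shared browser_session, and cloud_sync=cloud_sync, matching how the tests above build agents inline.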
diff --git a/tests/ci/test_sync_client.py b/tests/ci/test_sync_client.py index c5e05bd37..adf400f6f 100644 --- a/tests/ci/test_sync_client.py +++ b/tests/ci/test_sync_client.py @@ -3,13 +3,13 @@ import os import tempfile from pathlib import Path -from unittest.mock import patch import httpx import pytest from bubus import BaseEvent from pytest_httpserver import HTTPServer +from browser_use.agent.cloud_events import CreateAgentTaskEvent from browser_use.sync.auth import TEMP_USER_ID, DeviceAuthClient from browser_use.sync.service import CloudSync @@ -21,24 +21,10 @@ def temp_config_dir(): temp_dir = Path(tmpdir) / '.config' / 'browseruse' temp_dir.mkdir(parents=True, exist_ok=True) - # Temporarily replace the config dir - import browser_use.sync.auth - import browser_use.utils - - original_auth = getattr(browser_use.sync.auth, 'BROWSER_USE_CONFIG_DIR', None) - original_utils = getattr(browser_use.utils, 'BROWSER_USE_CONFIG_DIR', None) - - browser_use.sync.auth.BROWSER_USE_CONFIG_DIR = temp_dir - browser_use.utils.BROWSER_USE_CONFIG_DIR = temp_dir + os.environ['BROWSER_USE_CONFIG_DIR'] = str(temp_dir) yield temp_dir - # Restore original - if original_auth: - browser_use.sync.auth.BROWSER_USE_CONFIG_DIR = original_auth - if original_utils: - browser_use.utils.BROWSER_USE_CONFIG_DIR = original_utils - @pytest.fixture async def http_client(httpserver: HTTPServer): @@ -52,26 +38,23 @@ class TestCloudSyncInit: async def test_init_with_auth_enabled(self, temp_config_dir): """Test CloudSync initialization with auth enabled.""" - # Set test environment variable - with patch.dict(os.environ, {'BROWSER_USE_CLOUD_URL': 'http://localhost:8000'}): - service = CloudSync(enable_auth=True) + service = CloudSync(enable_auth=True, base_url='http://localhost:8000') - assert service.base_url == 'http://localhost:8000' - assert service.enable_auth is True - assert service.auth_client is not None - assert isinstance(service.auth_client, DeviceAuthClient) - assert service.pending_events == [] - assert service.session_id is None + assert service.base_url == 'http://localhost:8000' + assert service.enable_auth is True + assert service.auth_client is not None + assert isinstance(service.auth_client, DeviceAuthClient) + assert service.pending_events == [] + assert service.session_id is None async def test_init_with_auth_disabled(self, temp_config_dir): """Test CloudSync initialization with auth disabled.""" - with patch.dict(os.environ, {'BROWSER_USE_CLOUD_URL': 'http://localhost:8000'}): - service = CloudSync(enable_auth=False) + service = CloudSync(enable_auth=False, base_url='http://localhost:8000') - assert service.base_url == 'http://localhost:8000' - assert service.enable_auth is False - assert service.auth_client is None - assert service.pending_events == [] + assert service.base_url == 'http://localhost:8000' + assert service.enable_auth is False + assert service.auth_client is None + assert service.pending_events == [] class TestCloudSyncEventHandling: @@ -107,10 +90,21 @@ class TestCloudSyncEventHandling: return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json') - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request) # Send event - await authenticated_sync.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task', priority='high')) + await authenticated_sync.handle_event( + CreateAgentTaskEvent( + 
agent_session_id='test-session', + llm_model='test-model', + task='Test task', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Verify forwarding assert len(requests) == 1 @@ -122,22 +116,32 @@ class TestCloudSyncEventHandling: assert event['user_id'] == 'test-user-123' # BaseEvent creates event_type attribute, plus our custom data as attributes assert event['task'] == 'Test task' - assert event['priority'] == 'high' async def test_event_queueing_unauthenticated(self, httpserver: HTTPServer, unauthenticated_sync): """Test event queueing when unauthenticated.""" # Server returns 401 - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_json({'error': 'unauthorized'}, status=401) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_json({'error': 'unauthorized'}, status=401) # Send event - await unauthenticated_sync.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Queued task')) + await unauthenticated_sync.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Queued task', + user_id=TEMP_USER_ID, + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Event should be queued assert len(unauthenticated_sync.pending_events) == 1 queued_event = unauthenticated_sync.pending_events[0] - assert queued_event['event_type'] == 'CreateAgentTaskEvent' - assert queued_event['user_id'] == TEMP_USER_ID - assert queued_event['task'] == 'Queued task' + assert queued_event.event_type == 'CreateAgentTaskEvent' + assert queued_event.user_id == TEMP_USER_ID + assert queued_event.task == 'Queued task' async def test_event_user_id_injection_pre_auth(self, httpserver: HTTPServer, unauthenticated_sync): """Test that temp user ID is injected for pre-auth events.""" @@ -149,10 +153,21 @@ class TestCloudSyncEventHandling: return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json') - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request) # Send event without user_id - await unauthenticated_sync.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Pre-auth task')) + await unauthenticated_sync.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Pre-auth task', + user_id=TEMP_USER_ID, + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Verify temp user ID was injected assert len(requests) == 1 @@ -185,21 +200,31 @@ class TestCloudSyncRetryLogic: return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json') - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request) # Manually add pending events (simulating 401 scenario) sync_with_auth.pending_events.extend( [ - { - 'event_type': 'CreateAgentTaskEvent', - 'task': 'Pending task 1', - 'user_id': TEMP_USER_ID, - }, - { - 'event_type': 'CreateAgentTaskEvent', - 'task': 'Pending task 2', - 'user_id': TEMP_USER_ID, - }, + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Pending task 1', + user_id=TEMP_USER_ID, + done_output=None, + user_feedback_type=None, + user_comment=None, + 
gif_url=None, + ), + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Pending task 2', + user_id=TEMP_USER_ID, + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ), ] ) @@ -219,10 +244,21 @@ class TestCloudSyncRetryLogic: async def test_backend_error_resilience(self, httpserver: HTTPServer, sync_with_auth): """Test resilience to backend errors.""" # Server returns 500 error - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_data('Internal Server Error', status=500) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_data('Internal Server Error', status=500) # Should not raise exception - await sync_with_auth.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Task during outage')) + await sync_with_auth.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Task during outage', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Events should not be queued for 500 errors (only 401) assert len(sync_with_auth.pending_events) == 0 @@ -233,7 +269,18 @@ class TestCloudSyncRetryLogic: sync_with_auth.base_url = 'http://localhost:99999' # Invalid port # Should not raise exception - await sync_with_auth.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Task during network error')) + await sync_with_auth.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Task during network error', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Should handle gracefully without crashing @@ -249,12 +296,23 @@ class TestCloudSyncRetryLogic: return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json') - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request) # Send multiple events concurrently tasks = [] for i in range(5): - task = sync_with_auth.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task=f'Concurrent task {i}')) + task = sync_with_auth.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task=f'Concurrent task {i}', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) tasks.append(task) await asyncio.gather(*tasks) @@ -286,7 +344,7 @@ class TestCloudSyncBackendCommunication: assert len(data['events']) == 1 event = data['events'][0] - required_fields = ['event_type', 'event_id', 'event_at', 'event_schema', 'data'] + required_fields = ['event_type', 'event_id', 'event_created_at', 'event_schema', 'user_id'] for field in required_fields: assert field in event, f'Missing required field: {field}' @@ -294,7 +352,7 @@ class TestCloudSyncBackendCommunication: return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json') - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request) # Create authenticated service auth = DeviceAuthClient(base_url=httpserver.url_for('')) @@ -305,7 +363,18 @@ class TestCloudSyncBackendCommunication: service.auth_client = auth service.session_id = 
'test-session-id' - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Format validation test')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Format validation test', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) assert len(requests) == 1 @@ -324,7 +393,7 @@ class TestCloudSyncBackendCommunication: return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json') - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request) # Test authenticated request auth = DeviceAuthClient(base_url=httpserver.url_for('')) @@ -334,7 +403,18 @@ class TestCloudSyncBackendCommunication: service = CloudSync(base_url=httpserver.url_for(''), enable_auth=True) service.auth_client = auth - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Auth header test')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Auth header test', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Check auth header was included assert len(requests) == 1 @@ -346,7 +426,18 @@ class TestCloudSyncBackendCommunication: requests.clear() service.auth_client = DeviceAuthClient(base_url=httpserver.url_for('')) # No credentials - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='No auth test')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='No auth test', + user_id='', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Check no auth header assert len(requests) == 1 @@ -368,7 +459,18 @@ class TestCloudSyncErrorHandling: sync_service.base_url = 'http://10.255.255.1' # Non-routable IP for timeout # Should not raise exception - await sync_service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Timeout test')) + await sync_service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Timeout test', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) async def test_malformed_event_handling(self, httpserver: HTTPServer, sync_service): """Test handling of events that can't be serialized.""" @@ -389,20 +491,42 @@ class TestCloudSyncErrorHandling: error_codes = [400, 403, 404, 429, 500, 502, 503] for status_code in error_codes: - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_json( + httpserver.expect_request('/api/v1/events', method='POST').respond_with_json( {'error': f'Test error {status_code}'}, status=status_code ) # Should not raise exception - await sync_service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task=f'Error {status_code} test')) + await sync_service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task=f'Error {status_code} test', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) async def test_invalid_response_handling(self, httpserver: HTTPServer, sync_service): """Test handling of invalid server 
responses.""" # Return invalid JSON - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_data('Not JSON', status=200) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_data('Not JSON', status=200) # Should not raise exception - await sync_service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Invalid response test')) + await sync_service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Invalid response test', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) async def test_event_with_restricted_attributes(self, httpserver: HTTPServer, sync_service): """Test handling events that don't allow user_id attribute.""" @@ -415,7 +539,7 @@ class TestCloudSyncErrorHandling: event_type: str = 'RestrictedEvent' data: str = 'test' - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_json({'processed': 1}, status=200) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_json({'processed': 1}, status=200) # Should not raise exception - will log debug message about not being able to set user_id await sync_service.handle_event(RestrictedEvent()) @@ -441,12 +565,23 @@ class TestCloudSyncErrorHandling: return Response('{"processed": 1}', status=200, mimetype='application/json') - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(handler) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(handler) # Send 10 events concurrently tasks = [] for i in range(10): - task = sync_service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task=f'Concurrent error test {i}')) + task = sync_service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task=f'Concurrent error test {i}', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) tasks.append(task) # All should complete without raising diff --git a/tests/ci/test_sync_client_auth.py b/tests/ci/test_sync_client_auth.py index 6f8055a26..7e51e329a 100644 --- a/tests/ci/test_sync_client_auth.py +++ b/tests/ci/test_sync_client_auth.py @@ -6,7 +6,6 @@ import json import tempfile from datetime import datetime from pathlib import Path -from unittest.mock import patch import anyio import httpx @@ -17,37 +16,26 @@ from pytest_httpserver import HTTPServer # Load environment variables before any imports load_dotenv() -from bubus import BaseEvent +from browser_use.agent.cloud_events import CreateAgentSessionEvent, CreateAgentTaskEvent from browser_use.sync.auth import TEMP_USER_ID, DeviceAuthClient from browser_use.sync.service import CloudSync -# Define config dir for tests -# BROWSER_USE_CONFIG_DIR = Path.home() / ".config" / "browseruse" -BROWSER_USE_CONFIG_DIR = Path(tempfile.mkdtemp()) / '.config' / 'browseruse' +# Define config dir for tests - not needed anymore since we'll use env vars @pytest.fixture -def temp_config_dir(): +def temp_config_dir(monkeypatch): """Create temporary config directory.""" with tempfile.TemporaryDirectory() as tmpdir: temp_dir = Path(tmpdir) / '.config' / 'browseruse' temp_dir.mkdir(parents=True, exist_ok=True) - # Temporarily replace the config dir - original = BROWSER_USE_CONFIG_DIR - import browser_use.sync.auth - import browser_use.utils - - browser_use.sync.auth.BROWSER_USE_CONFIG_DIR = temp_dir - 
browser_use.utils.BROWSER_USE_CONFIG_DIR = temp_dir + # Use monkeypatch to set the environment variable + monkeypatch.setenv('BROWSER_USE_CONFIG_DIR', str(temp_dir)) yield temp_dir - # Restore original - browser_use.sync.auth.BROWSER_USE_CONFIG_DIR = original - browser_use.utils.BROWSER_USE_CONFIG_DIR = original - @pytest.fixture async def http_client(httpserver: HTTPServer): @@ -59,23 +47,23 @@ async def http_client(httpserver: HTTPServer): class TestDeviceAuthClient: """Test DeviceAuthClient class.""" - async def test_init_creates_config_dir(self, temp_config_dir): + async def test_init_creates_config_dir(self, temp_config_dir, httpserver): """Test that initialization creates config directory.""" - auth = DeviceAuthClient() + auth = DeviceAuthClient(base_url=httpserver.url_for('')) assert temp_config_dir.exists() assert (temp_config_dir / 'cloud_auth.json').exists() is False - async def test_load_credentials_no_file(self, temp_config_dir): + async def test_load_credentials_no_file(self, temp_config_dir, httpserver): """Test loading credentials when file doesn't exist.""" - auth = DeviceAuthClient() + auth = DeviceAuthClient(base_url=httpserver.url_for('')) # When no file exists, auth_config should have no token/user_id assert auth.auth_config.api_token is None assert auth.auth_config.user_id is None assert not auth.is_authenticated - async def test_save_and_load_credentials(self, temp_config_dir): + async def test_save_and_load_credentials(self, temp_config_dir, httpserver): """Test saving and loading credentials.""" - auth = DeviceAuthClient() + auth = DeviceAuthClient(base_url=httpserver.url_for('')) # Update auth config and save auth.auth_config.api_token = 'test-key-123' @@ -84,7 +72,7 @@ class TestDeviceAuthClient: auth.auth_config.save_to_file() # Load in a new instance - auth2 = DeviceAuthClient() + auth2 = DeviceAuthClient(base_url=httpserver.url_for('')) assert auth2.auth_config.api_token == 'test-key-123' assert auth2.auth_config.user_id == 'test-user-123' assert auth2.is_authenticated @@ -94,9 +82,9 @@ class TestDeviceAuthClient: stat = (temp_config_dir / 'cloud_auth.json').stat() assert oct(stat.st_mode)[-3:] == '600' - async def test_is_authenticated(self, temp_config_dir): + async def test_is_authenticated(self, temp_config_dir, httpserver): """Test authentication status check.""" - auth = DeviceAuthClient() + auth = DeviceAuthClient(base_url=httpserver.url_for('')) # Not authenticated initially assert auth.is_authenticated is False @@ -107,12 +95,12 @@ class TestDeviceAuthClient: auth.auth_config.save_to_file() # Reload to verify persistence - auth2 = DeviceAuthClient() + auth2 = DeviceAuthClient(base_url=httpserver.url_for('')) assert auth2.is_authenticated is True - async def test_get_credentials(self, temp_config_dir): + async def test_get_credentials(self, temp_config_dir, httpserver): """Test getting credentials.""" - auth = DeviceAuthClient() + auth = DeviceAuthClient(base_url=httpserver.url_for('')) # No credentials initially assert auth.api_token is None @@ -299,9 +287,9 @@ class TestDeviceAuthClient: assert result is None # Should timeout and return None assert not auth.is_authenticated - async def test_logout(self, temp_config_dir): + async def test_logout(self, temp_config_dir, httpserver): """Test logout functionality.""" - auth = DeviceAuthClient() + auth = DeviceAuthClient(base_url=httpserver.url_for('')) # Save credentials directly using auth_config auth.auth_config.api_token = 'test-key' @@ -319,7 +307,7 @@ class TestDeviceAuthClient: assert 
(temp_config_dir / 'cloud_auth.json').exists() # Verify the file contains empty credentials - auth2 = DeviceAuthClient() + auth2 = DeviceAuthClient(base_url=httpserver.url_for('')) assert auth2.auth_config.api_token is None assert auth2.auth_config.user_id is None @@ -327,14 +315,14 @@ class TestDeviceAuthClient: class TestCloudSync: """Test CloudSync class.""" - async def test_init(self, temp_config_dir): + async def test_init(self, temp_config_dir, httpserver): """Test CloudSync initialization.""" service = CloudSync( - base_url='https://cloud.browser-use.com', + base_url=httpserver.url_for(''), enable_auth=True, ) - assert service.base_url == 'https://cloud.browser-use.com' + assert service.base_url == httpserver.url_for('') assert service.enable_auth is True assert service.auth_client is not None assert isinstance(service.auth_client, DeviceAuthClient) @@ -355,7 +343,7 @@ class TestCloudSync: return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json') - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request) # Create authenticated service auth = DeviceAuthClient(base_url=httpserver.url_for('')) @@ -367,12 +355,18 @@ class TestCloudSync: service.session_id = 'test-session-id' # Send event - event_data = { - 'task': 'Test task', - 'status': 'running', - } - - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', **event_data)) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Test task', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Check request was made assert len(requests) == 1 @@ -388,7 +382,6 @@ class TestCloudSync: assert event['event_type'] == 'CreateAgentTaskEvent' assert event['user_id'] == 'test-user-123' assert event['task'] == 'Test task' - assert event['status'] == 'running' async def test_send_event_pre_auth(self, httpserver: HTTPServer, temp_config_dir): """Test sending event before authentication.""" @@ -405,7 +398,7 @@ class TestCloudSync: return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json') - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(capture_request) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(capture_request) # Create unauthenticated service auth = DeviceAuthClient(base_url=httpserver.url_for('')) @@ -416,12 +409,18 @@ class TestCloudSync: service.session_id = 'test-session-id' # Send event - event_data = { - 'task': 'Test task', - 'status': 'running', - } - - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', **event_data)) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Test task', + user_id=TEMP_USER_ID, + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Check request was made without auth header assert len(requests) == 1 @@ -435,7 +434,6 @@ class TestCloudSync: assert event['event_type'] == 'CreateAgentTaskEvent' assert event['user_id'] == TEMP_USER_ID assert event['task'] == 'Test task' - assert event['status'] == 'running' async def test_authenticate_and_resend(self, httpserver: HTTPServer, temp_config_dir): """Test authentication flow with pre-auth event resending.""" @@ -461,7 
+459,7 @@ class TestCloudSync: # Subsequent requests: success return Response('{"processed": 1, "failed": 0}', status=200, mimetype='application/json') - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_handler(handle_events_request) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_handler(handle_events_request) # Create service with unauthenticated auth client auth = DeviceAuthClient(base_url=httpserver.url_for('')) @@ -472,12 +470,23 @@ class TestCloudSync: service.session_id = 'test-session-id' # Send pre-auth event (should get 401 and be queued) - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Pre-auth task')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Pre-auth task', + user_id=TEMP_USER_ID, + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Event should be in pending_events since we got 401 assert len(service.pending_events) == 1 - assert service.pending_events[0]['task'] == 'Pre-auth task' - assert service.pending_events[0]['user_id'] == TEMP_USER_ID + assert hasattr(service.pending_events[0], 'task') and service.pending_events[0].task == 'Pre-auth task' # type: ignore + assert hasattr(service.pending_events[0], 'user_id') and service.pending_events[0].user_id == TEMP_USER_ID # type: ignore # Now authenticate the auth client auth.auth_config.api_token = 'test-api-key' @@ -503,7 +512,7 @@ class TestCloudSync: async def test_error_handling(self, httpserver: HTTPServer, temp_config_dir): """Test error handling during event sending.""" # Set up server to return 500 error - httpserver.expect_request('/api/v1/events/', method='POST').respond_with_data('Internal Server Error', status=500) + httpserver.expect_request('/api/v1/events', method='POST').respond_with_data('Internal Server Error', status=500) # Create service with real auth auth = DeviceAuthClient(base_url=httpserver.url_for('')) @@ -515,7 +524,18 @@ class TestCloudSync: service.session_id = 'test-session-id' # Send event - should not raise exception but handle gracefully - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Test task', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Should handle error gracefully without crashing @@ -561,10 +581,8 @@ class TestCloudSync: content = '\n'.join(json.dumps(event) for event in events) + '\n' await anyio.Path(wal_path).write_text(content) - # Patch BROWSER_USE_CONFIG_DIR to point to our temp directory - with patch('browser_use.utils.BROWSER_USE_CONFIG_DIR', temp_config_dir): - # Call the method under test - await service._update_wal_user_ids(service.session_id) + # Call the method under test (temp_config_dir fixture already sets the env var) + await service._update_wal_user_ids(service.session_id) # Read back the updated file and verify changes content = await anyio.Path(wal_path).read_text() @@ -645,7 +663,7 @@ class TestIntegration: # Set up events endpoint httpserver.expect_request( - '/api/v1/events/', + '/api/v1/events', method='POST', ).respond_with_json({'processed': 1, 'failed': 0}) @@ -654,17 +672,36 @@ class TestIntegration: service.session_id = 'test-session-id' # Send pre-auth event - await 
service.handle_event(BaseEvent(event_type='CreateAgentSessionEvent', started_at=datetime.utcnow().isoformat())) + await service.handle_event( + CreateAgentSessionEvent( + user_id=TEMP_USER_ID, + browser_session_id='test-browser-session', + browser_session_live_url='http://example.com/live', + browser_session_cdp_url='ws://example.com/cdp', + ) + ) # Authenticate authenticated = await service.authenticate(show_instructions=False) assert authenticated is True + assert service.auth_client is not None assert service.auth_client.is_authenticated assert service.auth_client.api_token == 'test-api-key' assert service.auth_client.user_id == 'test-user-123' # Send authenticated event - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Authenticated task')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Authenticated task', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Verify auth was saved auth_file = temp_config_dir / 'cloud_auth.json' @@ -715,7 +752,7 @@ class TestAuthResilience: # Now simulate token expiry by returning 401 errors httpserver.expect_request( - '/api/v1/events/', + '/api/v1/events', method='POST', ).respond_with_json({'error': 'unauthorized', 'detail': 'Token expired'}, status=401) @@ -726,7 +763,18 @@ class TestAuthResilience: service.auth_client = auth # Send event - should not raise exception even though token is expired - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task after token expiry')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Test task after token expiry', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Agent should continue functioning despite sync failure assert True # No exception raised @@ -753,12 +801,23 @@ class TestAuthResilience: # Set up events endpoint to handle unauthenticated requests httpserver.expect_request( - '/api/v1/events/', + '/api/v1/events', method='POST', ).respond_with_json({'processed': 1, 'failed': 0}) # Should be able to send events without auth (pre-auth mode) - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task without auth')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Test task without auth', + user_id='', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) async def test_server_downtime_resilience(self, httpserver: HTTPServer, http_client, temp_config_dir): """Test that server downtime doesn't break the agent.""" @@ -777,7 +836,18 @@ class TestAuthResilience: # Should be able to send events even when server is down # They will be queued locally - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task during server downtime')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Test task during server downtime', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) async def test_excessive_event_queue_handling(self, httpserver: HTTPServer, http_client, temp_config_dir): """Test that excessive event queuing doesn't break the agent.""" @@ -790,7 +860,18 @@ 
class TestAuthResilience: # Send many events while server is down (no responses configured) for i in range(100): - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task=f'Test task {i}')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task=f'Test task {i}', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) # Agent should still be functioning assert True # No memory issues or crashes @@ -813,7 +894,7 @@ class TestAuthResilience: # Set up another malformed response for events httpserver.expect_request( - '/api/v1/events/', + '/api/v1/events', method='POST', ).respond_with_data('malformed response', status=500) @@ -823,4 +904,15 @@ class TestAuthResilience: service.auth_client = auth # Should handle malformed event response gracefully - await service.handle_event(BaseEvent(event_type='CreateAgentTaskEvent', task='Test task with malformed response')) + await service.handle_event( + CreateAgentTaskEvent( + agent_session_id='test-session', + llm_model='test-model', + task='Test task with malformed response', + user_id='test-user-123', + done_output=None, + user_feedback_type=None, + user_comment=None, + gif_url=None, + ) + ) diff --git a/tests/debug_page_structure.py b/tests/debug_page_structure.py deleted file mode 100644 index 43d9658d9..000000000 --- a/tests/debug_page_structure.py +++ /dev/null @@ -1,124 +0,0 @@ -import asyncio -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from browser_use.browser.browser import Browser, BrowserConfig -from browser_use.browser.context import BrowserContext - - -async def analyze_page_structure(url: str): - """Analyze and print the structure of a webpage with enhanced debugging""" - browser = Browser( - config=BrowserConfig( - headless=False, # Set to True if you don't need to see the browser - ), - user_data_dir=None, - ) - - context = BrowserContext(browser=browser) - - try: - async with context as ctx: - # Navigate to the URL - page = await ctx.get_current_page() - await page.goto(url) - await page.wait_for_load_state('networkidle') - - # Get viewport dimensions - viewport_info = await page.evaluate("""() => { - return { - viewport: { - width: window.innerWidth, - height: window.innerHeight, - scrollX: window.scrollX, - scrollY: window.scrollY - } - } - }""") - - print('\nViewport Information:') - print(f'Width: {viewport_info["viewport"]["width"]}') - print(f'Height: {viewport_info["viewport"]["height"]}') - print(f'ScrollX: {viewport_info["viewport"]["scrollX"]}') - print(f'ScrollY: {viewport_info["viewport"]["scrollY"]}') - - # Enhanced debug information for cookie consent and fixed position elements - debug_info = await page.evaluate("""() => { - function getElementInfo(element) { - const rect = element.getBoundingClientRect(); - const style = window.getComputedStyle(element); - return { - tag: element.tagName.toLowerCase(), - id: element.id, - className: element.className, - position: style.position, - rect: { - top: rect.top, - right: rect.right, - bottom: rect.bottom, - left: rect.left, - width: rect.width, - height: rect.height - }, - isFixed: style.position === 'fixed', - isSticky: style.position === 'sticky', - zIndex: style.zIndex, - visibility: style.visibility, - display: style.display, - opacity: style.opacity - }; - } - - // Find cookie-related elements - const cookieElements = 
Array.from(document.querySelectorAll('[id*="cookie"], [id*="consent"], [class*="cookie"], [class*="consent"]')); - const fixedElements = Array.from(document.querySelectorAll('*')).filter(el => { - const style = window.getComputedStyle(el); - return style.position === 'fixed' || style.position === 'sticky'; - }); - - return { - cookieElements: cookieElements.map(getElementInfo), - fixedElements: fixedElements.map(getElementInfo) - }; - }""") - - print('\nCookie-related Elements:') - for elem in debug_info['cookieElements']: - print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}') - print(f'Position: {elem["position"]}') - print(f'Rect: {elem["rect"]}') - print(f'Z-Index: {elem["zIndex"]}') - print(f'Visibility: {elem["visibility"]}') - print(f'Display: {elem["display"]}') - print(f'Opacity: {elem["opacity"]}') - - print('\nFixed/Sticky Position Elements:') - for elem in debug_info['fixedElements']: - print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}') - print(f'Position: {elem["position"]}') - print(f'Rect: {elem["rect"]}') - print(f'Z-Index: {elem["zIndex"]}') - - print(f'\nPage Structure for {url}:\n') - structure = await ctx.get_page_structure() - print(structure) - - input('Press Enter to close the browser...') - finally: - await browser.close() - - -if __name__ == '__main__': - # You can modify this URL to analyze different pages - - urls = [ - 'https://www.mlb.com/yankees/stats/', - 'https://immobilienscout24.de', - 'https://www.zeiss.com/career/en/job-search.html?page=1', - 'https://www.zeiss.com/career/en/job-search.html?page=1', - 'https://reddit.com', - ] - for url in urls: - asyncio.run(analyze_page_structure(url)) diff --git a/tests/extraction_test.py b/tests/old/extraction_test.py similarity index 93% rename from tests/extraction_test.py rename to tests/old/extraction_test.py index 38890d8c9..31c125ae6 100644 --- a/tests/extraction_test.py +++ b/tests/old/extraction_test.py @@ -7,6 +7,7 @@ from langchain_openai import ChatOpenAI from browser_use.agent.prompts import AgentMessagePrompt from browser_use.browser import BrowserProfile, BrowserSession from browser_use.dom.service import DomService +from browser_use.filesystem.file_system import FileSystem def count_string_tokens(string: str, model: str) -> tuple[int, float]: @@ -101,7 +102,7 @@ async def test_focus_vs_all_elements(): # print(all_elements_state.element_tree.clickable_elements_to_string()) prompt = AgentMessagePrompt( browser_state_summary=all_elements_state, - result=None, + file_system=FileSystem(working_dir='./tmp'), include_attributes=DEFAULT_INCLUDE_ATTRIBUTES, step_info=None, ) @@ -110,9 +111,15 @@ async def test_focus_vs_all_elements(): user_message = prompt.get_user_message(use_vision=False).content os.makedirs('./tmp', exist_ok=True) async with await anyio.open_file('./tmp/user_message.txt', 'w', encoding='utf-8') as f: - await f.write(user_message) + if isinstance(user_message, str): + await f.write(user_message) + else: + await f.write(str(user_message)) - token_count, price = count_string_tokens(user_message, model='gpt-4o') + if isinstance(user_message, str): + token_count, price = count_string_tokens(user_message, model='gpt-4o') + else: + token_count, price = count_string_tokens(str(user_message), model='gpt-4o') print(f'Prompt token count: {token_count}, price: {round(price, 4)} USD') print('User message written to ./tmp/user_message.txt') diff --git a/tests/httpx_client_test.py b/tests/old/httpx_client_test.py similarity index 100% rename from tests/httpx_client_test.py 
rename to tests/old/httpx_client_test.py
diff --git a/tests/process_dom_test.py b/tests/old/process_dom_test.py
similarity index 100%
rename from tests/process_dom_test.py
rename to tests/old/process_dom_test.py
diff --git a/tests/screenshot_test.py b/tests/old/screenshot_test.py
similarity index 100%
rename from tests/screenshot_test.py
rename to tests/old/screenshot_test.py
diff --git a/tests/sync_live.py b/tests/old/sync_live.py
similarity index 94%
rename from tests/sync_live.py
rename to tests/old/sync_live.py
index 24a51e72d..6d39cc760 100644
--- a/tests/sync_live.py
+++ b/tests/old/sync_live.py
@@ -17,7 +17,7 @@
 # from browser_use import Agent
 # from browser_use.browser.browser import BrowserSession
 # from browser_use.sync.service import CloudSync
-# from tests.ci.mocks import create_mock_llm
+# from tests.ci.conftest import create_mock_llm
 # logger = logging.getLogger(__name__)
@@ -77,11 +77,11 @@
 # Environment variables required:
 # - RUN_LIVE_TESTS=1 (to enable the test)
-# - BROWSER_USE_CLOUD_URL (optional, defaults to https://cloud.browser-use.com)
+# - BROWSER_USE_CLOUD_API_URL (optional, defaults to https://cloud.browser-use.com)
 # """
 # # Configuration
-# backend_url = os.getenv('BROWSER_USE_CLOUD_URL', 'http://localhost:8000')
+# backend_url = os.getenv('BROWSER_USE_CLOUD_API_URL', 'http://localhost:8000')
 # logger.info(f'Running live integration test against: {backend_url}')
 # # Create mock LLM
@@ -89,7 +89,7 @@
 # # Set environment variables for cloud sync
 # os.environ['BROWSERUSE_CLOUD_SYNC'] = 'true'
-# os.environ['BROWSER_USE_CLOUD_URL'] = backend_url
+# os.environ['BROWSER_USE_CLOUD_API_URL'] = backend_url
 # # Create browser session with real profile
 # browser_session = BrowserSession(
@@ -147,7 +147,7 @@
 # This is a simpler test that just verifies event sending works.
 # """
-# backend_url = os.getenv('BROWSER_USE_CLOUD_URL', 'http://localhost:8000')
+# backend_url = os.getenv('BROWSER_USE_CLOUD_API_URL', 'http://localhost:8000')
 # logger.info(f'Testing cloud sync against: {backend_url}')
 # # Create cloud sync service
diff --git a/tests/test_action_filters.py b/tests/old/test_action_filters.py
similarity index 100%
rename from tests/test_action_filters.py
rename to tests/old/test_action_filters.py
diff --git a/tests/test_agent_actions.py b/tests/old/test_agent_actions.py
similarity index 97%
rename from tests/test_agent_actions.py
rename to tests/old/test_agent_actions.py
index bdd276b16..721be1e41 100644
--- a/tests/test_agent_actions.py
+++ b/tests/old/test_agent_actions.py
@@ -25,10 +25,10 @@ def llm():
 @pytest.fixture
 async def browser_session():
-	browser_session = BrowserSession(
-		headless=True,
-		user_data_dir=None,
-	)
+	from browser_use.browser.profile import BrowserProfile
+
+	profile = BrowserProfile(headless=True, user_data_dir=None)
+	browser_session = BrowserSession(browser_profile=profile)
 	await browser_session.start()
 	yield browser_session
 	await browser_session.stop()
diff --git a/tests/test_clicks.py b/tests/old/test_clicks.py
similarity index 100%
rename from tests/test_clicks.py
rename to tests/old/test_clicks.py
diff --git a/tests/test_core_functionality.py b/tests/old/test_core_functionality.py
similarity index 98%
rename from tests/test_core_functionality.py
rename to tests/old/test_core_functionality.py
index 438e39f6f..47ede5266 100644
--- a/tests/test_core_functionality.py
+++ b/tests/old/test_core_functionality.py
@@ -68,7 +68,10 @@ class TestCoreFunctionality:
 	@pytest.fixture(scope='module')
 	async def browser_session(self):
 		"""Create and provide a BrowserSession instance with security disabled."""
-		browser_session = BrowserSession(headless=True, user_data_dir=None)
+		from browser_use.browser.profile import BrowserProfile
+
+		profile = BrowserProfile(headless=True, user_data_dir=None)
+		browser_session = BrowserSession(browser_profile=profile)
 		yield browser_session
 		await browser_session.kill()
diff --git a/tests/test_dropdown.py b/tests/old/test_dropdown.py
similarity index 100%
rename from tests/test_dropdown.py
rename to tests/old/test_dropdown.py
diff --git a/tests/test_dropdown_complex.py b/tests/old/test_dropdown_complex.py
similarity index 100%
rename from tests/test_dropdown_complex.py
rename to tests/old/test_dropdown_complex.py
diff --git a/tests/test_dropdown_error.py b/tests/old/test_dropdown_error.py
similarity index 100%
rename from tests/test_dropdown_error.py
rename to tests/old/test_dropdown_error.py
diff --git a/tests/test_excluded_actions.py b/tests/old/test_excluded_actions.py
similarity index 92%
rename from tests/test_excluded_actions.py
rename to tests/old/test_excluded_actions.py
index 193a05a28..ad43265ea 100644
--- a/tests/test_excluded_actions.py
+++ b/tests/old/test_excluded_actions.py
@@ -21,10 +21,10 @@ class MockLLM:
 @pytest.fixture(scope='module')
 async def browser_session():
-	browser_session = BrowserSession(
-		headless=True,
-		user_data_dir=None,
-	)
+	from browser_use.browser.profile import BrowserProfile
+
+	profile = BrowserProfile(headless=True, user_data_dir=None)
+	browser_session = BrowserSession(browser_profile=profile)
 	await browser_session.start()
 	yield browser_session
 	await browser_session.stop()
diff --git a/tests/test_full_screen.py b/tests/old/test_full_screen.py
similarity index 100%
rename from tests/test_full_screen.py
rename to tests/old/test_full_screen.py
diff --git a/tests/test_gif_path.py b/tests/old/test_gif_path.py
similarity index 100%
rename from tests/test_gif_path.py
rename to tests/old/test_gif_path.py
diff --git a/tests/test_google_sheets_real.py b/tests/old/test_google_sheets_real.py
similarity index 100%
rename from tests/test_google_sheets_real.py
rename to tests/old/test_google_sheets_real.py
diff --git a/tests/test_mind2web.py b/tests/old/test_mind2web.py
similarity index 100%
rename from tests/test_mind2web.py
rename to tests/old/test_mind2web.py
diff --git a/tests/test_models.py b/tests/old/test_models.py
similarity index 100%
rename from tests/test_models.py
rename to tests/old/test_models.py
diff --git a/tests/test_qwen.py b/tests/old/test_qwen.py
similarity index 100%
rename from tests/test_qwen.py
rename to tests/old/test_qwen.py
diff --git a/tests/test_react_dropdown.py b/tests/old/test_react_dropdown.py
similarity index 100%
rename from tests/test_react_dropdown.py
rename to tests/old/test_react_dropdown.py
diff --git a/tests/test_self_registered_actions.py b/tests/old/test_self_registered_actions.py
similarity index 100%
rename from tests/test_self_registered_actions.py
rename to tests/old/test_self_registered_actions.py
diff --git a/tests/test_vision.py b/tests/old/test_vision.py
similarity index 89%
rename from tests/test_vision.py
rename to tests/old/test_vision.py
index 7886377bc..f8affde31 100644
--- a/tests/test_vision.py
+++ b/tests/old/test_vision.py
@@ -40,7 +40,10 @@ async def done(text: str) -> str:
 @pytest.mark.skip(reason='this is for local testing only')
 async def test_vision():
-	browser_session = BrowserSession(headless=True, user_data_dir=None)
+	from browser_use.browser.profile import BrowserProfile
+
+	profile = BrowserProfile(headless=True, user_data_dir=None)
+	browser_session = BrowserSession(browser_profile=profile)
 	await browser_session.start()
 	try:
 		agent = Agent(
diff --git a/tests/test_wait_for_element.py b/tests/old/test_wait_for_element.py
similarity index 100%
rename from tests/test_wait_for_element.py
rename to tests/old/test_wait_for_element.py
diff --git a/tests/test_action_params.py b/tests/test_action_params.py
deleted file mode 100644
index d8f051065..000000000
--- a/tests/test_action_params.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import asyncio
-import logging
-from inspect import signature
-
-import pytest
-from pydantic import BaseModel, Field
-
-from browser_use.browser import BrowserSession
-from browser_use.controller.registry.service import Registry
-from browser_use.controller.registry.views import ActionModel
-
-# Configure logging
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger(__name__)
-
-
-# Test model - renamed to avoid pytest collection warnings
-class TestActionParamsModel(ActionModel):
-	value: str = Field(description='Test value')
-
-
-# Our Context type for the Registry - renamed to avoid pytest collection warnings
-class TestContextHelper:
-	def __init__(self, value):
-		self.value = value
-
-
-@pytest.mark.asyncio
-async def test_registry_param_handling():
-	"""Test how Registry handles parameter passing for different function signatures."""
-	# Create a Registry instance
-	registry = Registry[TestContextHelper]()
-
-	# Create test functions with different signatures
-
-	# 1.
Function with browser_session as a positional parameter - @registry.action('Test action with browser_session', param_model=TestActionParamsModel) - async def action_with_browser_session(params: TestActionParamsModel, browser_session: BrowserSession): - logger.debug(f'action_with_browser_session called with params={params}, browser_session={browser_session}') - return {'params': params.model_dump(), 'has_browser': browser_session is not None} - - # 2. Function with browser_session in the model - class ModelWithBrowserSession(BaseModel): - value: str - browser_session: BrowserSession = None - - @registry.action('Test action with browser_session in model') - async def action_with_browser_in_model(params: ModelWithBrowserSession): - logger.debug(f'action_with_browser_in_model called with params={params}') - return {'params': params.model_dump(), 'has_browser': params.browser_session is not None} - - # 3. Function using **kwargs - @registry.action('Test action with kwargs') - async def action_with_kwargs(params: TestActionParamsModel, **kwargs): - logger.debug(f'action_with_kwargs called with params={params}, kwargs={kwargs}') - return {'params': params.model_dump(), 'kwargs': kwargs} - - # Create a mock browser session - mock_browser_session = object() # Just a placeholder - - # Execute the actions - logger.debug('\n\n=== Testing action_with_browser_session ===') - result1 = await registry.execute_action( - 'action_with_browser_session', {'value': 'test1'}, browser_session=mock_browser_session - ) - logger.debug(f'Result: {result1}') - - logger.debug('\n\n=== Testing action_with_browser_in_model ===') - result2 = await registry.execute_action( - 'action_with_browser_in_model', - {'value': 'test2', 'browser_session': None}, # Browser session in model is None - browser_session=mock_browser_session, # Browser session in execute_action is provided - ) - logger.debug(f'Result: {result2}') - - logger.debug('\n\n=== Testing action_with_kwargs ===') - result3 = await registry.execute_action('action_with_kwargs', {'value': 'test3'}, browser_session=mock_browser_session) - logger.debug(f'Result: {result3}') - - # Print all signatures - logger.debug('\n\n=== Function Signatures ===') - logger.debug(f'action_with_browser_session: {signature(action_with_browser_session)}') - logger.debug(f'action_with_browser_in_model: {signature(action_with_browser_in_model)}') - logger.debug(f'action_with_kwargs: {signature(action_with_kwargs)}') - - return result1, result2, result3 - - -if __name__ == '__main__': - # Run the test - asyncio.run(test_registry_param_handling()) diff --git a/tests/test_service.py b/tests/test_service.py deleted file mode 100644 index dd7a3cbe3..000000000 --- a/tests/test_service.py +++ /dev/null @@ -1,333 +0,0 @@ -from unittest.mock import AsyncMock, MagicMock, Mock, patch - -import pytest -from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.messages import HumanMessage -from pydantic import BaseModel - -from browser_use.agent.service import Agent -from browser_use.agent.views import ActionResult -from browser_use.browser import BrowserSession -from browser_use.browser.views import BrowserStateSummary -from browser_use.controller.registry.service import Registry -from browser_use.controller.registry.views import ActionModel -from browser_use.controller.service import Controller - -# run with python -m pytest tests/test_service.py - - -# run test with: -# python -m pytest tests/test_service.py -class TestAgent: - @pytest.fixture - def 
mock_controller(self): - controller = Mock(spec=Controller) - registry = Mock(spec=Registry) - registry.registry = MagicMock() - registry.registry.actions = {'test_action': MagicMock(param_model=MagicMock())} # type: ignore - controller.registry = registry - return controller - - @pytest.fixture - def mock_llm(self): - return Mock(spec=BaseChatModel) - - @pytest.fixture - def mock_browser_session(self): - return Mock(spec=BrowserSession) - - def test_convert_initial_actions(self, mock_controller, mock_llm, mock_browser_session): # type: ignore - """ - Test that the _convert_initial_actions method correctly converts - dictionary-based actions to ActionModel instances. - - This test ensures that: - 1. The method processes the initial actions correctly. - 2. The correct param_model is called with the right parameters. - 3. The ActionModel is created with the validated parameters. - 4. The method returns a list of ActionModel instances. - """ - # Arrange - agent = Agent(task='Test task', llm=mock_llm, controller=mock_controller, browser_session=mock_browser_session) - initial_actions = [{'test_action': {'param1': 'value1', 'param2': 'value2'}}] - - # Mock the ActionModel - mock_action_model = MagicMock(spec=ActionModel) - mock_action_model_instance = MagicMock() - mock_action_model.return_value = mock_action_model_instance - agent.ActionModel = mock_action_model # type: ignore - - # Act - result = agent._convert_initial_actions(initial_actions) - - # Assert - assert len(result) == 1 - mock_controller.registry.registry.actions['test_action'].param_model.assert_called_once_with( # type: ignore - param1='value1', param2='value2' - ) - mock_action_model.assert_called_once() - assert isinstance(result[0], MagicMock) - assert result[0] == mock_action_model_instance - - # Check that the ActionModel was called with the correct parameters - call_args = mock_action_model.call_args[1] - assert 'test_action' in call_args - assert call_args['test_action'] == mock_controller.registry.registry.actions['test_action'].param_model.return_value # type: ignore - - async def test_step_error_handling(self): - """ - Test the error handling in the step method of the Agent class. - This test simulates a failure in the get_next_action method and - checks if the error is properly handled and recorded. 
- """ - # Mock the LLM - mock_llm = MagicMock(spec=BaseChatModel) - - # Mock the MessageManager - with patch('browser_use.agent.service.MessageManager') as mock_message_manager: - # Create an Agent instance with mocked dependencies - agent = Agent(task='Test task', llm=mock_llm) - - # Mock the get_next_action method to raise an exception - agent.get_next_action = AsyncMock(side_effect=ValueError('Test error')) - - # Mock the browser_session - agent.browser_session = AsyncMock() - agent.browser_session.get_state_summary = AsyncMock( - return_value=BrowserStateSummary( - url='https://example.com', - title='Example', - element_tree=MagicMock(), # Mocked element tree - tabs=[], - selector_map={}, - screenshot='', - ) - ) - - # Mock the controller - agent.controller = AsyncMock() - - # Call the step method - await agent.step() - - # Assert that the error was handled and recorded - assert agent.consecutive_failures == 1 - assert len(agent._last_result) == 1 - assert isinstance(agent._last_result[0], ActionResult) - assert 'Test error' in agent._last_result[0].error - assert agent._last_result[0].include_in_memory is True - - -class TestRegistry: - @pytest.fixture - def registry_with_excludes(self): - return Registry(exclude_actions=['excluded_action']) - - def test_action_decorator_with_excluded_action(self, registry_with_excludes): - """ - Test that the action decorator does not register an action - if it's in the exclude_actions list. - """ - - # Define a function to be decorated - def excluded_action(): - pass - - # Apply the action decorator - decorated_func = registry_with_excludes.action(description='This should be excluded')(excluded_action) - - # Assert that the decorated function is the same as the original - assert decorated_func == excluded_action - - # Assert that the action was not added to the registry - assert 'excluded_action' not in registry_with_excludes.registry.actions - - # Define another function that should be included - def included_action(): - pass - - # Apply the action decorator to an included action - registry_with_excludes.action(description='This should be included')(included_action) - - # Assert that the included action was added to the registry - assert 'included_action' in registry_with_excludes.registry.actions - - async def test_execute_action_with_and_without_browser_context(self): - """ - Test that the execute_action method correctly handles actions with and without a browser context. - This test ensures that: - 1. An action requiring a browser context is executed correctly. - 2. An action not requiring a browser context is executed correctly. - 3. The browser context is passed to the action function when required. - 4. The action function receives the correct parameters. - 5. The method raises an error when a browser context is required but not provided. 
- """ - registry = Registry() - - # Define a mock action model - class TestActionModel(BaseModel): - param1: str - - # Define mock action functions - async def test_action_with_browser(param1: str, browser): - return f'Action executed with {param1} and browser' - - async def test_action_without_browser(param1: str): - return f'Action executed with {param1}' - - # Register the actions - registry.registry.actions['test_action_with_browser'] = MagicMock( - function=AsyncMock(side_effect=test_action_with_browser), - param_model=TestActionModel, - description='Test action with browser', - ) - - registry.registry.actions['test_action_without_browser'] = MagicMock( - function=AsyncMock(side_effect=test_action_without_browser), - param_model=TestActionModel, - description='Test action without browser', - ) - - # Mock BrowserContext - mock_browser = MagicMock() - - # Execute the action with a browser context - result_with_browser = await registry.execute_action( - 'test_action_with_browser', {'param1': 'test_value'}, browser=mock_browser - ) - assert result_with_browser == 'Action executed with test_value and browser' - - # Execute the action without a browser context - result_without_browser = await registry.execute_action('test_action_without_browser', {'param1': 'test_value'}) - assert result_without_browser == 'Action executed with test_value' - - # Test error when browser is required but not provided - with pytest.raises(RuntimeError, match='Action test_action_with_browser requires browser but none provided'): - await registry.execute_action('test_action_with_browser', {'param1': 'test_value'}) - - # Verify that the action functions were called with correct parameters - registry.registry.actions['test_action_with_browser'].function.assert_called_once_with( - param1='test_value', browser=mock_browser - ) - registry.registry.actions['test_action_without_browser'].function.assert_called_once_with(param1='test_value') - - -class TestAgentRetry: - @pytest.fixture - def mock_llm(self): - return AsyncMock() - - @pytest.fixture - def mock_controller(self): - controller = Mock() - controller.registry = Mock() - controller.registry.registry = Mock() - controller.registry.registry.actions = {} - return controller - - @pytest.fixture - def mock_browser_session(self): - browser_session = Mock() - browser_session.get_state_summary = AsyncMock( - return_value=BrowserStateSummary( - url='https://parabank.parasoft.com/parabank/index.htm', - title='ParaBank', - element_tree=MagicMock(), - tabs=[], - selector_map={}, - screenshot='', - ) - ) - return browser_session - - @pytest.fixture - def mock_action_model(self): - action_model = Mock(spec=ActionModel) - return action_model - - @pytest.mark.asyncio - async def test_step_empty_action_retry(self, mock_llm, mock_controller, mock_browser_session, mock_action_model): - """ - Test that the step method retries and handles empty actions correctly. 
- """ - # Arrange - agent = Agent( - task='Test task', - llm=mock_llm, - controller=mock_controller, - browser_session=mock_browser_session, - ) - agent.ActionModel = mock_action_model # Inject the mock ActionModel - - # Mock get_next_action to return empty action the first time, then a valid action - empty_model_output = MagicMock() - empty_model_output.action = [] # Empty action - valid_model_output = MagicMock() - valid_action = MagicMock() - valid_model_output.action = [valid_action] - - mock_llm.return_value.invoke.side_effect = [empty_model_output, valid_model_output] - agent.get_next_action = mock_llm.return_value.invoke - - # Act - await agent.step() - - # Assert - # Check that get_next_action was called twice (initial call + retry) - assert agent.get_next_action.call_count == 2 - # Check that the LLM was called twice - assert mock_llm.return_value.invoke.call_count == 2 - - # Check that the second call to get_next_action included the clarification message - _, retry_messages = mock_llm.return_value.invoke.call_args_list[1] - assert len(retry_messages[0]) == 2 # input_messages + clarification message - assert isinstance(retry_messages[0][1], HumanMessage) - assert 'You forgot to return an action' in retry_messages[0][1].content - - # Check that _last_result contains the valid action - assert len(agent._last_result) == 1 - assert agent._last_result[0].action == valid_action - - @pytest.mark.asyncio - async def test_step_empty_action_retry_and_fail(self, mock_llm, mock_controller, mock_browser_session, mock_action_model): - """ - Test that the step method handles the case where get_next_action returns - empty actions twice, and inserts a safe noop action. - """ - # Arrange - agent = Agent( - task='Test task', - llm=mock_llm, - controller=mock_controller, - browser_session=mock_browser_session, - ) - agent.ActionModel = mock_action_model # Inject the mock ActionModel - - # Mock get_next_action to return empty action both times - empty_model_output = MagicMock() - empty_model_output.action = [] # Empty action - mock_llm.return_value.invoke.return_value = empty_model_output - agent.get_next_action = mock_llm.return_value.invoke - - # Mock the ActionModel instance creation - mock_action_instance = MagicMock() - mock_action_model.return_value = mock_action_instance - - # Act - await agent.step() - - # Assert - # Check that get_next_action was called twice - assert agent.get_next_action.call_count == 2 - # Check that the LLM was called twice - assert mock_llm.return_value.invoke.call_count == 2 - - # Check that ActionModel was instantiated with the noop action - mock_action_model.assert_called_once() - call_args = mock_action_model.call_args[1] - assert 'done' in call_args - assert call_args['done'] == {'success': False, 'text': 'No action returned, safe exit.'} - - # Check that _last_result contains the noop action - assert len(agent._last_result) == 1 - assert agent._last_result[0].action == mock_action_instance diff --git a/tests/test_stress.py b/tests/test_stress.py deleted file mode 100644 index 1b00bb3dc..000000000 --- a/tests/test_stress.py +++ /dev/null @@ -1,101 +0,0 @@ -import os -import random -import string -import time - -import pytest -from langchain_openai import AzureChatOpenAI -from pydantic import SecretStr - -from browser_use.agent.service import Agent -from browser_use.browser import BrowserProfile, BrowserSession -from browser_use.controller.service import Controller - - -@pytest.fixture -async def browser_session(): - browser_session = BrowserSession( - 
browser_profile=BrowserProfile( - headless=True, - ) - ) - await browser_session.start() - yield browser_session - await browser_session.stop() - - -@pytest.fixture -def llm(): - """Initialize the language model""" - model = AzureChatOpenAI( - api_version='2024-10-21', - model='gpt-4o', - azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''), - api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')), - ) - return model - - -def generate_random_text(length: int) -> str: - """Generate random text of specified length""" - return ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length)) - - -@pytest.fixture -async def controller(): - """Initialize the controller""" - controller = Controller() - large_text = generate_random_text(10000) - - @controller.action('call this magical function to get very special text') - def get_very_special_text(): - return large_text - - yield controller - - -async def test_token_limit_with_multiple_extractions(llm, controller, browser_session): - """Test handling of multiple smaller extractions accumulating tokens""" - agent = Agent( - task='Call the magical function to get very special text 5 times', - llm=llm, - controller=controller, - browser_session=browser_session, - max_input_tokens=2000, - save_conversation_path='tmp/stress_test/test_token_limit_with_multiple_extractions.json', - ) - - history = await agent.run(max_steps=5) - - # check if 5 times called get_special_text - calls = [a for a in history.action_names() if a == 'get_very_special_text'] - assert len(calls) == 5 - # check the message history should be max 3 messages - assert len(agent.message_manager.history.messages) > 3 - - -@pytest.mark.slow -@pytest.mark.parametrize('max_tokens', [4000]) # 8000 20000 -async def test_open_3_tabs_and_extract_content(llm, controller, browser_session, max_tokens): - """Stress test: Open 3 tabs with urls and extract content""" - agent = Agent( - task='Open 3 tabs with https://en.wikipedia.org/wiki/Internet and extract the content from each.', - llm=llm, - controller=controller, - browser_session=browser_session, - max_input_tokens=max_tokens, - save_conversation_path='tmp/stress_test/test_open_3_tabs_and_extract_content.json', - ) - start_time = time.time() - history = await agent.run(max_steps=7) - end_time = time.time() - - total_time = end_time - start_time - - print(f'Total time: {total_time:.2f} seconds') - # Check for errors - errors = history.errors() - assert len(errors) == 0, 'Errors occurred during the test' - # check if 3 tabs were opened - current_state = await browser_session.get_state_summary() - assert len(current_state.tabs) >= 3, '3 tabs were not opened'