Merge branch 'main' into local-remote-split

2026-05-06 17:52:15 +02:00 · 2025-08-04 19:44:05 -07:00
parent cdf941b0e8 3cf6811d54
commit 032f314a41
22 changed files with 359 additions and 232 deletions
--- a/browser_use/agent/cloud_events.py
+++ b/browser_use/agent/cloud_events.py
@@ -37,7 +37,7 @@ class UpdateAgentTaskEvent(BaseEvent):
 		if not hasattr(agent, '_task_start_time'):
 			raise ValueError('Agent must have _task_start_time attribute')

-		done_output = agent.state.history.final_result() if agent.state.history else None
+		done_output = agent.history.final_result() if agent.history else None
 		return cls(
 			id=str(agent.task_id),
 			user_id='',  # To be filled by cloud handler
@@ -47,7 +47,7 @@ class UpdateAgentTaskEvent(BaseEvent):
 			stopped=agent.state.stopped if hasattr(agent.state, 'stopped') else False,
 			paused=agent.state.paused if hasattr(agent.state, 'paused') else False,
 			done_output=done_output,
-			finished_at=datetime.now(timezone.utc) if agent.state.history and agent.state.history.is_done() else None,
+			finished_at=datetime.now(timezone.utc) if agent.history and agent.history.is_done() else None,
 			agent_state=agent.state.model_dump() if hasattr(agent.state, 'model_dump') else {},
 			user_feedback_type=None,
 			user_comment=None,
--- a/browser_use/agent/gif.py
+++ b/browser_use/agent/gif.py
@@ -61,28 +61,22 @@ def create_history_gif(
 		logger.warning('No history to create GIF from')
 		return

+	# Get all screenshots from history (including None placeholders)
+	screenshots = history.screenshots(return_none_if_not_screenshot=True)
+
+	if not screenshots:
+		logger.warning('No screenshots found in history')
+		return
+
 	# Find the first non-placeholder screenshot
 	# A screenshot is considered a placeholder if:
 	# 1. It's the exact 4px placeholder for about:blank pages, OR
 	# 2. It comes from a new tab page (chrome://newtab/, about:blank, etc.)
 	first_real_screenshot = None
-	for item in history.history:
-		if not item.state.screenshot:
-			continue
-
-		# Skip exact placeholder screenshots
-		if item.state.screenshot == PLACEHOLDER_4PX_SCREENSHOT:
-			continue
-
-		# Skip screenshots from new tab pages
-		from browser_use.utils import is_new_tab_page
-
-		if is_new_tab_page(item.state.url):
-			continue
-
-		# This is a real screenshot from actual web content
-		first_real_screenshot = item.state.screenshot
-		break
+	for screenshot in screenshots:
+		if screenshot and screenshot != PLACEHOLDER_4PX_SCREENSHOT:
+			first_real_screenshot = screenshot
+			break

 	if not first_real_screenshot:
 		logger.warning('No valid screenshots found (all are placeholders or from new tab pages)')
@@ -145,8 +139,9 @@ def create_history_gif(
 		# Find the first non-placeholder screenshot for the task frame
 		first_real_screenshot = None
 		for item in history.history:
-			if item.state.screenshot and item.state.screenshot != PLACEHOLDER_4PX_SCREENSHOT:
-				first_real_screenshot = item.state.screenshot
+			screenshot_b64 = item.state.get_screenshot()
+			if screenshot_b64 and screenshot_b64 != PLACEHOLDER_4PX_SCREENSHOT:
+				first_real_screenshot = screenshot_b64
 				break

 		if first_real_screenshot:
@@ -162,14 +157,14 @@ def create_history_gif(
 		else:
 			logger.warning('No real screenshots found for task frame, skipping task frame')

-	# Process each history item
-	for i, item in enumerate(history.history, 1):
-		if not item.state.screenshot:
+	# Process each history item with its corresponding screenshot
+	for i, (item, screenshot) in enumerate(zip(history.history, screenshots), 1):
+		if not screenshot:
 			continue

 		# Skip placeholder screenshots from about:blank pages
 		# These are 4x4 white PNGs encoded as a specific base64 string
-		if item.state.screenshot == PLACEHOLDER_4PX_SCREENSHOT:
+		if screenshot == PLACEHOLDER_4PX_SCREENSHOT:
 			logger.debug(f'Skipping placeholder screenshot from about:blank page at step {i}')
 			continue

@@ -181,7 +176,7 @@ def create_history_gif(
 			continue

 		# Convert base64 screenshot to PIL Image
-		img_data = base64.b64decode(item.state.screenshot)
+		img_data = base64.b64decode(screenshot)
 		image = Image.open(io.BytesIO(img_data))

 		if show_goals and item.model_output:
--- a/browser_use/agent/message_manager/service.py
+++ b/browser_use/agent/message_manager/service.py
@@ -9,7 +9,6 @@ from browser_use.agent.message_manager.views import (
 from browser_use.agent.prompts import AgentMessagePrompt
 from browser_use.agent.views import (
 	ActionResult,
-	AgentHistoryList,
 	AgentOutput,
 	AgentStepInfo,
 	MessageManagerState,
@@ -104,10 +103,8 @@ class MessageManager:
 		state: MessageManagerState = MessageManagerState(),
 		use_thinking: bool = True,
 		include_attributes: list[str] | None = None,
-		message_context: str | None = None,
 		sensitive_data: dict[str, str | dict[str, str]] | None = None,
 		max_history_items: int | None = None,
-		images_per_step: int = 1,
 		vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
 		include_tool_call_examples: bool = False,
 	):
@@ -118,7 +115,6 @@ class MessageManager:
 		self.sensitive_data_description = ''
 		self.use_thinking = use_thinking
 		self.max_history_items = max_history_items
-		self.images_per_step = images_per_step
 		self.vision_detail_level = vision_detail_level
 		self.include_tool_call_examples = include_tool_call_examples

@@ -126,7 +122,6 @@ class MessageManager:

 		# Store settings as direct attributes instead of in a settings object
 		self.include_attributes = include_attributes or []
-		self.message_context = message_context
 		self.sensitive_data = sensitive_data
 		self.last_input_messages = []
 		# Only initialize messages if state is empty
@@ -260,7 +255,6 @@ class MessageManager:
 		use_vision=True,
 		page_filtered_actions: str | None = None,
 		sensitive_data=None,
-		agent_history_list: AgentHistoryList | None = None,  # Pass AgentHistoryList from agent
 		available_file_paths: list[str] | None = None,  # Always pass current available_file_paths
 	) -> None:
 		"""Add browser state as human message"""
@@ -269,14 +263,8 @@ class MessageManager:
 		if sensitive_data:
 			self.sensitive_data_description = self._get_sensitive_data_description(browser_state_summary.url)

-		# Extract previous screenshots if we need more than 1 image and have agent history
+		# Use only the current screenshot
 		screenshots = []
-		if agent_history_list and self.images_per_step > 1:
-			# Get previous screenshots and filter out None values
-			raw_screenshots = agent_history_list.screenshots(n_last=self.images_per_step - 1, return_none_if_not_screenshot=False)
-			screenshots = [s for s in raw_screenshots if s is not None]
-
-		# add current screenshot to the end
 		if browser_state_summary.screenshot:
 			screenshots.append(browser_state_summary.screenshot)

--- a/browser_use/agent/prompts.py
+++ b/browser_use/agent/prompts.py
@@ -108,36 +108,6 @@ class AgentMessagePrompt:
 		self.vision_detail_level = vision_detail_level
 		assert self.browser_state

-	@observe_debug(ignore_input=True, ignore_output=True, name='_deduplicate_screenshots')
-	def _deduplicate_screenshots(self, screenshots: list[str]) -> list[str]:
-		"""
-		Remove consecutive duplicate screenshots, keeping only the most recent of each.
-
-		Args:
-			screenshots: List of base64-encoded screenshot strings in chronological order (oldest first)
-
-		Returns:
-			List of screenshots with consecutive duplicates removed, maintaining chronological order
-		"""
-		if not screenshots:
-			return []
-
-		if len(screenshots) == 1:
-			return screenshots
-
-		# Keep track of unique screenshots by comparing each with the next one
-		unique_screenshots = []
-
-		for i in range(len(screenshots)):
-			# Always keep the last screenshot
-			if i == len(screenshots) - 1:
-				unique_screenshots.append(screenshots[i])
-			# Only keep screenshot if it's different from the next one
-			elif screenshots[i] != screenshots[i + 1]:
-				unique_screenshots.append(screenshots[i])
-
-		return unique_screenshots
-
 	@observe_debug(ignore_input=True, ignore_output=True, name='_get_browser_state_description')
 	def _get_browser_state_description(self) -> str:
 		elements_text = self.browser_state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)
@@ -277,12 +247,9 @@ Available tabs:
 			# Start with text description
 			content_parts: list[ContentPartTextParam | ContentPartImageParam] = [ContentPartTextParam(text=state_description)]

-			# Deduplicate screenshots, keeping only the most recent of each unique image
-			unique_screenshots = self._deduplicate_screenshots(self.screenshots)
-
 			# Add screenshots with labels
-			for i, screenshot in enumerate(unique_screenshots):
-				if i == len(unique_screenshots) - 1:
+			for i, screenshot in enumerate(self.screenshots):
+				if i == len(self.screenshots) - 1:
 					label = 'Current screenshot:'
 				else:
 					# Use simple, accurate labeling since we don't have actual step timing info
@@ -302,6 +269,6 @@ Available tabs:
 					)
 				)

-			return UserMessage(content=content_parts)
+			return UserMessage(content=content_parts, cache=True)

-		return UserMessage(content=state_description)
+		return UserMessage(content=state_description, cache=True)
--- a/browser_use/agent/service.py
+++ b/browser_use/agent/service.py
@@ -3,7 +3,6 @@ import gc
 import inspect
 import json
 import logging
-import os
 import re
 import sys
 import tempfile
@@ -163,7 +162,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		override_system_message: str | None = None,
 		extend_system_message: str | None = None,
 		validate_output: bool = False,
-		message_context: str | None = None,
 		generate_gif: bool | str = False,
 		available_file_paths: list[str] | None = None,
 		include_attributes: list[str] = DEFAULT_INCLUDE_ATTRIBUTES,
@@ -171,7 +169,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		use_thinking: bool = True,
 		flash_mode: bool = False,
 		max_history_items: int = 40,
-		images_per_step: int = 1,
 		page_extraction_llm: BaseChatModel | None = None,
 		planner_llm: BaseChatModel | None = None,  # Deprecated
 		planner_interval: int = 1,  # Deprecated
@@ -253,14 +250,12 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 			override_system_message=override_system_message,
 			extend_system_message=extend_system_message,
 			validate_output=validate_output,
-			message_context=message_context,
 			generate_gif=generate_gif,
 			include_attributes=include_attributes,
 			max_actions_per_step=max_actions_per_step,
 			use_thinking=use_thinking,
 			flash_mode=flash_mode,
 			max_history_items=max_history_items,
-			images_per_step=images_per_step,
 			page_extraction_llm=page_extraction_llm,
 			planner_llm=None,  # Always None now (deprecated)
 			planner_interval=1,  # Always 1 now (deprecated)
@@ -281,8 +276,19 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		# Initialize state
 		self.state = injected_agent_state or AgentState()

-		# Initialize file system
+		# Initialize history
+		self.history = AgentHistoryList(history=[], usage=None)
+
+		# Initialize agent directory
+		import time
+
+		timestamp = int(time.time())
+		base_tmp = Path(tempfile.gettempdir())
+		self.agent_directory = base_tmp / f'browser_use_agent_{self.id}_{timestamp}'
+
+		# Initialize file system and screenshot service
 		self._set_file_system(file_system_path)
+		self._set_screenshot_service()

 		# Action setup
 		self._setup_action_models()
@@ -334,10 +340,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 			use_thinking=self.settings.use_thinking,
 			# Settings that were previously in MessageManagerSettings
 			include_attributes=self.settings.include_attributes,
-			message_context=self.settings.message_context,
 			sensitive_data=sensitive_data,
 			max_history_items=self.settings.max_history_items,
-			images_per_step=self.settings.images_per_step,
 			vision_detail_level=self.settings.vision_detail_level,
 			include_tool_call_examples=self.settings.include_tool_call_examples,
 		)
@@ -562,10 +566,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 				self.file_system = FileSystem(file_system_path)
 				self.file_system_path = file_system_path
 			else:
-				# create a temporary file system using agent ID
-				base_tmp = tempfile.gettempdir()  # e.g., /tmp on Unix
-				self.file_system_path = os.path.join(base_tmp, f'browser_use_agent_{self.id}')
-				self.file_system = FileSystem(self.file_system_path)
+				# Use the agent directory for file system
+				self.file_system = FileSystem(self.agent_directory)
+				self.file_system_path = str(self.agent_directory)
 		except Exception as e:
 			logger.error(f'💾 Failed to initialize file system: {e}.')
 			raise e
@@ -575,6 +578,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):

 		logger.info(f'💾 File system path: {self.file_system_path}')

+	def _set_screenshot_service(self) -> None:
+		"""Initialize screenshot service using agent directory"""
+		try:
+			from browser_use.screenshots.service import ScreenshotService
+
+			self.screenshot_service = ScreenshotService(self.agent_directory)
+			logger.info(f'📸 Screenshot service initialized in: {self.agent_directory}/screenshots')
+		except Exception as e:
+			logger.error(f'📸 Failed to initialize screenshot service: {e}.')
+			raise e
+
 	def save_file_system_state(self) -> None:
 		"""Save current file system state to agent state"""
 		if self.file_system:
@@ -583,9 +597,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 			logger.error('💾 File system is not set up. Cannot save state.')
 			raise ValueError('File system is not set up. Cannot save state.')

-	def _set_message_context(self) -> str | None:
-		return self.settings.message_context
-
 	def _set_browser_use_version_and_source(self, source_override: str | None = None) -> None:
 		"""Get the version from pyproject.toml and determine the source of the browser-use package"""
 		# Use the helper function for version detection
@@ -696,20 +707,20 @@ class Agent(Generic[Context, AgentStructuredOutput]):

 		assert self.browser_session is not None, 'BrowserSession is not set up'

-		self.logger.debug(f'🌐 Step {self.state.n_steps + 1}: Getting browser state...')
+		self.logger.debug(f'🌐 Step {self.state.n_steps}: Getting browser state...')
 		browser_state_summary = await self.browser_session.get_browser_state_with_recovery(
 			cache_clickable_elements_hashes=True, include_screenshot=self.settings.use_vision
 		)
 		current_page = await self.browser_session.get_current_page()

 		# Check for new downloads after getting browser state (catches PDF auto-downloads and previous step downloads)
-		await self._check_and_update_downloads(f'Step {self.state.n_steps + 1}: after getting browser state')
+		await self._check_and_update_downloads(f'Step {self.state.n_steps}: after getting browser state')

 		self._log_step_context(current_page, browser_state_summary)
 		await self._raise_if_stopped_or_paused()

 		# Update action models with page-specific actions
-		self.logger.debug(f'📝 Step {self.state.n_steps + 1}: Updating action models...')
+		self.logger.debug(f'📝 Step {self.state.n_steps}: Updating action models...')
 		await self._update_action_models_for_page(current_page)

 		# Get page-specific filtered actions
@@ -720,7 +731,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 			page_action_message = f'For this page, these additional actions are available:\n{page_filtered_actions}'
 			self._message_manager._add_message_with_type(UserMessage(content=page_action_message), 'consistent')

-		self.logger.debug(f'💬 Step {self.state.n_steps + 1}: Adding state message to context...')
+		self.logger.debug(f'💬 Step {self.state.n_steps}: Adding state message to context...')
 		self._message_manager.add_state_message(
 			browser_state_summary=browser_state_summary,
 			model_output=self.state.last_model_output,
@@ -729,7 +740,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 			use_vision=self.settings.use_vision,
 			page_filtered_actions=page_filtered_actions if page_filtered_actions else None,
 			sensitive_data=self.sensitive_data,
-			agent_history_list=self.state.history,  # Pass AgentHistoryList for screenshots
 			available_file_paths=self.available_file_paths,  # Always pass current available_file_paths
 		)

@@ -741,7 +751,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		"""Execute LLM interaction with retry logic and handle callbacks"""
 		input_messages = self._message_manager.get_messages()
 		self.logger.debug(
-			f'🤖 Step {self.state.n_steps + 1}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...'
+			f'🤖 Step {self.state.n_steps}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...'
 		)

 		try:
@@ -758,9 +768,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		# Check again for paused/stopped state after getting model output
 		await self._raise_if_stopped_or_paused()

-		# Increment step counter at the start of each step
-		self.state.n_steps += 1
-
 		# Handle callbacks and conversation saving
 		await self._handle_post_llm_processing(browser_state_summary, input_messages)

@@ -854,7 +861,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 			)

 			# Use _make_history_item like main branch
-			self._make_history_item(self.state.last_model_output, browser_state_summary, self.state.last_result, metadata)
+			await self._make_history_item(self.state.last_model_output, browser_state_summary, self.state.last_result, metadata)

 		# Log step completion summary
 		self._log_step_completion_summary(self.step_start_time, self.state.last_result)
@@ -877,6 +884,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 			)
 			self.eventbus.dispatch(step_event)

+		# Increment step counter after step is fully completed
+		self.state.n_steps += 1
+
 	async def _handle_final_step(self, step_info: AgentStepInfo | None = None) -> None:
 		"""Handle special processing for the last step"""
 		if step_info and step_info.is_last_step():
@@ -893,7 +903,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		"""Get model output with retry logic for empty actions"""
 		model_output = await self.get_model_output(input_messages)
 		self.logger.debug(
-			f'✅ Step {self.state.n_steps + 1}: Got LLM response with {len(model_output.action) if model_output.action else 0} actions'
+			f'✅ Step {self.state.n_steps}: Got LLM response with {len(model_output.action) if model_output.action else 0} actions'
 		)

 		if (
@@ -947,7 +957,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 				self.settings.save_conversation_path_encoding,
 			)

-	def _make_history_item(
+	async def _make_history_item(
 		self,
 		model_output: AgentOutput | None,
 		browser_state_summary: BrowserStateSummary,
@@ -961,12 +971,17 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		else:
 			interacted_elements = [None]

+		# Store screenshot and get path
+		screenshot_path = None
+		if browser_state_summary.screenshot:
+			screenshot_path = await self.screenshot_service.store_screenshot(browser_state_summary.screenshot, self.state.n_steps)
+
 		state_history = BrowserStateHistory(
 			url=browser_state_summary.url,
 			title=browser_state_summary.title,
 			tabs=browser_state_summary.tabs,
 			interacted_element=interacted_elements,
-			screenshot=browser_state_summary.screenshot,
+			screenshot_path=screenshot_path,
 		)

 		history_item = AgentHistory(
@@ -976,7 +991,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 			metadata=metadata,
 		)

-		self.state.history.history.append(history_item)
+		self.history.add_item(history_item)

 	def _remove_think_tags(self, text: str) -> str:
 		THINK_TAGS = re.compile(r'<think>.*?</think>', re.DOTALL)
@@ -1021,7 +1036,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		url_short = current_page.url[:50] + '...' if len(current_page.url) > 50 else current_page.url
 		interactive_count = len(browser_state_summary.selector_map) if browser_state_summary else 0
 		self.logger.info(
-			f'📍 Step {self.state.n_steps + 1}: Evaluating page with {interactive_count} interactive elements on: {url_short}'
+			f'📍 Step {self.state.n_steps}: Evaluating page with {interactive_count} interactive elements on: {url_short}'
 		)

 	def _log_next_action_summary(self, parsed: 'AgentOutput') -> None:
@@ -1094,7 +1109,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):

 		# Prepare action_history data correctly
 		action_history_data = []
-		for item in self.state.history.history:
+		for item in self.history.history:
 			if item.model_output and item.model_output.action:
 				# Convert each ActionModel in the step to its dictionary representation
 				step_actions = [
@@ -1107,7 +1122,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 				# Append None or [] if a step had no actions or no model output
 				action_history_data.append(None)

-		final_res = self.state.history.final_result()
+		final_res = self.history.final_result()
 		final_result_str = json.dumps(final_res) if final_res is not None else None

 		self.telemetry.capture(
@@ -1125,13 +1140,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 				cdp_url=urlparse(self.browser_session.cdp_url).hostname
 				if self.browser_session and self.browser_session.cdp_url
 				else None,
-				action_errors=self.state.history.errors(),
+				action_errors=self.history.errors(),
 				action_history=action_history_data,
-				urls_visited=self.state.history.urls(),
+				urls_visited=self.history.urls(),
 				steps=self.state.n_steps,
 				total_input_tokens=token_summary.prompt_tokens,
-				total_duration_seconds=self.state.history.total_duration_seconds(),
-				success=self.state.history.is_successful(),
+				total_duration_seconds=self.history.total_duration_seconds(),
+				success=self.history.is_successful(),
 				final_result_response=final_result_str,
 				error_message=agent_run_error,
 			)
@@ -1145,13 +1160,13 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		"""
 		await self.step(step_info)

-		if self.state.history.is_done():
+		if self.history.is_done():
 			await self.log_completion()
 			if self.register_done_callback:
 				if inspect.iscoroutinefunction(self.register_done_callback):
-					await self.register_done_callback(self.state.history)
+					await self.register_done_callback(self.history)
 				else:
-					self.register_done_callback(self.state.history)
+					self.register_done_callback(self.history)
 			return True, True

 		return False, False
@@ -1271,22 +1286,22 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 				if on_step_end is not None:
 					await on_step_end(self)

-				if self.state.history.is_done():
+				if self.history.is_done():
 					self.logger.debug(f'🎯 Task completed after {step + 1} steps!')
 					await self.log_completion()

 					if self.register_done_callback:
 						if inspect.iscoroutinefunction(self.register_done_callback):
-							await self.register_done_callback(self.state.history)
+							await self.register_done_callback(self.history)
 						else:
-							self.register_done_callback(self.state.history)
+							self.register_done_callback(self.history)

 					# Task completed
 					break
 			else:
 				agent_run_error = 'Failed to complete task in maximum steps'

-				self.state.history.history.append(
+				self.history.add_item(
 					AgentHistory(
 						model_output=None,
 						result=[ActionResult(error=agent_run_error, include_in_memory=True)],
@@ -1295,7 +1310,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 							title='',
 							tabs=[],
 							interacted_element=[],
-							screenshot=None,
+							screenshot_path=None,
 						),
 						metadata=None,
 					)
@@ -1304,23 +1319,23 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 				self.logger.info(f'❌ {agent_run_error}')

 			self.logger.debug('📊 Collecting usage summary...')
-			self.state.history.usage = await self.token_cost_service.get_usage_summary()
+			self.history.usage = await self.token_cost_service.get_usage_summary()

 			# set the model output schema and call it on the fly
-			if self.state.history._output_model_schema is None and self.output_model_schema is not None:
-				self.state.history._output_model_schema = self.output_model_schema
+			if self.history._output_model_schema is None and self.output_model_schema is not None:
+				self.history._output_model_schema = self.output_model_schema

 			self.logger.debug('🏁 Agent.run() completed successfully')
-			return self.state.history
+			return self.history

 		except KeyboardInterrupt:
 			# Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
 			self.logger.info('Got KeyboardInterrupt during execution, returning current history')
 			agent_run_error = 'KeyboardInterrupt'

-			self.state.history.usage = await self.token_cost_service.get_usage_summary()
+			self.history.usage = await self.token_cost_service.get_usage_summary()

-			return self.state.history
+			return self.history

 		except Exception as e:
 			self.logger.error(f'Agent run failed with exception: {e}', exc_info=True)
@@ -1359,7 +1374,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 				# Lazy import gif module to avoid heavy startup cost
 				from browser_use.agent.gif import create_history_gif

-				create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
+				create_history_gif(task=self.task, history=self.history, output_path=output_path)

 				# Only emit output file event if GIF was actually created
 				if Path(output_path).exists():
@@ -1484,7 +1499,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):

 	async def log_completion(self) -> None:
 		"""Log the completion of the task"""
-		if self.state.history.is_successful():
+		if self.history.is_successful():
 			self.logger.info('✅ Task completed successfully')
 		else:
 			self.logger.info('❌ Task completed without success')
@@ -1618,7 +1633,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		"""Save the history to a file"""
 		if not file_path:
 			file_path = 'AgentHistory.json'
-		self.state.history.save_to_file(file_path)
+		self.history.save_to_file(file_path)

 	async def wait_until_resumed(self):
 		await self._external_pause_event.wait()
@@ -1756,14 +1771,14 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 		timestamp = datetime.now().isoformat()

 		# Only declare variables that are used multiple times
-		structured_output = self.state.history.structured_output
+		structured_output = self.history.structured_output
 		structured_output_json = json.dumps(structured_output.model_dump()) if structured_output else None
-		final_result = self.state.history.final_result()
+		final_result = self.history.final_result()
 		git_info = get_git_info()
-		action_history = self.state.history.action_history()
-		action_errors = self.state.history.errors()
-		urls = self.state.history.urls()
-		usage = self.state.history.usage
+		action_history = self.history.action_history()
+		action_errors = self.history.errors()
+		urls = self.history.urls()
+		usage = self.history.usage

 		return {
 			'trace': {
@@ -1790,10 +1805,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 				'final_result_response_truncated': (
 					final_result[:20000] if final_result and len(final_result) > 20000 else final_result
 				),
-				'self_report_completed': 1 if self.state.history.is_done() else 0,
-				'self_report_success': 1 if self.state.history.is_successful() else 0,
-				'duration': self.state.history.total_duration_seconds(),
-				'steps_taken': self.state.history.number_of_steps(),
+				'self_report_completed': 1 if self.history.is_done() else 0,
+				'self_report_success': 1 if self.history.is_successful() else 0,
+				'duration': self.history.total_duration_seconds(),
+				'steps_taken': self.history.number_of_steps(),
 				'usage': json.dumps(usage.model_dump()) if usage else None,
 			},
 			'trace_details': {
@@ -1805,6 +1820,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 				# AgentHistoryList methods
 				'structured_output': structured_output_json,
 				'final_result_response': final_result,
-				'complete_history': _get_complete_history_without_screenshots(self.state.history.model_dump()),
+				'complete_history': _get_complete_history_without_screenshots(self.history.model_dump()),
 			},
 		}
--- a/browser_use/agent/views.py
+++ b/browser_use/agent/views.py
@@ -36,7 +36,6 @@ class AgentSettings(BaseModel):
 	max_failures: int = 3
 	retry_delay: int = 10
 	validate_output: bool = False
-	message_context: str | None = None
 	generate_gif: bool | str = False
 	override_system_message: str | None = None
 	extend_system_message: str | None = None
@@ -56,7 +55,6 @@ class AgentSettings(BaseModel):
 	use_thinking: bool = True
 	flash_mode: bool = False  # If enabled, disables evaluation_previous_goal and next_goal, and sets use_thinking = False
 	max_history_items: int = 40
-	images_per_step: int = 1

 	page_extraction_llm: BaseChatModel | None = None
 	planner_llm: BaseChatModel | None = None
@@ -76,7 +74,6 @@ class AgentState(BaseModel):
 	n_steps: int = 1
 	consecutive_failures: int = 0
 	last_result: list[ActionResult] | None = None
-	history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[], usage=None))
 	last_plan: str | None = None
 	last_model_output: AgentOutput | None = None
 	paused: bool = False
@@ -329,6 +326,10 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
 		"""Representation of the AgentHistoryList object"""
 		return f'AgentHistoryList(all_results={self.action_results()}, all_model_outputs={self.model_actions()})'

+	def add_item(self, history_item: AgentHistory) -> None:
+		"""Add a history item to the list"""
+		self.history.append(history_item)
+
 	def __repr__(self) -> str:
 		"""Representation of the AgentHistoryList object"""
 		return self.__str__()
@@ -443,20 +444,39 @@ class AgentHistoryList(BaseModel, Generic[AgentStructuredOutput]):
 		"""Get all unique URLs from history"""
 		return [h.state.url if h.state.url is not None else None for h in self.history]

-	def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
-		"""Get all screenshots from history"""
+	def screenshot_paths(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
+		"""Get all screenshot paths from history"""
 		if n_last == 0:
 			return []
 		if n_last is None:
 			if return_none_if_not_screenshot:
-				return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history]
+				return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self.history]
 			else:
-				return [h.state.screenshot for h in self.history if h.state.screenshot is not None]
+				return [h.state.screenshot_path for h in self.history if h.state.screenshot_path is not None]
 		else:
 			if return_none_if_not_screenshot:
-				return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history[-n_last:]]
+				return [h.state.screenshot_path if h.state.screenshot_path is not None else None for h in self.history[-n_last:]]
 			else:
-				return [h.state.screenshot for h in self.history[-n_last:] if h.state.screenshot is not None]
+				return [h.state.screenshot_path for h in self.history[-n_last:] if h.state.screenshot_path is not None]
+
+	def screenshots(self, n_last: int | None = None, return_none_if_not_screenshot: bool = True) -> list[str | None]:
+		"""Get all screenshots from history as base64 strings"""
+		if n_last == 0:
+			return []
+
+		history_items = self.history if n_last is None else self.history[-n_last:]
+		screenshots = []
+
+		for item in history_items:
+			screenshot_b64 = item.state.get_screenshot()
+			if screenshot_b64:
+				screenshots.append(screenshot_b64)
+			else:
+				if return_none_if_not_screenshot:
+					screenshots.append(None)
+				# If return_none_if_not_screenshot is False, we skip None values
+
+		return screenshots

 	def action_names(self) -> list[str]:
 		"""Get all action names from history"""
--- a/browser_use/browser/views.py
+++ b/browser_use/browser/views.py
@@ -81,12 +81,31 @@ class BrowserStateHistory:
 	title: str
 	tabs: list[TabInfo]
 	interacted_element: list[DOMHistoryElement | None] | list[None]
-	screenshot: str | None = None
+	screenshot_path: str | None = None
+
+	def get_screenshot(self) -> str | None:
+		"""Load screenshot from disk and return as base64 string"""
+		if not self.screenshot_path:
+			return None
+
+		import base64
+		from pathlib import Path
+
+		path_obj = Path(self.screenshot_path)
+		if not path_obj.exists():
+			return None
+
+		try:
+			with open(path_obj, 'rb') as f:
+				screenshot_data = f.read()
+			return base64.b64encode(screenshot_data).decode('utf-8')
+		except Exception:
+			return None

 	def to_dict(self) -> dict[str, Any]:
 		data = {}
 		data['tabs'] = [tab.model_dump() for tab in self.tabs]
-		data['screenshot'] = self.screenshot
+		data['screenshot_path'] = self.screenshot_path
 		data['interacted_element'] = [el.to_dict() if el else None for el in self.interacted_element]
 		data['url'] = self.url
 		data['title'] = self.title
--- a/browser_use/cli.py
+++ b/browser_use/cli.py
@@ -815,18 +815,18 @@ class BrowserUseApp(App):
 			# Show token usage statistics if agent exists and has history
 			if self.agent and hasattr(self.agent, 'state') and hasattr(self.agent.state, 'history'):
 				# Get total tokens used
-				# total_tokens = self.agent.state.history.total_input_tokens()
+				# total_tokens = self.agent.history.total_input_tokens()
 				# model_info.write(f'[white]Input tokens:[/] [green]{total_tokens:,}[/]')

 				# Calculate tokens per step
-				num_steps = len(self.agent.state.history.history)
+				num_steps = len(self.agent.history.history)
 				# if num_steps > 0:
 				# avg_tokens_per_step = total_tokens / num_steps
 				# model_info.write(f'[white]Avg tokens/step:[/] [green]{avg_tokens_per_step:,.1f}[/]')

 				# Get the last step metadata to show the most recent LLM response time
-				if num_steps > 0 and self.agent.state.history.history[-1].metadata:
-					last_step = self.agent.state.history.history[-1]
+				if num_steps > 0 and self.agent.history.history[-1].metadata:
+					last_step = self.agent.history.history[-1]
 					if last_step.metadata:
 						step_duration = last_step.metadata.duration_seconds
 					else:
@@ -838,7 +838,7 @@ class BrowserUseApp(App):
 					# 	model_info.write(f'[white]Avg tokens/sec:[/] [magenta]{tokens_per_second:.1f}[/]')

 				# Show total duration
-				total_duration = self.agent.state.history.total_duration_seconds()
+				total_duration = self.agent.history.total_duration_seconds()
 				if total_duration > 0:
 					model_info.write(f'[white]Total Duration:[/] [magenta]{total_duration:.2f}s[/]')

@@ -891,7 +891,7 @@ class BrowserUseApp(App):
 			# Get all agent history items
 			history_items = []
 			if hasattr(self.agent, 'state') and hasattr(self.agent.state, 'history'):
-				history_items = self.agent.state.history.history
+				history_items = self.agent.history.history

 				if history_items:
 					tasks_info.write('[bold yellow]STEPS:[/]')
--- a/browser_use/llm/google/chat.py
+++ b/browser_use/llm/google/chat.py
@@ -371,6 +371,7 @@ class ChatGoogle(BaseChatModel):
 							key == 'properties'
 							and isinstance(cleaned_value, dict)
 							and len(cleaned_value) == 0
+							and isinstance(obj.get('type', ''), str)
 							and obj.get('type', '').upper() == 'OBJECT'
 						):
 							# Convert empty object to have at least one property
@@ -380,7 +381,8 @@ class ChatGoogle(BaseChatModel):

 				# If this is an object type with empty properties, add a placeholder
 				if (
-					cleaned.get('type', '').upper() == 'OBJECT'
+					isinstance(cleaned.get('type', ''), str)
+					and cleaned.get('type', '').upper() == 'OBJECT'
 					and 'properties' in cleaned
 					and isinstance(cleaned['properties'], dict)
 					and len(cleaned['properties']) == 0
--- a/browser_use/screenshots/__init__.py
+++ b/browser_use/screenshots/__init__.py
@@ -0,0 +1 @@
+# Screenshots package for browser-use
--- a/browser_use/screenshots/service.py
+++ b/browser_use/screenshots/service.py
@@ -0,0 +1,48 @@
+"""
+Screenshot storage service for browser-use agents.
+"""
+
+import base64
+from pathlib import Path
+
+import anyio
+
+
+class ScreenshotService:
+	"""Simple screenshot storage service that saves screenshots to disk"""
+
+	def __init__(self, agent_directory: str | Path):
+		"""Initialize with agent directory path"""
+		self.agent_directory = Path(agent_directory) if isinstance(agent_directory, str) else agent_directory
+
+		# Create screenshots subdirectory
+		self.screenshots_dir = self.agent_directory / 'screenshots'
+		self.screenshots_dir.mkdir(parents=True, exist_ok=True)
+
+	async def store_screenshot(self, screenshot_b64: str, step_number: int) -> str:
+		"""Store screenshot to disk and return the full path as string"""
+		screenshot_filename = f'step_{step_number}.png'
+		screenshot_path = self.screenshots_dir / screenshot_filename
+
+		# Decode base64 and save to disk
+		screenshot_data = base64.b64decode(screenshot_b64)
+
+		async with await anyio.open_file(screenshot_path, 'wb') as f:
+			await f.write(screenshot_data)
+
+		return str(screenshot_path)
+
+	async def get_screenshot(self, screenshot_path: str) -> str | None:
+		"""Load screenshot from disk path and return as base64"""
+		if not screenshot_path:
+			return None
+
+		path = Path(screenshot_path)
+		if not path.exists():
+			return None
+
+		# Load from disk and encode to base64
+		async with await anyio.open_file(path, 'rb') as f:
+			screenshot_data = await f.read()
+
+		return base64.b64encode(screenshot_data).decode('utf-8')
--- a/docs/customize/agent-settings.mdx
+++ b/docs/customize/agent-settings.mdx
@@ -133,7 +133,7 @@ history = await agent.run()

 # Access (some) useful information
 history.urls()              # List of visited URLs
-history.screenshots()       # List of screenshot paths
+history.screenshot_paths()  # List of screenshot paths
 history.action_names()      # Names of executed actions
 history.extracted_content() # Content extracted during execution
 history.errors()           # Any errors that occurred
@@ -173,60 +173,12 @@ agent = Agent(
 )
 ```

-## Run with message context

-You can configure the agent and provide a separate message to help the LLM understand the task better.

-```python
-from browser_use.llm import ChatOpenAI

-agent = Agent(
-    task="your task",
-    message_context="Additional information about the task",
-    llm = ChatOpenAI(model='gpt-4o')
-)
-```
-
-## Run with planner model
-
-You can configure the agent to use a separate planner model for high-level task planning:
-
-```python
-from browser_use.llm import ChatOpenAI
-
-# Initialize models
-llm = ChatOpenAI(model='gpt-4o')
-planner_llm = ChatOpenAI(model='o3-mini')
-
-agent = Agent(
-    task="your task",
-    llm=llm,
-    planner_llm=planner_llm,           # Separate model for planning
-    use_vision_for_planner=False,      # Disable vision for planner
-    planner_interval=4                 # Plan every 4 steps
-)
-```
-
-### Planner Parameters
-
-- `planner_llm`: A chat model instance used for high-level task planning. Can be a smaller/cheaper model than the main LLM.
-- `use_vision_for_planner`: Enable/disable vision capabilities for the planner model. Defaults to `True`.
-- `planner_interval`: Number of steps between planning phases. Defaults to `1`.
-
-Using a separate planner model can help:
-
-- Reduce costs by using a smaller model for high-level planning
-- Improve task decomposition and strategic thinking
-- Better handle complex, multi-step tasks
-
-<Note>
-  The planner model is optional. If not specified, the agent will not use the
-  planner model.
-</Note>

 ### Optional Parameters

-- `message_context`: Additional information about the task to help the LLM understand the task better.
 - `initial_actions`: List of initial actions to run before the main task.
 - `max_actions_per_step`: Maximum number of actions to run in a step. Defaults to `10`.
 - `max_failures`: Maximum number of failures before giving up. Defaults to `3`.
--- a/docs/customize/hooks.mdx
+++ b/docs/customize/hooks.mdx
@@ -41,7 +41,7 @@ async def my_step_hook(agent: Agent):
    #   https://playwright.dev/python/docs/api/class-page

    current_url = page.url
-    visit_log = agent.state.history.urls()
+    visit_log = agent.history.urls()
    previous_url = visit_log[-2] if len(visit_log) >= 2 else None
    print(f"Agent was last on URL: {previous_url} and is now on {current_url}")

@@ -91,11 +91,12 @@ When working with agent hooks, you have access to the entire `Agent` instance. H
 - `agent.settings` contains all the configuration options passed to the `Agent(...)` at init time
 - `agent.llm` gives direct access to the main LLM object (e.g. `ChatOpenAI`)
 - `agent.state` gives access to lots of internal state, including agent thoughts, outputs, actions, etc.
-  - `agent.state.history.model_thoughts()`: Reasoning from Browser Use's model.
-  - `agent.state.history.model_outputs()`: Raw outputs from the Browsre Use's model.
-  - `agent.state.history.model_actions()`: Actions taken by the agent
-  - `agent.state.history.extracted_content()`: Content extracted from web pages
-  - `agent.state.history.urls()`: URLs visited by the agent
+- `agent.history` gives access to historical data from the agent's execution:
+  - `agent.history.model_thoughts()`: Reasoning from Browser Use's model.
+  - `agent.history.model_outputs()`: Raw outputs from Browser Use's model.
+  - `agent.history.model_actions()`: Actions taken by the agent
+  - `agent.history.extracted_content()`: Content extracted from web pages
+  - `agent.history.urls()`: URLs visited by the agent
 - `agent.browser_session` gives direct access to the `BrowserSession()` and playwright objects
  - `agent.browser_session.get_current_page()`: Get the current playwright `Page` object the agent is focused on
  - `agent.browser_session.browser_context`: Get the current playwright `BrowserContext` object
--- a/examples/custom-functions/custom_hooks_before_after_step.py
+++ b/examples/custom-functions/custom_hooks_before_after_step.py
@@ -154,7 +154,7 @@ async def record_activity(agent_obj):
 	print('--> History:')
 	# Assert agent has state to satisfy type checker
 	assert hasattr(agent_obj, 'state'), 'Agent must have state attribute'
-	history = agent_obj.state.history
+	history = agent_obj.history

 	model_thoughts = obj_to_json(obj=history.model_thoughts(), check_circular=False)

@@ -164,7 +164,7 @@ async def record_activity(agent_obj):
 		# prettyprinter.cpprint(model_thoughts_last_elem)

 	# print("--- MODEL OUTPUT ACTION ---")
-	model_outputs = agent_obj.state.history.model_outputs()
+	model_outputs = agent_obj.history.model_outputs()
 	model_outputs_json = obj_to_json(obj=model_outputs, check_circular=False)

 	if len(model_outputs_json) > 0:
@@ -172,7 +172,7 @@ async def record_activity(agent_obj):
 		# prettyprinter.cpprint(model_outputs_json_last_elem)

 	# print("--- MODEL INTERACTED ELEM ---")
-	model_actions = agent_obj.state.history.model_actions()
+	model_actions = agent_obj.history.model_actions()
 	model_actions_json = obj_to_json(obj=model_actions, check_circular=False)

 	if len(model_actions_json) > 0:
@@ -180,14 +180,14 @@ async def record_activity(agent_obj):
 		# prettyprinter.cpprint(model_actions_json_last_elem)

 	# print("--- EXTRACTED CONTENT ---")
-	extracted_content = agent_obj.state.history.extracted_content()
+	extracted_content = agent_obj.history.extracted_content()
 	extracted_content_json = obj_to_json(obj=extracted_content, check_circular=False)
 	if len(extracted_content_json) > 0:
 		extracted_content_json_last_elem = extracted_content_json[-1]
 		# prettyprinter.cpprint(extracted_content_json_last_elem)

 	# print("--- URLS ---")
-	urls = agent_obj.state.history.urls()
+	urls = agent_obj.history.urls()
 	# prettyprinter.cpprint(urls)
 	urls_json = obj_to_json(obj=urls, check_circular=False)

--- a/examples/features/outsource_state.py
+++ b/examples/features/outsource_state.py
@@ -47,8 +47,6 @@ async def main():
 		if done and valid:
 			break

-		agent_state.history.history = []
-
 		# Save state to file
 		async with await anyio.open_file('agent_state.json', 'w') as f:
 			serialized = agent_state.model_dump_json(exclude={'history'})
--- a/examples/models/deepseek-chat.py
+++ b/examples/models/deepseek-chat.py
@@ -28,7 +28,7 @@ async def main():
 		task='What should we pay attention to in the recent new rules on tariffs in China-US trade?',
 		llm=llm,
 		use_vision=False,
-		message_context=extend_system_message,
+		extend_system_message=extend_system_message,
 	)
 	await agent.run()

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "browser-use"
 description = "Make websites accessible for AI agents"
 authors = [{ name = "Gregor Zunic" }]
-version = "0.5.7"
+version = "0.5.9"
 readme = "README.md"
 requires-python = ">=3.11,<4.0"
 classifiers = [
--- a/tests/ci/test_gemini_type_field_fix.py
+++ b/tests/ci/test_gemini_type_field_fix.py
@@ -0,0 +1,107 @@
+"""
+Test to reproduce and verify fix for GitHub issue #2470:
+"Python field with name 'type' handled differently between Gemini and OpenAI GPT"
+"""
+
+from browser_use.llm.google.chat import ChatGoogle
+from browser_use.llm.schema import SchemaOptimizer
+
+
+class TestGeminiTypeFieldHandling:
+	"""Test class for reproducing the type field issue with Gemini schema processing."""
+
+	def test_gemini_schema_with_dict_type_field(self):
+		"""
+		Test that Gemini schema processing handles dict 'type' field gracefully.
+		Reproduces the AttributeError: 'dict' object has no attribute 'upper'
+		"""
+		chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+
+		# Schema with dict instead of string in type field
+		problematic_schema = {'type': {'malformed': 'dict_type'}, 'properties': {}}
+
+		result = chat_google._fix_gemini_schema(problematic_schema)
+		assert result is not None
+		assert isinstance(result, dict)
+		assert result['type'] == {'malformed': 'dict_type'}
+
+	def test_gemini_schema_with_nested_dict_type_field(self):
+		"""
+		Test that nested dict 'type' fields are handled gracefully.
+		"""
+		chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+
+		# Schema with nested dict type field
+		problematic_schema = {
+			'type': 'object',
+			'properties': {'nested_field': {'type': {'malformed': 'dict_instead_of_string'}, 'properties': {}}},
+		}
+
+		result = chat_google._fix_gemini_schema(problematic_schema)
+		assert result is not None
+		assert isinstance(result, dict)
+		nested_type = result['properties']['nested_field']['type']
+		assert nested_type == {'malformed': 'dict_instead_of_string'}
+
+	def test_gemini_schema_with_none_type_field(self):
+		"""Test handling of None type field."""
+		chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+
+		problematic_schema = {'type': 'object', 'properties': {'nested_field': {'type': None, 'properties': {}}}}
+
+		result = chat_google._fix_gemini_schema(problematic_schema)
+		assert result is not None
+
+	def test_gemini_schema_with_valid_string_type(self):
+		"""Test that valid string type fields work correctly."""
+		chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+
+		valid_schema = {'type': 'object', 'properties': {'nested_field': {'type': 'object', 'properties': {}}}}
+
+		# Should work without issues
+		result = chat_google._fix_gemini_schema(valid_schema)
+		assert result is not None
+		assert isinstance(result, dict)
+
+	def test_gemini_schema_with_empty_properties_object(self):
+		"""Test handling of empty properties in object type."""
+		chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+
+		schema_with_empty_props = {
+			'type': 'object',
+			'properties': {
+				'empty_object': {
+					'type': 'object',
+					'properties': {},  # Empty properties should get placeholder
+				}
+			},
+		}
+
+		result = chat_google._fix_gemini_schema(schema_with_empty_props)
+
+		nested_props = result['properties']['empty_object']['properties']
+		assert '_placeholder' in nested_props
+		assert nested_props['_placeholder']['type'] == 'string'
+
+	def test_consistency_between_providers(self):
+		"""
+		Test that both Gemini and OpenAI handle schemas consistently.
+		The original issue was that Gemini would fail where OpenAI succeeded.
+		"""
+		from pydantic import BaseModel, Field
+
+		# Create a test model that generates a schema with dict type
+		class TestModel(BaseModel):
+			field_with_dict_type: dict = Field(default_factory=dict)
+
+		# OpenAI uses SchemaOptimizer directly
+		openai_schema = SchemaOptimizer.create_optimized_json_schema(TestModel)
+		assert openai_schema is not None
+
+		# Gemini processes the schema through _fix_gemini_schema
+		chat_google = ChatGoogle(model='gemini-2.0-flash-exp')
+		gemini_result = chat_google._fix_gemini_schema(openai_schema)
+		assert gemini_result is not None
+
+		# Both should handle the schema without errors
+		# This demonstrates that the fix makes Gemini consistent with OpenAI
--- a/tests/ci/test_gif_filtering.py
+++ b/tests/ci/test_gif_filtering.py
@@ -9,7 +9,7 @@ from PIL import Image
 from browser_use import AgentHistoryList
 from browser_use.agent.gif import create_history_gif
 from browser_use.agent.views import ActionResult, AgentHistory, AgentOutput
-from browser_use.browser.views import PLACEHOLDER_4PX_SCREENSHOT, BrowserStateHistory, TabInfo
+from browser_use.browser.views import BrowserStateHistory, TabInfo


@pytest.fixture
@@ -49,9 +49,22 @@ def create_test_screenshot(width: int = 800, height: int = 600, color: tuple = (

 async def test_gif_filters_out_placeholder_screenshots(test_dir):
 	"""Test that 4px placeholder screenshots from about:blank pages are filtered out of GIFs."""
+	# Set up screenshot service for testing (still needed to create test files)
+	from browser_use.screenshots.service import ScreenshotService
+
+	screenshot_service = ScreenshotService(test_dir)
+
+	# Helper function to store test screenshots
+	async def store_test_screenshot(screenshot_b64: str, step: int) -> str:
+		return await screenshot_service.store_screenshot(screenshot_b64, step)
+
 	# Create a history with mixed screenshots: real and placeholder
 	history_items = []

+	# Store test screenshots
+	real_screenshot_1_path = await store_test_screenshot(create_test_screenshot(800, 600, (100, 150, 200)), 2)
+	real_screenshot_2_path = await store_test_screenshot(create_test_screenshot(800, 600, (200, 100, 50)), 4)
+
 	# First item: about:blank placeholder (should be filtered)
 	history_items.append(
 		AgentHistory(
@@ -63,7 +76,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
 			),
 			result=[ActionResult()],
 			state=BrowserStateHistory(
-				screenshot=PLACEHOLDER_4PX_SCREENSHOT,
+				screenshot_path=None,  # Placeholder doesn't have a file path
 				url='about:blank',
 				title='New Tab',
 				tabs=[TabInfo(page_id=1, url='about:blank', title='New Tab')],
@@ -83,7 +96,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
 			),
 			result=[ActionResult()],
 			state=BrowserStateHistory(
-				screenshot=create_test_screenshot(800, 600, (100, 150, 200)),
+				screenshot_path=real_screenshot_1_path,
 				url='https://example.com',
 				title='Example',
 				tabs=[TabInfo(page_id=1, url='https://example.com', title='Example')],
@@ -103,7 +116,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
 			),
 			result=[ActionResult()],
 			state=BrowserStateHistory(
-				screenshot=PLACEHOLDER_4PX_SCREENSHOT,
+				screenshot_path=None,  # Placeholder doesn't have a file path
 				url='about:blank',
 				title='New Tab',
 				tabs=[TabInfo(page_id=2, url='about:blank', title='New Tab')],
@@ -123,7 +136,7 @@ async def test_gif_filters_out_placeholder_screenshots(test_dir):
 			),
 			result=[ActionResult()],
 			state=BrowserStateHistory(
-				screenshot=create_test_screenshot(800, 600, (200, 100, 50)),
+				screenshot_path=real_screenshot_2_path,
 				url='https://example.com/page2',
 				title='Page 2',
 				tabs=[TabInfo(page_id=1, url='https://example.com/page2', title='Page 2')],
@@ -190,7 +203,7 @@ async def test_gif_handles_all_placeholders(test_dir):
 				),
 				result=[ActionResult()],
 				state=BrowserStateHistory(
-					screenshot=PLACEHOLDER_4PX_SCREENSHOT,
+					screenshot_path=None,  # Placeholder doesn't have a file path
 					url='about:blank',
 					title='New Tab',
 					tabs=[TabInfo(page_id=1, url='about:blank', title='New Tab')],
--- a/tests/ci/test_gif_generation_with_navigation.py
+++ b/tests/ci/test_gif_generation_with_navigation.py
@@ -85,9 +85,10 @@ async def test_gif_generation_with_real_navigation(httpserver, tmp_path):
 		# Verify history contains real screenshots (not placeholders)
 		has_real_screenshot = False
 		for item in history.history:
+			screenshot_b64 = item.state.get_screenshot()
 			if (
-				item.state.screenshot
-				and item.state.screenshot
+				screenshot_b64
+				and screenshot_b64
 				!= 'iVBORw0KGgoAAAANSUhEUgAAAAQAAAAECAIAAAAmkwkpAAAAFElEQVR4nGP8//8/AwwwMSAB3BwAlm4DBfIlvvkAAAAASUVORK5CYII='
 			):
 				has_real_screenshot = True
--- a/tests/ci/test_sync_agent_events.py
+++ b/tests/ci/test_sync_agent_events.py
@@ -88,7 +88,7 @@ class TestAgentEventLifecycle:

 		assert isinstance(step_event, CreateAgentStepEvent)
 		assert step_event.agent_task_id == task_event.id
-		assert step_event.step == 2  # Step is incremented before event is emitted
+		assert step_event.step == 1  # Step is incremented before event is emitted
 		assert step_event.url == httpserver.url_for('/')

 		assert isinstance(update_event, UpdateAgentTaskEvent)
--- a/tests/old/test_core_functionality.py
+++ b/tests/old/test_core_functionality.py
@@ -279,6 +279,6 @@ class TestCoreFunctionality:
 		assert final_scroll_position > initial_scroll_position, 'Page did not scroll down'

 		# Verify the action was executed
-		history = agent.state.history
+		history = agent.history
 		action_names = history.action_names()
 		assert 'scroll_down' in action_names