browser-use/browser_use/skill_cli/python_session.py

"""Jupyter-like persistent Python execution for browser-use CLI."""

import asyncio
import io
import traceback
from contextlib import redirect_stderr, redirect_stdout
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal

if TYPE_CHECKING:
	from browser_use.browser.session import BrowserSession
	from browser_use.skill_cli.actions import ActionHandler


@dataclass
class ExecutionResult:
	"""Result of Python code execution.

	Produced by `PythonSession.execute` for every submitted snippet and stored
	alongside the snippet in the session history.
	"""

	# True when the snippet ran to completion without raising an Exception.
	success: bool
	# Text the snippet wrote to the redirected output streams while it ran.
	output: str = ''
	# Formatted traceback when the snippet raised; None when it succeeded.
	error: str | None = None


@dataclass
class PythonSession:
	"""Jupyter-like persistent Python execution.

	Maintains a namespace across multiple code executions, allowing variables
	to persist between commands. Provides a `browser` object for browser control.

	Attributes:
		namespace: Globals dict shared by every executed snippet.
		execution_count: Number of `execute` calls since creation or the last reset.
		history: `(code, result)` pairs in execution order.
	"""

	namespace: dict[str, Any] = field(default_factory=dict)
	execution_count: int = 0
	history: list[tuple[str, ExecutionResult]] = field(default_factory=list)

	def __post_init__(self) -> None:
		"""Initialize namespace with useful imports."""
		self.namespace.update(
			{
				'__name__': '__main__',
				'__doc__': None,
				'json': __import__('json'),
				're': __import__('re'),
				'os': __import__('os'),
				'Path': Path,
				'asyncio': asyncio,
			}
		)

	def execute(
		self,
		code: str,
		browser_session: 'BrowserSession',
		loop: asyncio.AbstractEventLoop | None = None,
		actions: 'ActionHandler | None' = None,
	) -> ExecutionResult:
		"""Execute code in persistent namespace.

		The `browser` variable is injected into the namespace before each execution,
		providing a convenient wrapper around the BrowserSession. It is only
		(re)created when both `loop` and `actions` are given; otherwise whatever
		`browser` binding the namespace already holds is left untouched.

		Code that compiles as a single expression is evaluated and its repr is
		printed when the value is not None (REPL behaviour); anything else is
		executed as statements.

		NOTE: `code` runs through eval/exec with the full privileges of this
		process. Callers must only pass code they trust.

		Args:
			code: Python code to execute
			browser_session: The browser session for browser operations
			loop: The event loop for async operations (required for browser access)
			actions: Optional ActionHandler for direct execution (no event bus)

		Returns:
			An ExecutionResult. On success `output` holds captured stdout followed
			by captured stderr. When the code raises an Exception, `success` is
			False, `output` holds whatever was written before the failure and
			`error` holds the formatted traceback. Exceptions that do not derive
			from Exception (e.g. SystemExit, KeyboardInterrupt) are not caught.
		"""
		# Inject browser wrapper with the event loop for async operations
		if loop is not None and actions is not None:
			self.namespace['browser'] = BrowserWrapper(browser_session, loop, actions)
		self.execution_count += 1

		stdout = io.StringIO()
		stderr = io.StringIO()

		try:
			with redirect_stdout(stdout), redirect_stderr(stderr):
				# Only the compile step decides between expression and statement
				# mode. Evaluation happens outside this try so that a SyntaxError
				# raised *while running* an expression (e.g. a nested eval) is
				# reported as a failure instead of re-running the code via exec.
				try:
					expression = compile(code, '<input>', 'eval')
				except SyntaxError:
					expression = None

				# Run outside the `except` block so user tracebacks are not
				# chained to the irrelevant eval-mode SyntaxError.
				if expression is not None:
					value = eval(expression, self.namespace)
					if value is not None:
						print(repr(value))
				else:
					exec(compile(code, '<input>', 'exec'), self.namespace)

			result = ExecutionResult(success=True, output=stdout.getvalue() + stderr.getvalue())

		except Exception:
			# Keep stderr on failure too, matching the success path, so
			# diagnostics printed before the error are not lost.
			result = ExecutionResult(
				success=False,
				output=stdout.getvalue() + stderr.getvalue(),
				error=traceback.format_exc(),
			)

		self.history.append((code, result))
		return result

	def reset(self) -> None:
		"""Clear namespace and history, then restore the default namespace entries."""
		self.namespace.clear()
		self.history.clear()
		self.execution_count = 0
		self.__post_init__()

	def get_variables(self) -> dict[str, str]:
		"""Get user-defined variables and their types.

		Returns:
			Mapping of variable name to the name of its type, excluding names that
			start with an underscore and the names this session injects itself.
		"""
		skip = {'__name__', '__doc__', 'json', 're', 'os', 'Path', 'asyncio', 'browser'}
		return {k: type(v).__name__ for k, v in self.namespace.items() if not k.startswith('_') and k not in skip}


class BrowserWrapper:
	"""Convenient browser access for Python code.

	Provides synchronous methods that wrap async BrowserSession operations.
	Runs coroutines on the server's event loop using run_coroutine_threadsafe.
	"""

	def __init__(self, session: 'BrowserSession', loop: asyncio.AbstractEventLoop, actions: 'ActionHandler') -> None:
		self._session = session
		self._loop = loop
		self._actions = actions

	def _run(self, coro: Any) -> Any:
		"""Run coroutine on the server's event loop."""
		future = asyncio.run_coroutine_threadsafe(coro, self._loop)
		return future.result(timeout=60)

	@property
	def url(self) -> str:
		"""Get current page URL."""
		return self._run(self._get_url())

	async def _get_url(self) -> str:
		state = await self._session.get_browser_state_summary(include_screenshot=False)
		return state.url if state else ''

	@property
	def title(self) -> str:
		"""Get current page title."""
		return self._run(self._get_title())

	async def _get_title(self) -> str:
		state = await self._session.get_browser_state_summary(include_screenshot=False)
		return state.title if state else ''

	def goto(self, url: str) -> None:
		"""Navigate to URL."""
		self._run(self._goto_async(url))

	async def _goto_async(self, url: str) -> None:
		await self._actions.navigate(url)

	def click(self, index: int) -> None:
		"""Click element by index."""
		self._run(self._click_async(index))

	async def _click_async(self, index: int) -> None:
		node = await self._session.get_element_by_index(index)
		if node is None:
			raise ValueError(f'Element index {index} not found')
		await self._actions.click_element(node)

	def type(self, text: str) -> None:
		"""Type text into focused element."""
		self._run(self._type_async(text))

	async def _type_async(self, text: str) -> None:
		cdp_session = await self._session.get_or_create_cdp_session(target_id=None, focus=False)
		if not cdp_session:
			raise RuntimeError('No active browser session')
		await cdp_session.cdp_client.send.Input.insertText(
			params={'text': text},
			session_id=cdp_session.session_id,
		)

	def input(self, index: int, text: str) -> None:
		"""Click element and type text."""
		self._run(self._input_async(index, text))

	async def _input_async(self, index: int, text: str) -> None:
		node = await self._session.get_element_by_index(index)
		if node is None:
			raise ValueError(f'Element index {index} not found')
		await self._actions.click_element(node)
		await self._actions.type_text(node, text)

	def upload(self, index: int, path: str) -> None:
		"""Upload a file to a file input element."""
		self._run(self._upload_async(index, path))

	async def _upload_async(self, index: int, path: str) -> None:
		from pathlib import Path as P

		file_path = str(P(path).expanduser().resolve())
		p = P(file_path)
		if not p.exists():
			raise FileNotFoundError(f'File not found: {file_path}')
		if not p.is_file():
			raise ValueError(f'Not a file: {file_path}')
		if p.stat().st_size == 0:
			raise ValueError(f'File is empty (0 bytes): {file_path}')

		node = await self._session.get_element_by_index(index)
		if node is None:
			raise ValueError(f'Element index {index} not found')

		file_input_node = self._session.find_file_input_near_element(node)
		if file_input_node is None:
			raise ValueError(f'Element {index} is not a file input and no file input found nearby')

		await self._actions.upload_file(file_input_node, file_path)

	def scroll(self, direction: Literal['up', 'down', 'left', 'right'] = 'down', amount: int = 500) -> None:
		"""Scroll the page."""
		self._run(self._scroll_async(direction, amount))

	async def _scroll_async(self, direction: Literal['up', 'down', 'left', 'right'], amount: int) -> None:
		await self._actions.scroll(direction, amount)

	def screenshot(self, path: str | None = None) -> bytes:
		"""Take screenshot, optionally save to file."""
		data = self._run(self._session.take_screenshot())
		if path:
			Path(path).write_bytes(data)
		return data

	@property
	def html(self) -> str:
		"""Get page HTML."""
		return self._run(self._get_html())

	async def _get_html(self) -> str:
		cdp_session = await self._session.get_or_create_cdp_session(target_id=None, focus=False)
		if not cdp_session:
			return ''
		# Get the document root
		doc = await cdp_session.cdp_client.send.DOM.getDocument(
			params={},
			session_id=cdp_session.session_id,
		)
		if not doc or 'root' not in doc:
			return ''
		# Get outer HTML of the root node
		result = await cdp_session.cdp_client.send.DOM.getOuterHTML(
			params={'nodeId': doc['root']['nodeId']},
			session_id=cdp_session.session_id,
		)
		return result.get('outerHTML', '') if result else ''

	def keys(self, keys: str) -> None:
		"""Send keyboard keys."""
		self._run(self._keys_async(keys))

	async def _keys_async(self, keys: str) -> None:
		await self._actions.send_keys(keys)

	def back(self) -> None:
		"""Go back in history."""
		self._run(self._back_async())

	async def _back_async(self) -> None:
		await self._actions.go_back()

	def wait(self, seconds: float) -> None:
		"""Wait for specified seconds."""
		import time

		time.sleep(seconds)

	def extract(self, query: str) -> Any:
		"""Extract data using LLM (requires API key)."""
		# This would need LLM integration
		raise NotImplementedError('extract() requires LLM integration - use agent.run() instead')