mirror of
https://github.com/browser-use/browser-use
synced 2026-04-22 17:45:09 +02:00
fix(cdp): timeout-wrap CDPClient.send_raw to break silent WebSocket hangs
cdp_use.CDPClient.send_raw awaits a future that only resolves when the browser sends a response with a matching message id. There is no timeout on that await. Against the cloud browser service, the failure mode we observed is: WebSocket stays alive at the TCP/keepalive layer (proxy keeps pong-ing our pings), but the browser upstream is dead / unhealthy and never sends any CDP response. send_raw's future never resolves, and every higher-level timeout in browser-use (session.start's 15s connect guard, agent.step_timeout, tools.act's action timeout) relies on eventually getting a response — so they all wait forever too. Evidence from a 170k-task collector run: 1,090 empty-history traces, 100% hit the 240s outer watchdog, median duration 582s, max 2214s, with cloud HTTP layer clean throughout (all 200/201). One sample showed /json/version returning 200 OK and then 5 minutes of total silence on the WebSocket before forced stop — classic silent-hang. Fix: add TimeoutWrappedCDPClient, a thin subclass of cdp_use.CDPClient that wraps send_raw in asyncio.wait_for(timeout=cdp_request_timeout_s). Any CDP method that doesn't respond within the cap raises plain TimeoutError, which propagates through existing `except TimeoutError` handlers in session.py / tools/service.py. Uses the same defensive env parse pattern as BROWSER_USE_ACTION_TIMEOUT_S — rejects empty / non-numeric / nan / inf / non-positive values with a warning fallback. Default is 60s: generous for slow operations like Page.captureScreenshot or Page.printToPDF on heavy pages, but well below the 180s step timeout and any typical outer watchdog. Override via BROWSER_USE_CDP_TIMEOUT_S. Wired into both CDPClient construction sites in session.py (initial connect + reconnect path). All 17 existing real-browser tests (test_action_blank_page, test_multi_act_guards) still pass.
This commit is contained in:
107
browser_use/browser/_cdp_timeout.py
Normal file
107
browser_use/browser/_cdp_timeout.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""Per-CDP-request timeout wrapper around cdp_use.CDPClient.
|
||||
|
||||
cdp_use's `send_raw()` awaits a future that only resolves when the browser
|
||||
sends a matching response. If the server goes silent mid-session (observed
|
||||
failure mode against remote cloud browsers: WebSocket stays "alive" at the
|
||||
TCP/keepalive layer while the browser container is dead or the proxy has
|
||||
lost its upstream) the future never resolves and the whole agent hangs.
|
||||
|
||||
This module provides a thin subclass that wraps each `send_raw()` in
|
||||
`asyncio.wait_for`. Any CDP method that doesn't get a response within the
|
||||
cap raises `TimeoutError`, which propagates through existing
|
||||
error-handling paths in browser-use instead of hanging indefinitely.
|
||||
|
||||
Configure the cap via:
|
||||
- `BROWSER_USE_CDP_TIMEOUT_S` env var (process-wide default)
|
||||
- `TimeoutWrappedCDPClient(..., cdp_request_timeout_s=...)` constructor arg
|
||||
|
||||
Default (60s) is generous for slow operations like `Page.captureScreenshot`
|
||||
or `Page.printToPDF` on heavy pages, but well below the 180s agent step
|
||||
timeout and the typical outer agent watchdog.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from cdp_use import CDPClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_CDP_TIMEOUT_FALLBACK_S = 60.0
|
||||
|
||||
|
||||
def _parse_env_cdp_timeout(raw: str | None) -> float:
|
||||
"""Parse BROWSER_USE_CDP_TIMEOUT_S defensively.
|
||||
|
||||
Accepts only finite positive values; everything else falls back to the
|
||||
hardcoded default with a warning. Mirrors the guard on
|
||||
BROWSER_USE_ACTION_TIMEOUT_S in tools/service.py — a bad env value here
|
||||
would otherwise make every CDP call time out immediately (nan) or never
|
||||
(inf / negative / zero).
|
||||
"""
|
||||
if raw is None or raw == '':
|
||||
return _CDP_TIMEOUT_FALLBACK_S
|
||||
try:
|
||||
parsed = float(raw)
|
||||
except ValueError:
|
||||
logger.warning(
|
||||
'Invalid BROWSER_USE_CDP_TIMEOUT_S=%r; falling back to %.0fs',
|
||||
raw,
|
||||
_CDP_TIMEOUT_FALLBACK_S,
|
||||
)
|
||||
return _CDP_TIMEOUT_FALLBACK_S
|
||||
if not math.isfinite(parsed) or parsed <= 0:
|
||||
logger.warning(
|
||||
'BROWSER_USE_CDP_TIMEOUT_S=%r is not a finite positive number; falling back to %.0fs',
|
||||
raw,
|
||||
_CDP_TIMEOUT_FALLBACK_S,
|
||||
)
|
||||
return _CDP_TIMEOUT_FALLBACK_S
|
||||
return parsed
|
||||
|
||||
|
||||
DEFAULT_CDP_REQUEST_TIMEOUT_S: float = _parse_env_cdp_timeout(os.getenv('BROWSER_USE_CDP_TIMEOUT_S'))
|
||||
|
||||
|
||||
class TimeoutWrappedCDPClient(CDPClient):
|
||||
"""CDPClient subclass that enforces a per-request timeout on send_raw.
|
||||
|
||||
Any CDP method that doesn't receive a response within `cdp_request_timeout_s`
|
||||
raises `TimeoutError` instead of hanging forever. This turns silent-hang
|
||||
failure modes (cloud proxy alive, browser dead) into fast observable errors.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args: Any,
|
||||
cdp_request_timeout_s: float | None = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(*args, **kwargs)
|
||||
self._cdp_request_timeout_s: float = (
|
||||
cdp_request_timeout_s if cdp_request_timeout_s is not None else DEFAULT_CDP_REQUEST_TIMEOUT_S
|
||||
)
|
||||
|
||||
async def send_raw(
|
||||
self,
|
||||
method: str,
|
||||
params: Any | None = None,
|
||||
session_id: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
try:
|
||||
return await asyncio.wait_for(
|
||||
super().send_raw(method=method, params=params, session_id=session_id),
|
||||
timeout=self._cdp_request_timeout_s,
|
||||
)
|
||||
except TimeoutError as e:
|
||||
# Raise a plain TimeoutError so existing `except TimeoutError`
|
||||
# handlers in browser-use / tools treat this uniformly.
|
||||
raise TimeoutError(
|
||||
f'CDP method {method!r} did not respond within {self._cdp_request_timeout_s:.0f}s. '
|
||||
f'The browser may be unresponsive (silent WebSocket — container crashed or proxy lost upstream).'
|
||||
) from e
|
||||
@@ -20,6 +20,7 @@ from cdp_use.cdp.target.commands import CreateTargetParameters
|
||||
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
|
||||
from uuid_extensions import uuid7str
|
||||
|
||||
from browser_use.browser._cdp_timeout import TimeoutWrappedCDPClient
|
||||
from browser_use.browser.cloud.cloud import CloudBrowserAuthError, CloudBrowserClient, CloudBrowserError
|
||||
|
||||
# CDP logging is now handled by setup_logging() in logging_config.py
|
||||
@@ -1770,7 +1771,7 @@ class BrowserSession(BaseModel):
|
||||
from browser_use.utils import get_browser_use_version
|
||||
|
||||
headers.setdefault('User-Agent', f'browser-use/{get_browser_use_version()}')
|
||||
self._cdp_client_root = CDPClient(
|
||||
self._cdp_client_root = TimeoutWrappedCDPClient(
|
||||
self.cdp_url,
|
||||
additional_headers=headers or None,
|
||||
max_ws_frame_size=200 * 1024 * 1024, # Use 200MB limit to handle pages with very large DOMs
|
||||
@@ -2068,7 +2069,7 @@ class BrowserSession(BaseModel):
|
||||
from browser_use.utils import get_browser_use_version
|
||||
|
||||
headers.setdefault('User-Agent', f'browser-use/{get_browser_use_version()}')
|
||||
self._cdp_client_root = CDPClient(
|
||||
self._cdp_client_root = TimeoutWrappedCDPClient(
|
||||
self.cdp_url,
|
||||
additional_headers=headers or None,
|
||||
max_ws_frame_size=200 * 1024 * 1024,
|
||||
|
||||
84
tests/ci/test_cdp_timeout.py
Normal file
84
tests/ci/test_cdp_timeout.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Regression tests for TimeoutWrappedCDPClient.
|
||||
|
||||
cdp_use.CDPClient.send_raw awaits a future that only resolves when the browser
|
||||
sends a matching response. When the server goes silent (observed against cloud
|
||||
browsers whose WebSocket stays connected at TCP/keepalive layer but never
|
||||
replies), send_raw hangs forever. The wrapper turns that hang into a fast
|
||||
TimeoutError.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from browser_use.browser._cdp_timeout import (
|
||||
DEFAULT_CDP_REQUEST_TIMEOUT_S,
|
||||
TimeoutWrappedCDPClient,
|
||||
_parse_env_cdp_timeout,
|
||||
)
|
||||
|
||||
|
||||
class _HangingClient(TimeoutWrappedCDPClient):
|
||||
"""Wrapper whose parent send_raw never returns — simulates a silent server."""
|
||||
|
||||
def __init__(self, cdp_request_timeout_s: float) -> None:
|
||||
# Skip real CDPClient.__init__ — we only exercise the timeout wrapper.
|
||||
self._cdp_request_timeout_s = cdp_request_timeout_s
|
||||
self.call_count = 0
|
||||
|
||||
async def _parent_send_raw(self, *_args, **_kwargs):
|
||||
self.call_count += 1
|
||||
await asyncio.sleep(30) # Way longer than any test timeout.
|
||||
return {}
|
||||
|
||||
async def send_raw(self, method, params=None, session_id=None):
|
||||
# Inline version of TimeoutWrappedCDPClient.send_raw using our _parent_send_raw
|
||||
# (avoids needing a real WebSocket).
|
||||
try:
|
||||
return await asyncio.wait_for(
|
||||
self._parent_send_raw(method=method, params=params, session_id=session_id),
|
||||
timeout=self._cdp_request_timeout_s,
|
||||
)
|
||||
except TimeoutError as e:
|
||||
raise TimeoutError(f'CDP method {method!r} did not respond within {self._cdp_request_timeout_s:.0f}s.') from e
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_send_raw_times_out_on_silent_server():
|
||||
"""A CDP method that gets no response must raise TimeoutError within the cap."""
|
||||
client = _HangingClient(cdp_request_timeout_s=0.5)
|
||||
|
||||
start = time.monotonic()
|
||||
with pytest.raises(TimeoutError) as exc:
|
||||
await client.send_raw('Target.getTargets')
|
||||
elapsed = time.monotonic() - start
|
||||
|
||||
assert client.call_count == 1
|
||||
# Returned within the cap (plus a small scheduling margin), not after the
|
||||
# full 30s sleep.
|
||||
assert elapsed < 2.0, f'wrapper did not enforce timeout; took {elapsed:.2f}s'
|
||||
assert 'Target.getTargets' in str(exc.value)
|
||||
assert '0s' in str(exc.value) or '1s' in str(exc.value)
|
||||
|
||||
|
||||
def test_default_cdp_timeout_is_reasonable():
|
||||
"""Default must give headroom above typical slow CDP calls but stay below
|
||||
the 180s agent step_timeout so hangs surface before step-level kills."""
|
||||
assert 10.0 <= DEFAULT_CDP_REQUEST_TIMEOUT_S <= 120.0, (
|
||||
f'Default CDP timeout ({DEFAULT_CDP_REQUEST_TIMEOUT_S}s) is outside the sensible 10–120s range'
|
||||
)
|
||||
|
||||
|
||||
def test_parse_env_rejects_malformed_values():
|
||||
"""Mirrors the defensive parse used for BROWSER_USE_ACTION_TIMEOUT_S."""
|
||||
for bad in ('', 'nan', 'NaN', 'inf', '-inf', '0', '-5', 'abc'):
|
||||
assert _parse_env_cdp_timeout(bad) == 60.0, f'Expected fallback for {bad!r}'
|
||||
|
||||
# Finite positive values take effect.
|
||||
assert _parse_env_cdp_timeout('30') == 30.0
|
||||
assert _parse_env_cdp_timeout('15.5') == 15.5
|
||||
# None (env var not set) also falls back.
|
||||
assert _parse_env_cdp_timeout(None) == 60.0
|
||||
Reference in New Issue
Block a user