diff --git a/backend/open_webui/retrieval/web/firecrawl.py b/backend/open_webui/retrieval/web/firecrawl.py
index 8cd18e1ef2..4bbd4f212b 100644
--- a/backend/open_webui/retrieval/web/firecrawl.py
+++ b/backend/open_webui/retrieval/web/firecrawl.py
@@ -1,52 +1,229 @@
+from __future__ import annotations
+
 import logging
-from typing import Optional, List
+import time
+from typing import TYPE_CHECKING, Any
 
 import requests
 
-from open_webui.retrieval.web.main import SearchResult, get_filtered_results
+from langchain_core.documents import Document
+
+if TYPE_CHECKING:
+    from open_webui.retrieval.web.main import SearchResult
 
 log = logging.getLogger(__name__)
 
+DEFAULT_FIRECRAWL_API_BASE_URL = 'https://api.firecrawl.dev'
+FIRECRAWL_RETRY_STATUS_CODES = {429, 500, 502, 503, 504}
+FIRECRAWL_MAX_RETRIES = 2
+
+
+def build_firecrawl_url(base_url: str | None, path: str) -> str:
+    base_url = (base_url or DEFAULT_FIRECRAWL_API_BASE_URL).rstrip('/')
+    path = path.lstrip('/')
+
+    if base_url.endswith('/v2'):
+        return f'{base_url}/{path}'
+
+    return f'{base_url}/v2/{path}'
+
+
+def build_firecrawl_headers(api_key: str | None) -> dict[str, str]:
+    return {
+        'Content-Type': 'application/json',
+        'Authorization': f'Bearer {api_key or ""}',
+    }
+
+
+def get_firecrawl_timeout_seconds(timeout: Any) -> float | None:
+    if timeout in (None, ''):
+        return None
+
+    try:
+        timeout = float(timeout)
+    except (TypeError, ValueError):
+        return None
+
+    return timeout if timeout > 0 else None
+
+
+def get_firecrawl_scrape_timeout_ms(timeout: Any) -> int | None:
+    timeout_seconds = get_firecrawl_timeout_seconds(timeout)
+    if timeout_seconds is None:
+        return None
+
+    # Firecrawl v2 expects scrape timeouts in milliseconds.
+    return min(300000, max(1000, int(timeout_seconds * 1000)))
+
+
+def get_firecrawl_client_timeout_seconds(timeout: Any, fallback: float = 60) -> float:
+    # Keep the local HTTP timeout slightly above Firecrawl's scrape timeout.
+    return (get_firecrawl_timeout_seconds(timeout) or fallback) + 10
+
+
+def get_firecrawl_retry_delay(headers: Any, attempt: int) -> float:
+    retry_after = headers.get('Retry-After') if headers else None
+    if retry_after:
+        try:
+            return min(10.0, max(0.0, float(retry_after)))
+        except (TypeError, ValueError):
+            pass
+
+    return min(8.0, float(2**attempt))
+
+
+def request_firecrawl_json(
+    method: str,
+    url: str,
+    *,
+    headers: dict[str, str],
+    json: dict[str, Any] | None = None,
+    timeout: float | None = None,
+    verify: bool = True,
+) -> dict[str, Any]:
+    last_error = None
+
+    for attempt in range(FIRECRAWL_MAX_RETRIES + 1):
+        try:
+            response = requests.request(
+                method,
+                url,
+                headers=headers,
+                json=json,
+                timeout=timeout,
+                verify=verify,
+            )
+
+            if response.status_code in FIRECRAWL_RETRY_STATUS_CODES and attempt < FIRECRAWL_MAX_RETRIES:
+                delay = get_firecrawl_retry_delay(response.headers, attempt)
+                log.warning(
+                    'Firecrawl %s %s returned HTTP %s; retrying in %.1fs',
+                    method,
+                    url,
+                    response.status_code,
+                    delay,
+                )
+                time.sleep(delay)
+                continue
+
+            response.raise_for_status()
+            return response.json()
+        except (requests.ConnectionError, requests.Timeout) as e:
+            last_error = e
+            if attempt >= FIRECRAWL_MAX_RETRIES:
+                break
+
+            delay = get_firecrawl_retry_delay(None, attempt)
+            log.warning('Firecrawl %s %s failed; retrying in %.1fs: %s', method, url, delay, e)
+            time.sleep(delay)
+
+    if last_error:
+        raise last_error
+
+    raise RuntimeError(f'Firecrawl {method} {url} failed without a response')
+
+
+def get_firecrawl_result_url(result: dict[str, Any]) -> str:
+    metadata = result.get('metadata') or {}
+    return (
+        result.get('url')
+        or result.get('link')
+        or metadata.get('url')
+        or metadata.get('sourceURL')
+        or metadata.get('source_url')
+        or ''
+    )
+
+
+def scrape_firecrawl_url(
+    firecrawl_url: str,
+    firecrawl_api_key: str,
+    url: str,
+    *,
+    verify_ssl: bool = True,
+    timeout: Any = None,
+    params: dict[str, Any] | None = None,
+) -> Document | None:
+    payload = {
+        'url': url,
+        'formats': ['markdown'],
+        'skipTlsVerification': not verify_ssl,
+        'removeBase64Images': True,
+        **(params or {}),
+    }
+    scrape_timeout_ms = get_firecrawl_scrape_timeout_ms(timeout)
+    if scrape_timeout_ms is not None:
+        payload['timeout'] = scrape_timeout_ms
+
+    response = request_firecrawl_json(
+        'POST',
+        build_firecrawl_url(firecrawl_url, 'scrape'),
+        headers=build_firecrawl_headers(firecrawl_api_key),
+        json=payload,
+        timeout=get_firecrawl_client_timeout_seconds(timeout),
+        verify=verify_ssl,
+    )
+    data = response.get('data') or {}
+    content = data.get('markdown') or ''
+    if not isinstance(content, str) or not content.strip():
+        return None
+
+    metadata = data.get('metadata') or {}
+    document_metadata = {'source': get_firecrawl_result_url(data) or url}
+    if metadata.get('title'):
+        document_metadata['title'] = metadata['title']
+    if metadata.get('description'):
+        document_metadata['description'] = metadata['description']
+
+    return Document(page_content=content, metadata=document_metadata)
+
 
 def search_firecrawl(
     firecrawl_url: str,
     firecrawl_api_key: str,
     query: str,
     count: int,
-    filter_list: Optional[List[str]] = None,
-) -> List[SearchResult]:
+    filter_list: list[str] | None = None,
+) -> list[SearchResult]:
     try:
-        url = firecrawl_url.rstrip('/')
-        response = requests.post(
-            f'{url}/v1/search',
-            headers={
-                'Content-Type': 'application/json',
-                'Authorization': f'Bearer {firecrawl_api_key}',
-            },
+        response = request_firecrawl_json(
+            'POST',
+            build_firecrawl_url(firecrawl_url, 'search'),
+            headers=build_firecrawl_headers(firecrawl_api_key),
             json={
                 'query': query,
                 'limit': count,
                 'timeout': count * 3000,
+                'ignoreInvalidURLs': True,
             },
             timeout=count * 3 + 10,
         )
-        response.raise_for_status()
-        data = response.json().get('data', [])
-
-        results = [
-            SearchResult(
-                link=r.get('url', ''),
-                title=r.get('title', ''),
-                snippet=r.get('description', ''),
-            )
-            for r in (data if isinstance(data, list) else [])
-        ]
+        data = response.get('data') or {}
+        results = data.get('web') or []
 
         if filter_list:
+            from open_webui.retrieval.web.main import get_filtered_results
+
             results = get_filtered_results(results, filter_list)
-        results = results[:count]
 
-        log.info(f'FireCrawl search results: {results}')
-        return results
+        from open_webui.retrieval.web.main import SearchResult
+
+        search_results = []
+        for result in results[:count]:
+            url = get_firecrawl_result_url(result)
+            if not url:
+                continue
+
+            metadata = result.get('metadata') or {}
+            search_results.append(
+                SearchResult(
+                    link=url,
+                    title=result.get('title') or metadata.get('title'),
+                    snippet=result.get('description') or result.get('snippet') or metadata.get('description'),
+                )
+            )
+
+        log.info(f'FireCrawl search results: {search_results}')
+        return search_results
     except Exception as e:
         log.error(f'Error in FireCrawl search: {e}')
         return []
diff --git a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py
index 9cb0c1abd7..6ee0e3781a 100644
--- a/backend/open_webui/retrieval/web/utils.py
+++ b/backend/open_webui/retrieval/web/utils.py
@@ -30,6 +30,7 @@ from langchain_core.documents import Document
 
 from open_webui.retrieval.loaders.tavily import TavilyLoader
 from open_webui.retrieval.loaders.external_web import ExternalWebLoader
+from open_webui.retrieval.web.firecrawl import scrape_firecrawl_url
 from open_webui.constants import ERROR_MESSAGES
 from open_webui.config import (
     ENABLE_RAG_LOCAL_WEB_FETCH,
@@ -218,39 +219,21 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
 
     def lazy_load(self) -> Iterator[Document]:
         try:
-            headers = {
-                'Content-Type': 'application/json',
-                'Authorization': f'Bearer {self.api_key}',
-            }
             for url in self.web_paths:
-                payload = {
-                    'url': url,
-                    'formats': ['markdown'],
-                    **self.params,
-                }
-                if self.timeout:
-                    payload['timeout'] = self.timeout * 1000
-
-                response = requests.post(
-                    f'{self.api_url}/v1/scrape',
-                    headers=headers,
-                    json=payload,
-                    timeout=self.timeout or 60,
-                    verify=self.verify_ssl,
-                )
-                response.raise_for_status()
-                data = response.json().get('data', {})
-                metadata = data.get('metadata', {})
-                source = metadata.get('url') or metadata.get('sourceURL') or url
-
-                yield Document(
-                    page_content=data.get('markdown', ''),
-                    metadata={'source': source},
+                doc = scrape_firecrawl_url(
+                    self.api_url,
+                    self.api_key,
+                    url,
+                    verify_ssl=self.verify_ssl,
+                    timeout=self.timeout,
+                    params=self.params,
                 )
+                if doc is not None:
+                    yield doc
         except Exception as e:
             if self.continue_on_failure:
-                log.exception(f'Error extracting content from URLs: {e}')
+                log.warning(f'Error extracting content from URLs with Firecrawl: {e}')
             else:
                 raise e
@@ -261,7 +244,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
                 yield doc
         except Exception as e:
             if self.continue_on_failure:
-                log.exception(f'Error extracting content from URLs: {e}')
+                log.warning(f'Error extracting content from URLs with Firecrawl: {e}')
             else:
                 raise e
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 77b87324cf..a7d2b1cb53 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -145,9 +145,6 @@ pytest-docker~=3.2.5
 
 ## LDAP
 ldap3==2.9.1
-## Firecrawl
-firecrawl-py==4.18.0
-
 ## Trace
 opentelemetry-api==1.40.0
 opentelemetry-sdk==1.40.0
diff --git a/pyproject.toml b/pyproject.toml
index af6084dd08..8d8fc8a755 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -167,7 +167,6 @@ all = [
     "oracledb==3.4.2",
     "colbert-ai==0.2.22",
-    "firecrawl-py==4.18.0",
     "azure-search-documents==11.6.0",
     "unstructured==0.18.31",
 ]
diff --git a/uv.lock b/uv.lock
index 7bde0eeb01..8f610937f5 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1133,22 +1133,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
 ]
 
-[[package]]
-name = "firecrawl-py"
-version = "1.12.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "nest-asyncio" },
-    { name = "pydantic" },
-    { name = "python-dotenv" },
-    { name = "requests" },
-    { name = "websockets" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/18/db/e4f8ef9f0475b91b7c16a15e02fe19069d443cc5516cdefa2f9a0924a9a3/firecrawl_py-1.12.0.tar.gz", hash = "sha256:bbf883f6c774f05a5426121b85978a5f7b5ab11e614aff609f0673b097c3e553", size = 19655, upload-time = "2025-02-13T15:40:15.745Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/cc/d8/301d829099082c606ed16ed2a9acd263c47a365d471b9636435bf5d858b3/firecrawl_py-1.12.0-py3-none-any.whl", hash = "sha256:2b9c549315027da32421aca2a7ca597cb05cdbb968cfe0a89f389c7bb20afa4a", size = 31854, upload-time = "2025-02-13T15:40:14.492Z" },
-]
-
 [[package]]
 name = "flask"
 version = "3.1.0"
@@ -2692,7 +2676,6 @@ dependencies = [
     { name = "fake-useragent" },
     { name = "fastapi" },
     { name = "faster-whisper" },
-    { name = "firecrawl-py" },
     { name = "fpdf2" },
     { name = "ftfy" },
     { name = "gcp-storage-emulator" },
@@ -2803,7 +2786,6 @@ requires-dist = [
    { name = "fake-useragent", specifier = "==2.1.0" },
    { name = "fastapi", specifier = "==0.115.7" },
    { name = "faster-whisper", specifier = "==1.1.1" },
-    { name = "firecrawl-py", specifier = "==1.12.0" },
    { name = "fpdf2", specifier = "==2.8.2" },
    { name = "ftfy", specifier = "==6.2.3" },
    { name = "gcp-storage-emulator", specifier = ">=2024.8.3" },
@@ -5321,4 +5303,4 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f2/61/ac78a1263bc83a5cf29e7458b77a568eda5a8f81980691bbc6eb6a0d45cc/zstandard-0.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a8fffdbd9d1408006baaf02f1068d7dd1f016c6bcb7538682622c556e7b68e35", size = 5191313, upload-time = "2024-07-15T00:16:09.758Z" },
     { url = "https://files.pythonhosted.org/packages/e7/54/967c478314e16af5baf849b6ee9d6ea724ae5b100eb506011f045d3d4e16/zstandard-0.23.0-cp312-cp312-win32.whl", hash = "sha256:dc1d33abb8a0d754ea4763bad944fd965d3d95b5baef6b121c0c9013eaf1907d", size = 430877, upload-time = "2024-07-15T00:16:11.758Z" },
     { url = "https://files.pythonhosted.org/packages/75/37/872d74bd7739639c4553bf94c84af7d54d8211b626b352bc57f0fd8d1e3f/zstandard-0.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:64585e1dba664dc67c7cdabd56c1e5685233fbb1fc1966cfba2a340ec0dfff7b", size = 495595, upload-time = "2024-07-15T00:16:13.731Z" },
-]
\ No newline at end of file
+]
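
A minimal usage sketch of how the new helpers compose end to end, using only what the diff above introduces; the FIRECRAWL_API_KEY environment variable, base URL, query, and count are illustrative placeholders, not part of the change.

import os

from open_webui.retrieval.web.firecrawl import scrape_firecrawl_url, search_firecrawl

api_key = os.environ.get('FIRECRAWL_API_KEY', '')

# Search hits come back as SearchResult objects built from the v2 /search payload.
results = search_firecrawl('https://api.firecrawl.dev', api_key, 'open webui', 3)

# Each hit can then be fetched as markdown; scrape_firecrawl_url returns None
# when Firecrawl produces no usable markdown for the page.
for result in results:
    doc = scrape_firecrawl_url('https://api.firecrawl.dev', api_key, result.link, timeout=30)
    if doc is not None:
        print(doc.metadata['source'], len(doc.page_content))

Transient 429/5xx responses and connection errors are retried inside request_firecrawl_json, so callers do not need their own retry loop.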