refactor(firecrawl): use v2 API directly (#23934)

Co-authored-by: Tim Baek <tim@openwebui.com>
This commit is contained in:
RomualdYT
2026-04-24 11:32:08 +02:00
committed by GitHub
parent b1bd3084f0
commit e0d6074cd2
5 changed files with 215 additions and 78 deletions

View File

@@ -1,52 +1,229 @@
from __future__ import annotations
import logging
from typing import Optional, List
import time
from typing import TYPE_CHECKING, Any
import requests
from open_webui.retrieval.web.main import SearchResult, get_filtered_results
from langchain_core.documents import Document
if TYPE_CHECKING:
from open_webui.retrieval.web.main import SearchResult
log = logging.getLogger(__name__)
DEFAULT_FIRECRAWL_API_BASE_URL = 'https://api.firecrawl.dev'
FIRECRAWL_RETRY_STATUS_CODES = {429, 500, 502, 503, 504}
FIRECRAWL_MAX_RETRIES = 2
def build_firecrawl_url(base_url: str | None, path: str) -> str:
    """Join *path* onto the Firecrawl API base, ensuring exactly one /v2 segment."""
    root = (base_url or DEFAULT_FIRECRAWL_API_BASE_URL).rstrip('/')
    suffix = path.lstrip('/')
    # Only append the version segment when the caller's base doesn't carry it.
    if not root.endswith('/v2'):
        root = f'{root}/v2'
    return f'{root}/{suffix}'
def build_firecrawl_headers(api_key: str | None) -> dict[str, str]:
    """Standard JSON content-type plus bearer-auth headers for Firecrawl calls."""
    token = api_key or ""
    return {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {token}',
    }
def get_firecrawl_timeout_seconds(timeout: Any) -> float | None:
    """Coerce a raw timeout setting into positive seconds.

    Returns None for unset (None/empty string), non-numeric, zero, or
    negative values.
    """
    if timeout in (None, ''):
        return None
    try:
        seconds = float(timeout)
    except (TypeError, ValueError):
        return None
    if seconds > 0:
        return seconds
    return None
def get_firecrawl_scrape_timeout_ms(timeout: Any) -> int | None:
    """Convert a timeout setting to the millisecond value Firecrawl v2 expects.

    The result is clamped to the 1s–300s window; None when the setting is
    missing or invalid.
    """
    seconds = get_firecrawl_timeout_seconds(timeout)
    if seconds is None:
        return None
    milliseconds = int(seconds * 1000)
    return max(1000, min(300000, milliseconds))
def get_firecrawl_client_timeout_seconds(timeout: Any, fallback: float = 60) -> float:
    """Local HTTP-client timeout: configured seconds (or *fallback*) plus a
    10s margin so the client request outlives Firecrawl's own scrape timeout."""
    configured = get_firecrawl_timeout_seconds(timeout)
    if configured is None:
        configured = fallback
    return configured + 10
def get_firecrawl_retry_delay(headers: Any, attempt: int) -> float:
    """Pick a retry delay, honouring a numeric Retry-After header when present.

    A parseable Retry-After is clamped to [0, 10] seconds; otherwise fall
    back to exponential backoff (2**attempt) capped at 8 seconds.
    """
    if headers:
        retry_after = headers.get('Retry-After')
        if retry_after:
            try:
                parsed = float(retry_after)
            except (TypeError, ValueError):
                # e.g. an HTTP-date Retry-After — ignore and use backoff.
                pass
            else:
                return min(10.0, max(0.0, parsed))
    return min(8.0, float(2**attempt))
def request_firecrawl_json(
    method: str,
    url: str,
    *,
    headers: dict[str, str],
    json: dict[str, Any] | None = None,
    timeout: float | None = None,
    verify: bool = True,
) -> dict[str, Any]:
    """Issue an HTTP request to Firecrawl and return the decoded JSON body.

    Retries up to FIRECRAWL_MAX_RETRIES extra times on retryable HTTP
    status codes and on connection/timeout errors, sleeping for a delay
    derived from the Retry-After header (or exponential backoff).

    Raises:
        requests.HTTPError: for non-retryable (or exhausted) error statuses.
        requests.ConnectionError / requests.Timeout: when retries run out.
        RuntimeError: defensive fallback if the loop exits without a response.
    """
    last_error: Exception | None = None
    attempt = 0
    while attempt <= FIRECRAWL_MAX_RETRIES:
        try:
            response = requests.request(
                method,
                url,
                headers=headers,
                json=json,
                timeout=timeout,
                verify=verify,
            )
        except (requests.ConnectionError, requests.Timeout) as exc:
            last_error = exc
            if attempt >= FIRECRAWL_MAX_RETRIES:
                break
            delay = get_firecrawl_retry_delay(None, attempt)
            log.warning('Firecrawl %s %s failed; retrying in %.1fs: %s', method, url, delay, exc)
            time.sleep(delay)
            attempt += 1
            continue
        retryable = (
            response.status_code in FIRECRAWL_RETRY_STATUS_CODES
            and attempt < FIRECRAWL_MAX_RETRIES
        )
        if retryable:
            delay = get_firecrawl_retry_delay(response.headers, attempt)
            log.warning(
                'Firecrawl %s %s returned HTTP %s; retrying in %.1fs',
                method,
                url,
                response.status_code,
                delay,
            )
            time.sleep(delay)
            attempt += 1
            continue
        # Non-retryable error statuses raise here; success returns the body.
        response.raise_for_status()
        return response.json()
    if last_error:
        raise last_error
    raise RuntimeError(f'Firecrawl {method} {url} failed without a response')
def get_firecrawl_result_url(result: dict[str, Any]) -> str:
    """Extract the best available URL from a Firecrawl result payload.

    Checks top-level keys first, then metadata fallbacks; returns '' when
    no candidate is truthy.
    """
    metadata = result.get('metadata') or {}
    candidates = (
        result.get('url'),
        result.get('link'),
        metadata.get('url'),
        metadata.get('sourceURL'),
        metadata.get('source_url'),
    )
    for candidate in candidates:
        if candidate:
            return candidate
    return ''
def scrape_firecrawl_url(
    firecrawl_url: str,
    firecrawl_api_key: str,
    url: str,
    *,
    verify_ssl: bool = True,
    timeout: Any = None,
    params: dict[str, Any] | None = None,
) -> Document | None:
    """Scrape a single URL through the Firecrawl v2 /scrape endpoint.

    Returns a langchain Document with markdown content, or None when the
    response carried no usable markdown.
    """
    request_body: dict[str, Any] = {
        'url': url,
        'formats': ['markdown'],
        'skipTlsVerification': not verify_ssl,
        'removeBase64Images': True,
    }
    request_body.update(params or {})
    # Set after merging params so a computed timeout (in ms, per Firecrawl v2)
    # takes precedence over any caller-supplied 'timeout' entry.
    scrape_timeout_ms = get_firecrawl_scrape_timeout_ms(timeout)
    if scrape_timeout_ms is not None:
        request_body['timeout'] = scrape_timeout_ms
    response = request_firecrawl_json(
        'POST',
        build_firecrawl_url(firecrawl_url, 'scrape'),
        headers=build_firecrawl_headers(firecrawl_api_key),
        json=request_body,
        timeout=get_firecrawl_client_timeout_seconds(timeout),
        verify=verify_ssl,
    )
    data = response.get('data') or {}
    content = data.get('markdown') or ''
    if not isinstance(content, str) or not content.strip():
        return None
    page_metadata = data.get('metadata') or {}
    doc_metadata: dict[str, Any] = {'source': get_firecrawl_result_url(data) or url}
    for key in ('title', 'description'):
        if page_metadata.get(key):
            doc_metadata[key] = page_metadata[key]
    return Document(page_content=content, metadata=doc_metadata)
def search_firecrawl(
    firecrawl_url: str,
    firecrawl_api_key: str,
    query: str,
    count: int,
    filter_list: list[str] | None = None,
) -> list[SearchResult]:
    """Run a web search through the Firecrawl v2 /search endpoint.

    Args:
        firecrawl_url: Base Firecrawl API URL (with or without /v2).
        firecrawl_api_key: Bearer token for the Firecrawl API.
        query: Search query string.
        count: Maximum number of results to return.
        filter_list: Optional domain allow-list applied to the results.

    Returns:
        Up to *count* SearchResult entries; an empty list on any error
        (failures are logged, never raised to the caller).
    """
    # NOTE: the diff interleaving left two signatures and dead v1-API code in
    # this span; this is the coherent v2 implementation.
    try:
        response = request_firecrawl_json(
            'POST',
            build_firecrawl_url(firecrawl_url, 'search'),
            headers=build_firecrawl_headers(firecrawl_api_key),
            json={
                'query': query,
                'limit': count,
                # Firecrawl search timeout is expressed in milliseconds.
                'timeout': count * 3000,
                'ignoreInvalidURLs': True,
            },
            # Keep the local HTTP timeout above the remote search timeout.
            timeout=count * 3 + 10,
        )
        data = response.get('data') or {}
        # v2 groups results by type; web hits live under the 'web' key.
        results = data.get('web') or []
        if filter_list:
            # Imported lazily to avoid a circular import at module load time.
            from open_webui.retrieval.web.main import get_filtered_results

            results = get_filtered_results(results, filter_list)
        from open_webui.retrieval.web.main import SearchResult

        search_results = []
        for result in results[:count]:
            url = get_firecrawl_result_url(result)
            if not url:
                # Skip entries Firecrawl returned without any usable URL.
                continue
            metadata = result.get('metadata') or {}
            search_results.append(
                SearchResult(
                    link=url,
                    title=result.get('title') or metadata.get('title'),
                    snippet=result.get('description')
                    or result.get('snippet')
                    or metadata.get('description'),
                )
            )
        log.info(f'FireCrawl search results: {search_results}')
        return search_results
    except Exception as e:
        log.error(f'Error in FireCrawl search: {e}')
        return []

View File

@@ -30,6 +30,7 @@ from langchain_core.documents import Document
from open_webui.retrieval.loaders.tavily import TavilyLoader
from open_webui.retrieval.loaders.external_web import ExternalWebLoader
from open_webui.retrieval.web.firecrawl import scrape_firecrawl_url
from open_webui.constants import ERROR_MESSAGES
from open_webui.config import (
ENABLE_RAG_LOCAL_WEB_FETCH,
@@ -218,39 +219,20 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
def lazy_load(self) -> Iterator[Document]:
    """Lazily scrape each configured URL through the Firecrawl v2 helper.

    Yields one Document per URL that produced usable markdown; URLs with
    no content are silently skipped. On error, logs a warning and stops
    when `continue_on_failure` is set, otherwise re-raises.
    """
    # NOTE: the diff interleaving mixed the removed inline-requests v1 code
    # into this span; this is the coherent delegating implementation.
    try:
        for url in self.web_paths:
            doc = scrape_firecrawl_url(
                self.api_url,
                self.api_key,
                url,
                verify_ssl=self.verify_ssl,
                timeout=self.timeout,
                params=self.params,
            )
            if doc is not None:
                yield doc
    except Exception as e:
        if self.continue_on_failure:
            log.warning(f'Error extracting content from URLs with Firecrawl: {e}')
        else:
            raise e
@@ -261,7 +243,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
yield doc
except Exception as e:
if self.continue_on_failure:
log.exception(f'Error extracting content from URLs: {e}')
log.warning(f'Error extracting content from URLs with Firecrawl: {e}')
else:
raise e

View File

@@ -145,9 +145,6 @@ pytest-docker~=3.2.5
## LDAP
ldap3==2.9.1
## Firecrawl
firecrawl-py==4.18.0
## Trace
opentelemetry-api==1.40.0
opentelemetry-sdk==1.40.0

View File

@@ -167,7 +167,6 @@ all = [
"oracledb==3.4.2",
"colbert-ai==0.2.22",
"firecrawl-py==4.18.0",
"azure-search-documents==11.6.0",
"unstructured==0.18.31",
]

18
uv.lock generated
View File

@@ -1133,22 +1133,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
]
[[package]]
name = "firecrawl-py"
version = "1.12.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nest-asyncio" },
{ name = "pydantic" },
{ name = "python-dotenv" },
{ name = "requests" },
{ name = "websockets" },
]
sdist = { url = "https://files.pythonhosted.org/packages/18/db/e4f8ef9f0475b91b7c16a15e02fe19069d443cc5516cdefa2f9a0924a9a3/firecrawl_py-1.12.0.tar.gz", hash = "sha256:bbf883f6c774f05a5426121b85978a5f7b5ab11e614aff609f0673b097c3e553", size = 19655, upload-time = "2025-02-13T15:40:15.745Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/cc/d8/301d829099082c606ed16ed2a9acd263c47a365d471b9636435bf5d858b3/firecrawl_py-1.12.0-py3-none-any.whl", hash = "sha256:2b9c549315027da32421aca2a7ca597cb05cdbb968cfe0a89f389c7bb20afa4a", size = 31854, upload-time = "2025-02-13T15:40:14.492Z" },
]
[[package]]
name = "flask"
version = "3.1.0"
@@ -2692,7 +2676,6 @@ dependencies = [
{ name = "fake-useragent" },
{ name = "fastapi" },
{ name = "faster-whisper" },
{ name = "firecrawl-py" },
{ name = "fpdf2" },
{ name = "ftfy" },
{ name = "gcp-storage-emulator" },
@@ -2803,7 +2786,6 @@ requires-dist = [
{ name = "fake-useragent", specifier = "==2.1.0" },
{ name = "fastapi", specifier = "==0.115.7" },
{ name = "faster-whisper", specifier = "==1.1.1" },
{ name = "firecrawl-py", specifier = "==1.12.0" },
{ name = "fpdf2", specifier = "==2.8.2" },
{ name = "ftfy", specifier = "==6.2.3" },
{ name = "gcp-storage-emulator", specifier = ">=2024.8.3" },