mirror of
https://github.com/open-webui/open-webui.git
synced 2026-04-25 17:15:16 +02:00
refactor(firecrawl): use v2 API directly (#23934)
Co-authored-by: Tim Baek <tim@openwebui.com>
This commit is contained in:
@@ -1,52 +1,229 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import requests
|
||||
from open_webui.retrieval.web.main import SearchResult, get_filtered_results
|
||||
from langchain_core.documents import Document
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from open_webui.retrieval.web.main import SearchResult
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_FIRECRAWL_API_BASE_URL = 'https://api.firecrawl.dev'
|
||||
FIRECRAWL_RETRY_STATUS_CODES = {429, 500, 502, 503, 504}
|
||||
FIRECRAWL_MAX_RETRIES = 2
|
||||
|
||||
|
||||
def build_firecrawl_url(base_url: str | None, path: str) -> str:
|
||||
base_url = (base_url or DEFAULT_FIRECRAWL_API_BASE_URL).rstrip('/')
|
||||
path = path.lstrip('/')
|
||||
|
||||
if base_url.endswith('/v2'):
|
||||
return f'{base_url}/{path}'
|
||||
|
||||
return f'{base_url}/v2/{path}'
|
||||
|
||||
|
||||
def build_firecrawl_headers(api_key: str | None) -> dict[str, str]:
|
||||
return {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {api_key or ""}',
|
||||
}
|
||||
|
||||
|
||||
def get_firecrawl_timeout_seconds(timeout: Any) -> float | None:
|
||||
if timeout in (None, ''):
|
||||
return None
|
||||
|
||||
try:
|
||||
timeout = float(timeout)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
return timeout if timeout > 0 else None
|
||||
|
||||
|
||||
def get_firecrawl_scrape_timeout_ms(timeout: Any) -> int | None:
|
||||
timeout_seconds = get_firecrawl_timeout_seconds(timeout)
|
||||
if timeout_seconds is None:
|
||||
return None
|
||||
|
||||
# Firecrawl v2 expects scrape timeouts in milliseconds.
|
||||
return min(300000, max(1000, int(timeout_seconds * 1000)))
|
||||
|
||||
|
||||
def get_firecrawl_client_timeout_seconds(timeout: Any, fallback: float = 60) -> float:
|
||||
# Keep the local HTTP timeout slightly above Firecrawl's scrape timeout.
|
||||
return (get_firecrawl_timeout_seconds(timeout) or fallback) + 10
|
||||
|
||||
|
||||
def get_firecrawl_retry_delay(headers: Any, attempt: int) -> float:
    """Pick a retry delay in seconds for the given zero-based *attempt*.

    Honors a numeric ``Retry-After`` response header (clamped to 0-10s);
    otherwise uses capped exponential backoff (1, 2, 4, ... up to 8s).
    """
    retry_after = headers.get('Retry-After') if headers else None
    if retry_after:
        try:
            hinted = float(retry_after)
        except (TypeError, ValueError):
            pass
        else:
            return min(10.0, max(0.0, hinted))

    return min(8.0, float(2**attempt))
|
||||
|
||||
|
||||
def request_firecrawl_json(
    method: str,
    url: str,
    *,
    headers: dict[str, str],
    json: dict[str, Any] | None = None,
    timeout: float | None = None,
    verify: bool = True,
) -> dict[str, Any]:
    """Issue an HTTP request to Firecrawl and return the decoded JSON body.

    Retries up to ``FIRECRAWL_MAX_RETRIES`` additional times on transient
    failures: connection errors, timeouts, and the status codes listed in
    ``FIRECRAWL_RETRY_STATUS_CODES``. Raises the last transport error when
    retries are exhausted, or ``requests.HTTPError`` for a non-retryable
    (or final retryable) bad status.
    """
    last_error = None

    for attempt in range(FIRECRAWL_MAX_RETRIES + 1):
        try:
            response = requests.request(
                method,
                url,
                headers=headers,
                json=json,
                timeout=timeout,
                verify=verify,
            )
        except (requests.ConnectionError, requests.Timeout) as e:
            last_error = e
            if attempt >= FIRECRAWL_MAX_RETRIES:
                break

            delay = get_firecrawl_retry_delay(None, attempt)
            log.warning('Firecrawl %s %s failed; retrying in %.1fs: %s', method, url, delay, e)
            time.sleep(delay)
            continue

        retryable = response.status_code in FIRECRAWL_RETRY_STATUS_CODES
        if retryable and attempt < FIRECRAWL_MAX_RETRIES:
            # Respect Retry-After when the server sent one.
            delay = get_firecrawl_retry_delay(response.headers, attempt)
            log.warning(
                'Firecrawl %s %s returned HTTP %s; retrying in %.1fs',
                method,
                url,
                response.status_code,
                delay,
            )
            time.sleep(delay)
            continue

        response.raise_for_status()
        return response.json()

    if last_error:
        raise last_error

    # Unreachable in practice (every loop exit returns, raises, or records
    # last_error); kept as a defensive guard.
    raise RuntimeError(f'Firecrawl {method} {url} failed without a response')
|
||||
|
||||
|
||||
def get_firecrawl_result_url(result: dict[str, Any]) -> str:
    """Extract the canonical URL from a Firecrawl result payload.

    Firecrawl places the URL in different fields depending on the endpoint;
    the first non-empty candidate wins, otherwise ``''``.
    """
    metadata = result.get('metadata') or {}
    candidates = (
        result.get('url'),
        result.get('link'),
        metadata.get('url'),
        metadata.get('sourceURL'),
        metadata.get('source_url'),
    )
    for candidate in candidates:
        if candidate:
            return candidate
    return ''
|
||||
|
||||
|
||||
def scrape_firecrawl_url(
    firecrawl_url: str,
    firecrawl_api_key: str,
    url: str,
    *,
    verify_ssl: bool = True,
    timeout: Any = None,
    params: dict[str, Any] | None = None,
) -> Document | None:
    """Scrape *url* through the Firecrawl v2 ``/scrape`` endpoint.

    Returns a ``Document`` holding the markdown content (with source, and
    title/description metadata when available), or ``None`` when Firecrawl
    returned no usable markdown. Caller-supplied *params* override the
    default payload fields.
    """
    payload: dict[str, Any] = {
        'url': url,
        'formats': ['markdown'],
        'skipTlsVerification': not verify_ssl,
        'removeBase64Images': True,
    }
    payload.update(params or {})

    scrape_timeout_ms = get_firecrawl_scrape_timeout_ms(timeout)
    if scrape_timeout_ms is not None:
        payload['timeout'] = scrape_timeout_ms

    response = request_firecrawl_json(
        'POST',
        build_firecrawl_url(firecrawl_url, 'scrape'),
        headers=build_firecrawl_headers(firecrawl_api_key),
        json=payload,
        timeout=get_firecrawl_client_timeout_seconds(timeout),
        verify=verify_ssl,
    )

    data = response.get('data') or {}
    content = data.get('markdown') or ''
    if not isinstance(content, str) or not content.strip():
        # Nothing worth indexing came back.
        return None

    metadata = data.get('metadata') or {}
    document_metadata: dict[str, Any] = {'source': get_firecrawl_result_url(data) or url}
    for key in ('title', 'description'):
        if metadata.get(key):
            document_metadata[key] = metadata[key]

    return Document(page_content=content, metadata=document_metadata)
|
||||
|
||||
|
||||
def search_firecrawl(
    firecrawl_url: str,
    firecrawl_api_key: str,
    query: str,
    count: int,
    filter_list: list[str] | None = None,
) -> list[SearchResult]:
    """Run a web search through the Firecrawl v2 ``/search`` endpoint.

    Requests up to *count* results for *query*, optionally filters them by
    *filter_list* domains, and returns ``SearchResult`` items. Any failure
    is logged and an empty list is returned.
    """
    # NOTE: the merged diff had left both the old v1 requests.post path and
    # the new v2 path in this body; only the v2 implementation is kept.
    try:
        response = request_firecrawl_json(
            'POST',
            build_firecrawl_url(firecrawl_url, 'search'),
            headers=build_firecrawl_headers(firecrawl_api_key),
            json={
                'query': query,
                'limit': count,
                # Firecrawl search timeout scales with the result count (ms).
                'timeout': count * 3000,
                'ignoreInvalidURLs': True,
            },
            # Local HTTP timeout padded past the remote search timeout.
            timeout=count * 3 + 10,
        )
        data = response.get('data') or {}
        results = data.get('web') or []

        if filter_list:
            # Imported locally, mirroring the module's existing pattern
            # (presumably to avoid a circular import — TODO confirm).
            from open_webui.retrieval.web.main import get_filtered_results

            results = get_filtered_results(results, filter_list)

        from open_webui.retrieval.web.main import SearchResult

        search_results = []
        for result in results[:count]:
            url = get_firecrawl_result_url(result)
            if not url:
                # Skip entries without a usable link.
                continue

            metadata = result.get('metadata') or {}
            search_results.append(
                SearchResult(
                    link=url,
                    title=result.get('title') or metadata.get('title'),
                    snippet=result.get('description') or result.get('snippet') or metadata.get('description'),
                )
            )

        log.info(f'FireCrawl search results: {search_results}')
        return search_results
    except Exception as e:
        log.error(f'Error in FireCrawl search: {e}')
        return []
|
||||
|
||||
@@ -30,6 +30,7 @@ from langchain_core.documents import Document
|
||||
|
||||
from open_webui.retrieval.loaders.tavily import TavilyLoader
|
||||
from open_webui.retrieval.loaders.external_web import ExternalWebLoader
|
||||
from open_webui.retrieval.web.firecrawl import scrape_firecrawl_url
|
||||
from open_webui.constants import ERROR_MESSAGES
|
||||
from open_webui.config import (
|
||||
ENABLE_RAG_LOCAL_WEB_FETCH,
|
||||
@@ -218,39 +219,20 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
|
||||
|
||||
def lazy_load(self) -> Iterator[Document]:
    """Lazily scrape each configured URL via Firecrawl and yield Documents.

    Delegates per-URL scraping to ``scrape_firecrawl_url``; URLs that
    produce no usable content are skipped. On error, either logs a warning
    and stops, or re-raises, depending on ``self.continue_on_failure``.
    """
    # NOTE: the merged diff had left the removed v1 requests.post payload
    # code and a duplicated log line in this body; only the new
    # scrape_firecrawl_url-based implementation is kept.
    try:
        for url in self.web_paths:
            doc = scrape_firecrawl_url(
                self.api_url,
                self.api_key,
                url,
                verify_ssl=self.verify_ssl,
                timeout=self.timeout,
                params=self.params,
            )
            if doc is not None:
                yield doc
    except Exception as e:
        if self.continue_on_failure:
            log.warning(f'Error extracting content from URLs with Firecrawl: {e}')
        else:
            raise e
|
||||
|
||||
@@ -261,7 +243,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
|
||||
yield doc
|
||||
except Exception as e:
|
||||
if self.continue_on_failure:
|
||||
log.exception(f'Error extracting content from URLs: {e}')
|
||||
log.warning(f'Error extracting content from URLs with Firecrawl: {e}')
|
||||
else:
|
||||
raise e
|
||||
|
||||
|
||||
@@ -145,9 +145,6 @@ pytest-docker~=3.2.5
|
||||
## LDAP
|
||||
ldap3==2.9.1
|
||||
|
||||
## Firecrawl
|
||||
firecrawl-py==4.18.0
|
||||
|
||||
## Trace
|
||||
opentelemetry-api==1.40.0
|
||||
opentelemetry-sdk==1.40.0
|
||||
|
||||
@@ -167,7 +167,6 @@ all = [
|
||||
"oracledb==3.4.2",
|
||||
"colbert-ai==0.2.22",
|
||||
|
||||
"firecrawl-py==4.18.0",
|
||||
"azure-search-documents==11.6.0",
|
||||
"unstructured==0.18.31",
|
||||
]
|
||||
|
||||
20
uv.lock
generated
20
uv.lock
generated
@@ -1133,22 +1133,6 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "firecrawl-py"
|
||||
version = "1.12.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "nest-asyncio" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "python-dotenv" },
|
||||
{ name = "requests" },
|
||||
{ name = "websockets" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/18/db/e4f8ef9f0475b91b7c16a15e02fe19069d443cc5516cdefa2f9a0924a9a3/firecrawl_py-1.12.0.tar.gz", hash = "sha256:bbf883f6c774f05a5426121b85978a5f7b5ab11e614aff609f0673b097c3e553", size = 19655, upload-time = "2025-02-13T15:40:15.745Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/cc/d8/301d829099082c606ed16ed2a9acd263c47a365d471b9636435bf5d858b3/firecrawl_py-1.12.0-py3-none-any.whl", hash = "sha256:2b9c549315027da32421aca2a7ca597cb05cdbb968cfe0a89f389c7bb20afa4a", size = 31854, upload-time = "2025-02-13T15:40:14.492Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flask"
|
||||
version = "3.1.0"
|
||||
@@ -2692,7 +2676,6 @@ dependencies = [
|
||||
{ name = "fake-useragent" },
|
||||
{ name = "fastapi" },
|
||||
{ name = "faster-whisper" },
|
||||
{ name = "firecrawl-py" },
|
||||
{ name = "fpdf2" },
|
||||
{ name = "ftfy" },
|
||||
{ name = "gcp-storage-emulator" },
|
||||
@@ -2803,7 +2786,6 @@ requires-dist = [
|
||||
{ name = "fake-useragent", specifier = "==2.1.0" },
|
||||
{ name = "fastapi", specifier = "==0.115.7" },
|
||||
{ name = "faster-whisper", specifier = "==1.1.1" },
|
||||
{ name = "firecrawl-py", specifier = "==1.12.0" },
|
||||
{ name = "fpdf2", specifier = "==2.8.2" },
|
||||
{ name = "ftfy", specifier = "==6.2.3" },
|
||||
{ name = "gcp-storage-emulator", specifier = ">=2024.8.3" },
|
||||
@@ -5321,4 +5303,4 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f2/61/ac78a1263bc83a5cf29e7458b77a568eda5a8f81980691bbc6eb6a0d45cc/zstandard-0.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a8fffdbd9d1408006baaf02f1068d7dd1f016c6bcb7538682622c556e7b68e35", size = 5191313, upload-time = "2024-07-15T00:16:09.758Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e7/54/967c478314e16af5baf849b6ee9d6ea724ae5b100eb506011f045d3d4e16/zstandard-0.23.0-cp312-cp312-win32.whl", hash = "sha256:dc1d33abb8a0d754ea4763bad944fd965d3d95b5baef6b121c0c9013eaf1907d", size = 430877, upload-time = "2024-07-15T00:16:11.758Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/75/37/872d74bd7739639c4553bf94c84af7d54d8211b626b352bc57f0fd8d1e3f/zstandard-0.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:64585e1dba664dc67c7cdabd56c1e5685233fbb1fc1966cfba2a340ec0dfff7b", size = 495595, upload-time = "2024-07-15T00:16:13.731Z" },
|
||||
]
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user