mirror of
https://github.com/open-webui/open-webui.git
synced 2026-04-25 17:15:16 +02:00
feat: add PaddleOCR-vl loader support and implement retrieval router infrastructure (#23945)
Co-authored-by: Tim Baek <tim@openwebui.com> Co-authored-by: joaoback <156559121+joaoback@users.noreply.github.com>
This commit is contained in:
@@ -47,7 +47,7 @@ For more information, be sure to check out our [Open WebUI Documentation](https:
|
||||
|
||||
- 💾 **Persistent Artifact Storage**: Built-in key-value storage API for artifacts, enabling features like journals, trackers, leaderboards, and collaborative tools with both personal and shared data scopes across sessions.
|
||||
|
||||
- 📚 **Local RAG Integration**: Dive into the future of chat interactions with groundbreaking Retrieval Augmented Generation (RAG) support using your choice of 9 vector databases and multiple content extraction engines (Tika, Docling, Document Intelligence, Mistral OCR, External loaders). Load documents directly into chat or add files to your document library, effortlessly accessing them using the `#` command before a query.
|
||||
- 📚 **Local RAG Integration**: Dive into the future of chat interactions with groundbreaking Retrieval Augmented Generation (RAG) support using your choice of 9 vector databases and multiple content extraction engines (Tika, Docling, Document Intelligence, Mistral OCR, PaddleOCR-vl, External loaders). Load documents directly into chat or add files to your document library, effortlessly accessing them using the `#` command before a query.
|
||||
|
||||
- 🔍 **Web Search for RAG**: Perform web searches using 15+ providers including `SearXNG`, `Google PSE`, `Brave Search`, `Kagi`, `Mojeek`, `Tavily`, `Perplexity`, `serpstack`, `serper`, `Serply`, `DuckDuckGo`, `SearchApi`, `SerpApi`, `Bing`, `Jina`, `Exa`, `Sougou`, `Azure AI Search`, and `Ollama Cloud`, injecting results directly into your chat experience.
|
||||
|
||||
|
||||
@@ -2827,6 +2827,18 @@ MISTRAL_OCR_API_KEY = PersistentConfig(
|
||||
os.getenv('MISTRAL_OCR_API_KEY', ''),
|
||||
)
|
||||
|
||||
PADDLEOCR_VL_BASE_URL = PersistentConfig(
|
||||
'PADDLEOCR_VL_BASE_URL',
|
||||
'rag.paddleocr_vl_base_url',
|
||||
os.getenv('PADDLEOCR_VL_BASE_URL', 'http://localhost:8080'),
|
||||
)
|
||||
|
||||
PADDLEOCR_VL_TOKEN = PersistentConfig(
|
||||
'PADDLEOCR_VL_TOKEN',
|
||||
'rag.paddleocr_vl_token',
|
||||
os.getenv('PADDLEOCR_VL_TOKEN', ''),
|
||||
)
|
||||
|
||||
BYPASS_EMBEDDING_AND_RETRIEVAL = PersistentConfig(
|
||||
'BYPASS_EMBEDDING_AND_RETRIEVAL',
|
||||
'rag.bypass_embedding_and_retrieval',
|
||||
|
||||
@@ -303,6 +303,8 @@ from open_webui.config import (
|
||||
DOCUMENT_INTELLIGENCE_MODEL,
|
||||
MISTRAL_OCR_API_BASE_URL,
|
||||
MISTRAL_OCR_API_KEY,
|
||||
PADDLEOCR_VL_BASE_URL,
|
||||
PADDLEOCR_VL_TOKEN,
|
||||
RAG_TEXT_SPLITTER,
|
||||
ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER,
|
||||
TIKTOKEN_ENCODING_NAME,
|
||||
@@ -1023,6 +1025,8 @@ app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
|
||||
app.state.config.DOCUMENT_INTELLIGENCE_MODEL = DOCUMENT_INTELLIGENCE_MODEL
|
||||
app.state.config.MISTRAL_OCR_API_BASE_URL = MISTRAL_OCR_API_BASE_URL
|
||||
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY
|
||||
app.state.config.PADDLEOCR_VL_BASE_URL = PADDLEOCR_VL_BASE_URL
|
||||
app.state.config.PADDLEOCR_VL_TOKEN = PADDLEOCR_VL_TOKEN
|
||||
app.state.config.MINERU_API_MODE = MINERU_API_MODE
|
||||
app.state.config.MINERU_API_URL = MINERU_API_URL
|
||||
app.state.config.MINERU_API_KEY = MINERU_API_KEY
|
||||
|
||||
@@ -23,7 +23,7 @@ from open_webui.retrieval.loaders.external_document import ExternalDocumentLoade
|
||||
from open_webui.retrieval.loaders.mistral import MistralLoader
|
||||
from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader
|
||||
from open_webui.retrieval.loaders.mineru import MinerULoader
|
||||
|
||||
from open_webui.retrieval.loaders.paddleocr_vl import PaddleOCRVLLoader
|
||||
|
||||
from open_webui.env import GLOBAL_LOG_LEVEL, REQUESTS_VERIFY
|
||||
|
||||
@@ -399,6 +399,15 @@ class Loader:
|
||||
api_key=self.kwargs.get('MISTRAL_OCR_API_KEY'),
|
||||
file_path=file_path,
|
||||
)
|
||||
elif (
|
||||
self.engine == 'paddleocr_vl'
|
||||
and self.kwargs.get('PADDLEOCR_VL_TOKEN') != ''
|
||||
):
|
||||
loader = PaddleOCRVLLoader(
|
||||
api_url=self.kwargs.get('PADDLEOCR_VL_BASE_URL'),
|
||||
token=self.kwargs.get('PADDLEOCR_VL_TOKEN'),
|
||||
file_path=file_path,
|
||||
)
|
||||
else:
|
||||
if file_ext == 'pdf':
|
||||
loader = PyPDFLoader(
|
||||
|
||||
127
backend/open_webui/retrieval/loaders/paddleocr_vl.py
Normal file
127
backend/open_webui/retrieval/loaders/paddleocr_vl.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import base64
|
||||
import os
|
||||
import requests
|
||||
import logging
|
||||
import sys
|
||||
from typing import List
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from open_webui.env import GLOBAL_LOG_LEVEL
|
||||
|
||||
logging.basicConfig(stream=sys.stdout, level=GLOBAL_LOG_LEVEL)
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
class PaddleOCRVLLoader:
    """Loader that uses PaddleOCR-vl API to extract text from PDF/images.

    The file is read from disk, base64-encoded and posted as JSON to the
    ``/layout-parsing`` endpoint below ``api_url``. Every entry of the
    response's ``result.layoutParsingResults`` list that carries non-empty
    markdown text becomes one ``Document``.
    """

    # Extensions submitted with ``fileType=1``; every other file is submitted
    # with ``fileType=0`` (presumably "PDF" on the server side - confirm
    # against the PaddleOCR serving API).
    _IMAGE_EXTENSIONS = frozenset(
        {'png', 'jpg', 'jpeg', 'bmp', 'tif', 'tiff', 'webp'}
    )

    def __init__(
        self,
        api_url: str,
        token: str,
        file_path: str,
        timeout: float = 300.0,
    ):
        """Validate the configuration and remember the target file.

        Args:
            api_url: Base URL of the PaddleOCR-vl service; trailing slashes
                are stripped.
            token: Token sent in the ``Authorization`` header.
            file_path: Path of the file to process; must exist.
            timeout: Seconds to wait for the OCR service before giving up.
                Pass ``None`` to wait indefinitely (the previous behaviour).

        Raises:
            ValueError: If ``api_url`` or ``token`` is empty or ``None``.
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not api_url or not token:
            raise ValueError("PaddleOCR-vl API URL and Token are required.")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found at {file_path}")

        self.api_url = api_url.rstrip('/')
        self.token = token
        self.file_path = file_path
        self.file_name = os.path.basename(file_path)
        self.timeout = timeout

    def load(self) -> List[Document]:
        """Send the file to PaddleOCR-vl and return one ``Document`` per page.

        Returns:
            A list of documents, one per result entry with non-empty text.
            If the service returns no usable text, or the request/response
            handling fails, a single placeholder document is returned whose
            metadata carries an ``error`` key (``no_valid_pages`` or
            ``processing_failed``).

        Raises:
            Exception: Whatever ``open``/``read`` raised if the local file
                cannot be read; that failure is logged and re-raised.
        """
        log.info(f"Processing with PaddleOCR-vl: {self.file_path}")

        try:
            with open(self.file_path, "rb") as file:
                file_data = base64.b64encode(file.read()).decode("ascii")
        except Exception as e:
            log.error(f"Failed to read file {self.file_path}: {e}")
            raise

        headers = {
            "Authorization": f"token {self.token}",
            "Content-Type": "application/json",
        }

        # Detect fileType based on file extension. splitext only looks at the
        # final path component, so dots in directory names are not mistaken
        # for an extension.
        ext = os.path.splitext(self.file_path)[1].lower().lstrip('.')
        file_type = 1 if ext in self._IMAGE_EXTENSIONS else 0

        payload = {
            "file": file_data,
            "fileType": file_type,
            "useDocOrientationClassify": False,
            "useDocUnwarping": False,
            "useChartRecognition": False,
        }

        try:
            # Without a timeout an unresponsive server would block the
            # calling worker forever.
            response = requests.post(
                f"{self.api_url}/layout-parsing",
                json=payload,
                headers=headers,
                timeout=self.timeout,
            )
            response.raise_for_status()

            # ``or`` fallbacks tolerate keys that are present but null.
            result = response.json().get("result") or {}
            layout_results = result.get("layoutParsingResults") or []

            documents = []
            total_pages = len(layout_results)
            skipped_pages = 0

            for i, res in enumerate(layout_results):
                markdown_text = ((res or {}).get("markdown") or {}).get("text")

                # A missing/null text must count as empty, not become the
                # literal string "None".
                if markdown_text is None:
                    cleaned_content = ""
                else:
                    cleaned_content = str(markdown_text).strip()

                if not cleaned_content:
                    skipped_pages += 1
                    continue

                documents.append(
                    Document(
                        page_content=cleaned_content,
                        metadata={
                            "page": i,
                            "page_label": i + 1,
                            "total_pages": total_pages,
                            "file_name": self.file_name,
                            "processing_engine": "paddleocr-vl",
                        },
                    )
                )

            if skipped_pages > 0:
                log.info(f"PaddleOCR-vl: Processed {len(documents)} pages, skipped {skipped_pages} empty pages.")

            if not documents:
                log.warning("No valid text content found by PaddleOCR-vl.")
                return [
                    Document(
                        page_content="No valid text content found in document",
                        metadata={
                            "error": "no_valid_pages",
                            "file_name": self.file_name,
                            "processing_engine": "paddleocr-vl",
                        },
                    )
                ]

            return documents

        except Exception as e:
            # Deliberate best-effort: report the failure as a placeholder
            # document instead of propagating it to the caller.
            log.exception(f"Error calling PaddleOCR-vl: {e}")
            return [
                Document(
                    page_content=f"Error during OCR processing: {e}",
                    metadata={
                        "error": "processing_failed",
                        "file_name": self.file_name,
                        "processing_engine": "paddleocr-vl",
                    },
                )
            ]
|
||||
@@ -114,6 +114,8 @@ def build_loader_from_config(request):
|
||||
DOCUMENT_INTELLIGENCE_MODEL=config.DOCUMENT_INTELLIGENCE_MODEL,
|
||||
MISTRAL_OCR_API_BASE_URL=config.MISTRAL_OCR_API_BASE_URL,
|
||||
MISTRAL_OCR_API_KEY=config.MISTRAL_OCR_API_KEY,
|
||||
PADDLEOCR_VL_BASE_URL=config.PADDLEOCR_VL_BASE_URL,
|
||||
PADDLEOCR_VL_TOKEN=config.PADDLEOCR_VL_TOKEN,
|
||||
MINERU_API_MODE=config.MINERU_API_MODE,
|
||||
MINERU_API_URL=config.MINERU_API_URL,
|
||||
MINERU_API_KEY=config.MINERU_API_KEY,
|
||||
|
||||
@@ -480,6 +480,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
||||
'DOCUMENT_INTELLIGENCE_MODEL': request.app.state.config.DOCUMENT_INTELLIGENCE_MODEL,
|
||||
'MISTRAL_OCR_API_BASE_URL': request.app.state.config.MISTRAL_OCR_API_BASE_URL,
|
||||
'MISTRAL_OCR_API_KEY': request.app.state.config.MISTRAL_OCR_API_KEY,
|
||||
'PADDLEOCR_VL_BASE_URL': request.app.state.config.PADDLEOCR_VL_BASE_URL,
|
||||
'PADDLEOCR_VL_TOKEN': request.app.state.config.PADDLEOCR_VL_TOKEN,
|
||||
# MinerU settings
|
||||
'MINERU_API_MODE': request.app.state.config.MINERU_API_MODE,
|
||||
'MINERU_API_URL': request.app.state.config.MINERU_API_URL,
|
||||
@@ -686,6 +688,8 @@ class ConfigForm(BaseModel):
|
||||
DOCUMENT_INTELLIGENCE_MODEL: Optional[str] = None
|
||||
MISTRAL_OCR_API_BASE_URL: Optional[str] = None
|
||||
MISTRAL_OCR_API_KEY: Optional[str] = None
|
||||
PADDLEOCR_VL_BASE_URL: Optional[str] = None
|
||||
PADDLEOCR_VL_TOKEN: Optional[str] = None
|
||||
|
||||
# MinerU settings
|
||||
MINERU_API_MODE: Optional[str] = None
|
||||
@@ -887,6 +891,16 @@ async def update_rag_config(request: Request, form_data: ConfigForm, user=Depend
|
||||
if form_data.MISTRAL_OCR_API_KEY is not None
|
||||
else request.app.state.config.MISTRAL_OCR_API_KEY
|
||||
)
|
||||
request.app.state.config.PADDLEOCR_VL_BASE_URL = (
|
||||
form_data.PADDLEOCR_VL_BASE_URL
|
||||
if form_data.PADDLEOCR_VL_BASE_URL is not None
|
||||
else request.app.state.config.PADDLEOCR_VL_BASE_URL
|
||||
)
|
||||
request.app.state.config.PADDLEOCR_VL_TOKEN = (
|
||||
form_data.PADDLEOCR_VL_TOKEN
|
||||
if form_data.PADDLEOCR_VL_TOKEN is not None
|
||||
else request.app.state.config.PADDLEOCR_VL_TOKEN
|
||||
)
|
||||
|
||||
# MinerU settings
|
||||
request.app.state.config.MINERU_API_MODE = (
|
||||
@@ -1152,6 +1166,8 @@ async def update_rag_config(request: Request, form_data: ConfigForm, user=Depend
|
||||
'DOCUMENT_INTELLIGENCE_MODEL': request.app.state.config.DOCUMENT_INTELLIGENCE_MODEL,
|
||||
'MISTRAL_OCR_API_BASE_URL': request.app.state.config.MISTRAL_OCR_API_BASE_URL,
|
||||
'MISTRAL_OCR_API_KEY': request.app.state.config.MISTRAL_OCR_API_KEY,
|
||||
'PADDLEOCR_VL_BASE_URL': request.app.state.config.PADDLEOCR_VL_BASE_URL,
|
||||
'PADDLEOCR_VL_TOKEN': request.app.state.config.PADDLEOCR_VL_TOKEN,
|
||||
# MinerU settings
|
||||
'MINERU_API_MODE': request.app.state.config.MINERU_API_MODE,
|
||||
'MINERU_API_URL': request.app.state.config.MINERU_API_URL,
|
||||
|
||||
@@ -184,6 +184,13 @@
|
||||
toast.error($i18n.t('Mistral OCR API Key required.'));
|
||||
return;
|
||||
}
|
||||
if (
|
||||
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'paddleocr_vl' &&
|
||||
RAGConfig.PADDLEOCR_VL_BASE_URL === ''
|
||||
) {
|
||||
toast.error($i18n.t('PaddleOCR-vl API URL required.'));
|
||||
return;
|
||||
}
|
||||
|
||||
if (
|
||||
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru' &&
|
||||
@@ -356,6 +363,7 @@
|
||||
<option value="datalab_marker">{$i18n.t('Datalab Marker API')}</option>
|
||||
<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
|
||||
<option value="mistral_ocr">{$i18n.t('Mistral OCR')}</option>
|
||||
<option value="paddleocr_vl">{$i18n.t('PaddleOCR-vl')}</option>
|
||||
<option value="mineru">{$i18n.t('MinerU')}</option>
|
||||
</select>
|
||||
</div>
|
||||
@@ -657,6 +665,19 @@
|
||||
bind:value={RAGConfig.MISTRAL_OCR_API_KEY}
|
||||
/>
|
||||
</div>
|
||||
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'paddleocr_vl'}
|
||||
<div class="my-0.5 flex gap-2 pr-2">
|
||||
<input
|
||||
class="flex-1 w-full text-sm bg-transparent outline-hidden"
|
||||
placeholder={$i18n.t('Enter PaddleOCR-vl API Base URL')}
|
||||
bind:value={RAGConfig.PADDLEOCR_VL_BASE_URL}
|
||||
/>
|
||||
<SensitiveInput
|
||||
placeholder={$i18n.t('Enter PaddleOCR-vl API Token')}
|
||||
bind:value={RAGConfig.PADDLEOCR_VL_TOKEN}
|
||||
required={false}
|
||||
/>
|
||||
</div>
|
||||
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru'}
|
||||
<!-- API Mode Selection -->
|
||||
<div class="flex w-full mt-2">
|
||||
|
||||
@@ -775,6 +775,8 @@
|
||||
"Enter prompt here.": "",
|
||||
"Enter proxy URL (e.g. https://user:password@host:port)": "",
|
||||
"Enter reasoning effort": "",
|
||||
"Enter PaddleOCR-vl API Token": "",
|
||||
"Enter PaddleOCR-vl API Base URL": "",
|
||||
"Enter Score": "",
|
||||
"Enter SearchApi API Key": "",
|
||||
"Enter SearchApi Engine": "",
|
||||
@@ -1518,6 +1520,7 @@
|
||||
"Output format": "",
|
||||
"Output Format": "",
|
||||
"Overview": "",
|
||||
"PaddleOCR-vl": "",
|
||||
"page": "",
|
||||
"Page": "",
|
||||
"Page mode creates one document per page. Single mode combines all pages into one document for better chunking across page boundaries.": "",
|
||||
|
||||
@@ -774,6 +774,8 @@
|
||||
"Enter prompt here.": "在此输入提示词。",
|
||||
"Enter proxy URL (e.g. https://user:password@host:port)": "输入代理地址(例如:https://用户名:密码@主机名:端口)",
|
||||
"Enter reasoning effort": "输入推理努力",
|
||||
"Enter PaddleOCR-vl API Token": "输入 PaddleOCR-vl 接口密钥",
|
||||
"Enter PaddleOCR-vl API Base URL": "输入 PaddleOCR-vl API 基础地址",
|
||||
"Enter Score": "输入评分",
|
||||
"Enter SearchApi API Key": "输入 SearchApi 接口密钥",
|
||||
"Enter SearchApi Engine": "输入 SearchApi 引擎",
|
||||
@@ -1517,6 +1519,7 @@
|
||||
"Output format": "输出格式",
|
||||
"Output Format": "输出格式",
|
||||
"Overview": "概述",
|
||||
"PaddleOCR-vl": "PaddleOCR-vl",
|
||||
"page": "页",
|
||||
"Page": "页模式",
|
||||
"Page mode creates one document per page. Single mode combines all pages into one document for better chunking across page boundaries.": "页模式将为每个页面创建一个文档;单文档模式则将所有页面合并为一个文档,以便更好地进行跨页分块。",
|
||||
|
||||
Reference in New Issue
Block a user