feat: add PaddleOCR-vl loader support and implement retrieval router infrastructure (#23945)

Co-authored-by: Tim Baek <tim@openwebui.com>
Co-authored-by: joaoback <156559121+joaoback@users.noreply.github.com>
This commit is contained in:
goodbey857
2026-04-24 14:19:37 +08:00
committed by GitHub
parent 0e311a95a7
commit 58bc254809
10 changed files with 199 additions and 2 deletions

View File

@@ -47,7 +47,7 @@ For more information, be sure to check out our [Open WebUI Documentation](https:
- 💾 **Persistent Artifact Storage**: Built-in key-value storage API for artifacts, enabling features like journals, trackers, leaderboards, and collaborative tools with both personal and shared data scopes across sessions.
- 📚 **Local RAG Integration**: Dive into the future of chat interactions with groundbreaking Retrieval Augmented Generation (RAG) support using your choice of 9 vector databases and multiple content extraction engines (Tika, Docling, Document Intelligence, Mistral OCR, External loaders). Load documents directly into chat or add files to your document library, effortlessly accessing them using the `#` command before a query.
- 📚 **Local RAG Integration**: Dive into the future of chat interactions with groundbreaking Retrieval Augmented Generation (RAG) support using your choice of 9 vector databases and multiple content extraction engines (Tika, Docling, Document Intelligence, Mistral OCR, PaddleOCR-vl, External loaders). Load documents directly into chat or add files to your document library, effortlessly accessing them using the `#` command before a query.
- 🔍 **Web Search for RAG**: Perform web searches using 15+ providers including `SearXNG`, `Google PSE`, `Brave Search`, `Kagi`, `Mojeek`, `Tavily`, `Perplexity`, `serpstack`, `serper`, `Serply`, `DuckDuckGo`, `SearchApi`, `SerpApi`, `Bing`, `Jina`, `Exa`, `Sougou`, `Azure AI Search`, and `Ollama Cloud`, injecting results directly into your chat experience.

View File

@@ -2827,6 +2827,18 @@ MISTRAL_OCR_API_KEY = PersistentConfig(
os.getenv('MISTRAL_OCR_API_KEY', ''),
)
# Base URL of the PaddleOCR-vl service. Read from the PADDLEOCR_VL_BASE_URL
# environment variable, falling back to 'http://localhost:8080', and registered
# under the config path 'rag.paddleocr_vl_base_url'.
PADDLEOCR_VL_BASE_URL = PersistentConfig(
'PADDLEOCR_VL_BASE_URL',
'rag.paddleocr_vl_base_url',
os.getenv('PADDLEOCR_VL_BASE_URL', 'http://localhost:8080'),
)
# Access token for the PaddleOCR-vl service. Read from the PADDLEOCR_VL_TOKEN
# environment variable; defaults to an empty string when unset.
# NOTE(review): the loader dispatch compares this token against '' before
# selecting the PaddleOCR-vl engine — confirm the empty default is intended to
# mean "engine disabled".
PADDLEOCR_VL_TOKEN = PersistentConfig(
'PADDLEOCR_VL_TOKEN',
'rag.paddleocr_vl_token',
os.getenv('PADDLEOCR_VL_TOKEN', ''),
)
BYPASS_EMBEDDING_AND_RETRIEVAL = PersistentConfig(
'BYPASS_EMBEDDING_AND_RETRIEVAL',
'rag.bypass_embedding_and_retrieval',

View File

@@ -303,6 +303,8 @@ from open_webui.config import (
DOCUMENT_INTELLIGENCE_MODEL,
MISTRAL_OCR_API_BASE_URL,
MISTRAL_OCR_API_KEY,
PADDLEOCR_VL_BASE_URL,
PADDLEOCR_VL_TOKEN,
RAG_TEXT_SPLITTER,
ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER,
TIKTOKEN_ENCODING_NAME,
@@ -1023,6 +1025,8 @@ app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
app.state.config.DOCUMENT_INTELLIGENCE_MODEL = DOCUMENT_INTELLIGENCE_MODEL
app.state.config.MISTRAL_OCR_API_BASE_URL = MISTRAL_OCR_API_BASE_URL
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY
# PaddleOCR-vl settings
app.state.config.PADDLEOCR_VL_BASE_URL = PADDLEOCR_VL_BASE_URL
app.state.config.PADDLEOCR_VL_TOKEN = PADDLEOCR_VL_TOKEN
# MinerU settings
app.state.config.MINERU_API_MODE = MINERU_API_MODE
app.state.config.MINERU_API_URL = MINERU_API_URL
app.state.config.MINERU_API_KEY = MINERU_API_KEY

View File

@@ -23,7 +23,7 @@ from open_webui.retrieval.loaders.external_document import ExternalDocumentLoade
from open_webui.retrieval.loaders.mistral import MistralLoader
from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader
from open_webui.retrieval.loaders.mineru import MinerULoader
from open_webui.retrieval.loaders.paddleocr_vl import PaddleOCRVLLoader
from open_webui.env import GLOBAL_LOG_LEVEL, REQUESTS_VERIFY
@@ -399,6 +399,15 @@ class Loader:
api_key=self.kwargs.get('MISTRAL_OCR_API_KEY'),
file_path=file_path,
)
elif (
self.engine == 'paddleocr_vl'
and self.kwargs.get('PADDLEOCR_VL_TOKEN') != ''
):
loader = PaddleOCRVLLoader(
api_url=self.kwargs.get('PADDLEOCR_VL_BASE_URL'),
token=self.kwargs.get('PADDLEOCR_VL_TOKEN'),
file_path=file_path,
)
else:
if file_ext == 'pdf':
loader = PyPDFLoader(

View File

@@ -0,0 +1,127 @@
import base64
import os
import requests
import logging
import sys
from typing import List
from langchain_core.documents import Document
from open_webui.env import GLOBAL_LOG_LEVEL
# Send log output to stdout at the project-wide GLOBAL_LOG_LEVEL.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers, so this only takes effect if nothing configured logging earlier.
logging.basicConfig(stream=sys.stdout, level=GLOBAL_LOG_LEVEL)
# Module-level logger used by PaddleOCRVLLoader below.
log = logging.getLogger(__name__)
class PaddleOCRVLLoader:
    """Loader that uses the PaddleOCR-vl API to extract text from PDFs/images.

    The file is read from disk, base64-encoded and POSTed as JSON to the
    ``/layout-parsing`` endpoint below ``api_url``. Every page of the response
    that carries non-empty markdown text becomes one ``Document``.
    """

    # Extensions submitted with fileType 1 (image). Any other extension,
    # including a missing one, is submitted with fileType 0.
    IMAGE_EXTENSIONS = frozenset({'png', 'jpg', 'jpeg', 'bmp', 'tif', 'tiff', 'webp'})

    # Seconds to wait for the OCR service. Generous on purpose because layout
    # parsing of large documents is slow, but finite so a stalled server cannot
    # block the caller forever.
    DEFAULT_TIMEOUT = 300

    def __init__(
        self,
        api_url: str,
        token: str,
        file_path: str,
        timeout: float = DEFAULT_TIMEOUT,
    ):
        """Validate the arguments and remember them for :meth:`load`.

        Args:
            api_url: Base URL of the PaddleOCR-vl service; a trailing ``/`` is
                stripped.
            token: Access token sent in the ``Authorization`` header.
            file_path: Path of the file to process; must exist.
            timeout: Timeout in seconds handed to ``requests.post``.

        Raises:
            ValueError: If ``api_url`` or ``token`` is empty.
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not api_url or not token:
            raise ValueError("PaddleOCR-vl API URL and Token are required.")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found at {file_path}")

        self.api_url = api_url.rstrip('/')
        self.token = token
        self.file_path = file_path
        self.file_name = os.path.basename(file_path)
        self.timeout = timeout

    @classmethod
    def _detect_file_type(cls, file_path: str) -> int:
        """Return the API ``fileType`` for *file_path*: 1 for images, else 0."""
        # splitext only inspects the final path component, so a dot in a
        # directory name is never mistaken for the file extension.
        ext = os.path.splitext(file_path)[1].lower().lstrip('.')
        return 1 if ext in cls.IMAGE_EXTENSIONS else 0

    @staticmethod
    def _extract_page_text(page_result: dict) -> str:
        """Return the stripped markdown text of one page result, or ``""``."""
        markdown = page_result.get("markdown") or {}
        text = markdown.get("text")
        # A JSON null must count as "no text"; str(None) would otherwise be
        # stored as the literal page content "None".
        if text is None:
            return ""
        return str(text).strip()

    def _metadata(self, **extra) -> dict:
        """Build document metadata: *extra* plus the common loader fields."""
        return {
            **extra,
            "file_name": self.file_name,
            "processing_engine": "paddleocr-vl",
        }

    def load(self) -> List["Document"]:
        """Run OCR on the file and return one ``Document`` per non-empty page.

        Returns:
            The page documents in page order. If the service yields no text,
            or the request/response handling fails, a single placeholder
            ``Document`` whose metadata carries an ``error`` key
            (``no_valid_pages`` or ``processing_failed``) is returned instead.

        Raises:
            Exception: Whatever reading ``file_path`` raises is logged and
                re-raised; errors while calling the service are not raised.
        """
        log.info("Processing with PaddleOCR-vl: %s", self.file_path)

        try:
            with open(self.file_path, "rb") as file:
                file_data = base64.b64encode(file.read()).decode("ascii")
        except Exception as e:
            log.error("Failed to read file %s: %s", self.file_path, e)
            raise

        headers = {
            "Authorization": f"token {self.token}",
            "Content-Type": "application/json",
        }
        payload = {
            "file": file_data,
            "fileType": self._detect_file_type(self.file_path),
            "useDocOrientationClassify": False,
            "useDocUnwarping": False,
            "useChartRecognition": False,
        }

        try:
            response = requests.post(
                f"{self.api_url}/layout-parsing",
                json=payload,
                headers=headers,
                timeout=self.timeout,
            )
            response.raise_for_status()

            # `or` fallbacks tolerate explicit JSON nulls as well as missing keys.
            result = response.json().get("result") or {}
            layout_results = result.get("layoutParsingResults") or []
            total_pages = len(layout_results)

            documents = []
            for i, res in enumerate(layout_results):
                cleaned_content = self._extract_page_text(res)
                if not cleaned_content:
                    continue
                documents.append(
                    Document(
                        page_content=cleaned_content,
                        metadata={
                            "page": i,
                            "page_label": i + 1,
                            "total_pages": total_pages,
                            **self._metadata(),
                        },
                    )
                )

            skipped_pages = total_pages - len(documents)
            if skipped_pages > 0:
                log.info(
                    "PaddleOCR-vl: Processed %d pages, skipped %d empty pages.",
                    len(documents),
                    skipped_pages,
                )

            if not documents:
                log.warning("No valid text content found by PaddleOCR-vl.")
                return [
                    Document(
                        page_content="No valid text content found in document",
                        metadata=self._metadata(error="no_valid_pages"),
                    )
                ]
            return documents
        except Exception as e:
            # Deliberate best-effort: report the failure as a placeholder
            # document instead of propagating it to the caller.
            log.error("Error calling PaddleOCR-vl: %s", e)
            return [
                Document(
                    page_content=f"Error during OCR processing: {e}",
                    metadata=self._metadata(error="processing_failed"),
                )
            ]

View File

@@ -114,6 +114,8 @@ def build_loader_from_config(request):
DOCUMENT_INTELLIGENCE_MODEL=config.DOCUMENT_INTELLIGENCE_MODEL,
MISTRAL_OCR_API_BASE_URL=config.MISTRAL_OCR_API_BASE_URL,
MISTRAL_OCR_API_KEY=config.MISTRAL_OCR_API_KEY,
PADDLEOCR_VL_BASE_URL=config.PADDLEOCR_VL_BASE_URL,
PADDLEOCR_VL_TOKEN=config.PADDLEOCR_VL_TOKEN,
MINERU_API_MODE=config.MINERU_API_MODE,
MINERU_API_URL=config.MINERU_API_URL,
MINERU_API_KEY=config.MINERU_API_KEY,

View File

@@ -480,6 +480,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
'DOCUMENT_INTELLIGENCE_MODEL': request.app.state.config.DOCUMENT_INTELLIGENCE_MODEL,
'MISTRAL_OCR_API_BASE_URL': request.app.state.config.MISTRAL_OCR_API_BASE_URL,
'MISTRAL_OCR_API_KEY': request.app.state.config.MISTRAL_OCR_API_KEY,
'PADDLEOCR_VL_BASE_URL': request.app.state.config.PADDLEOCR_VL_BASE_URL,
'PADDLEOCR_VL_TOKEN': request.app.state.config.PADDLEOCR_VL_TOKEN,
# MinerU settings
'MINERU_API_MODE': request.app.state.config.MINERU_API_MODE,
'MINERU_API_URL': request.app.state.config.MINERU_API_URL,
@@ -686,6 +688,8 @@ class ConfigForm(BaseModel):
DOCUMENT_INTELLIGENCE_MODEL: Optional[str] = None
MISTRAL_OCR_API_BASE_URL: Optional[str] = None
MISTRAL_OCR_API_KEY: Optional[str] = None
PADDLEOCR_VL_BASE_URL: Optional[str] = None
PADDLEOCR_VL_TOKEN: Optional[str] = None
# MinerU settings
MINERU_API_MODE: Optional[str] = None
@@ -887,6 +891,16 @@ async def update_rag_config(request: Request, form_data: ConfigForm, user=Depend
if form_data.MISTRAL_OCR_API_KEY is not None
else request.app.state.config.MISTRAL_OCR_API_KEY
)
request.app.state.config.PADDLEOCR_VL_BASE_URL = (
form_data.PADDLEOCR_VL_BASE_URL
if form_data.PADDLEOCR_VL_BASE_URL is not None
else request.app.state.config.PADDLEOCR_VL_BASE_URL
)
request.app.state.config.PADDLEOCR_VL_TOKEN = (
form_data.PADDLEOCR_VL_TOKEN
if form_data.PADDLEOCR_VL_TOKEN is not None
else request.app.state.config.PADDLEOCR_VL_TOKEN
)
# MinerU settings
request.app.state.config.MINERU_API_MODE = (
@@ -1152,6 +1166,8 @@ async def update_rag_config(request: Request, form_data: ConfigForm, user=Depend
'DOCUMENT_INTELLIGENCE_MODEL': request.app.state.config.DOCUMENT_INTELLIGENCE_MODEL,
'MISTRAL_OCR_API_BASE_URL': request.app.state.config.MISTRAL_OCR_API_BASE_URL,
'MISTRAL_OCR_API_KEY': request.app.state.config.MISTRAL_OCR_API_KEY,
'PADDLEOCR_VL_BASE_URL': request.app.state.config.PADDLEOCR_VL_BASE_URL,
'PADDLEOCR_VL_TOKEN': request.app.state.config.PADDLEOCR_VL_TOKEN,
# MinerU settings
'MINERU_API_MODE': request.app.state.config.MINERU_API_MODE,
'MINERU_API_URL': request.app.state.config.MINERU_API_URL,

View File

@@ -184,6 +184,13 @@
toast.error($i18n.t('Mistral OCR API Key required.'));
return;
}
if (
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'paddleocr_vl' &&
RAGConfig.PADDLEOCR_VL_BASE_URL === ''
) {
toast.error($i18n.t('PaddleOCR-vl API URL required.'));
return;
}
if (
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru' &&
@@ -356,6 +363,7 @@
<option value="datalab_marker">{$i18n.t('Datalab Marker API')}</option>
<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
<option value="mistral_ocr">{$i18n.t('Mistral OCR')}</option>
<option value="paddleocr_vl">{$i18n.t('PaddleOCR-vl')}</option>
<option value="mineru">{$i18n.t('MinerU')}</option>
</select>
</div>
@@ -657,6 +665,19 @@
bind:value={RAGConfig.MISTRAL_OCR_API_KEY}
/>
</div>
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'paddleocr_vl'}
<div class="my-0.5 flex gap-2 pr-2">
<input
class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Enter PaddleOCR-vl API Base URL')}
bind:value={RAGConfig.PADDLEOCR_VL_BASE_URL}
/>
<SensitiveInput
placeholder={$i18n.t('Enter PaddleOCR-vl API Token')}
bind:value={RAGConfig.PADDLEOCR_VL_TOKEN}
required={false}
/>
</div>
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru'}
<!-- API Mode Selection -->
<div class="flex w-full mt-2">

View File

@@ -775,6 +775,8 @@
"Enter prompt here.": "",
"Enter proxy URL (e.g. https://user:password@host:port)": "",
"Enter reasoning effort": "",
"Enter PaddleOCR-vl API Token": "",
"Enter PaddleOCR-vl API Base URL": "",
"Enter Score": "",
"Enter SearchApi API Key": "",
"Enter SearchApi Engine": "",
@@ -1518,6 +1520,7 @@
"Output format": "",
"Output Format": "",
"Overview": "",
"PaddleOCR-vl": "",
"page": "",
"Page": "",
"Page mode creates one document per page. Single mode combines all pages into one document for better chunking across page boundaries.": "",

View File

@@ -774,6 +774,8 @@
"Enter prompt here.": "在此输入提示词。",
"Enter proxy URL (e.g. https://user:password@host:port)": "输入代理地址（例如：https://用户名:密码@主机名:端口）",
"Enter reasoning effort": "输入推理努力",
"Enter PaddleOCR-vl API Token": "输入 PaddleOCR-vl 接口密钥",
"Enter PaddleOCR-vl API Base URL": "输入 PaddleOCR-vl API 基础地址",
"Enter Score": "输入评分",
"Enter SearchApi API Key": "输入 SearchApi 接口密钥",
"Enter SearchApi Engine": "输入 SearchApi 引擎",
@@ -1517,6 +1519,7 @@
"Output format": "输出格式",
"Output Format": "输出格式",
"Overview": "概述",
"PaddleOCR-vl": "PaddleOCR-vl",
"page": "页",
"Page": "页模式",
"Page mode creates one document per page. Single mode combines all pages into one document for better chunking across page boundaries.": "页模式将为每个页面创建一个文档;单文档模式则将所有页面合并为一个文档,以便更好地进行跨页分块。",