From 58bc254809bac2432f1af6927e2cf24e09707d51 Mon Sep 17 00:00:00 2001 From: goodbey857 <76645482+goodbey857@users.noreply.github.com> Date: Fri, 24 Apr 2026 14:19:37 +0800 Subject: [PATCH] feat: add PaddleOCR-vl loader support and implement retrieval router infrastructure (#23945) Co-authored-by: Tim Baek Co-authored-by: joaoback <156559121+joaoback@users.noreply.github.com> --- README.md | 2 +- backend/open_webui/config.py | 12 ++ backend/open_webui/main.py | 4 + backend/open_webui/retrieval/loaders/main.py | 11 +- .../retrieval/loaders/paddleocr_vl.py | 127 ++++++++++++++++++ backend/open_webui/retrieval/utils.py | 2 + backend/open_webui/routers/retrieval.py | 16 +++ .../admin/Settings/Documents.svelte | 21 +++ src/lib/i18n/locales/en-US/translation.json | 3 + src/lib/i18n/locales/zh-CN/translation.json | 3 + 10 files changed, 199 insertions(+), 2 deletions(-) create mode 100644 backend/open_webui/retrieval/loaders/paddleocr_vl.py diff --git a/README.md b/README.md index 1885f4f6f1..3c4bee98c9 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ For more information, be sure to check out our [Open WebUI Documentation](https: - 💾 **Persistent Artifact Storage**: Built-in key-value storage API for artifacts, enabling features like journals, trackers, leaderboards, and collaborative tools with both personal and shared data scopes across sessions. -- 📚 **Local RAG Integration**: Dive into the future of chat interactions with groundbreaking Retrieval Augmented Generation (RAG) support using your choice of 9 vector databases and multiple content extraction engines (Tika, Docling, Document Intelligence, Mistral OCR, External loaders). Load documents directly into chat or add files to your document library, effortlessly accessing them using the `#` command before a query. 
+- 📚 **Local RAG Integration**: Dive into the future of chat interactions with groundbreaking Retrieval Augmented Generation (RAG) support using your choice of 9 vector databases and multiple content extraction engines (Tika, Docling, Document Intelligence, Mistral OCR, PaddleOCR-vl, External loaders). Load documents directly into chat or add files to your document library, effortlessly accessing them using the `#` command before a query. - 🔍 **Web Search for RAG**: Perform web searches using 15+ providers including `SearXNG`, `Google PSE`, `Brave Search`, `Kagi`, `Mojeek`, `Tavily`, `Perplexity`, `serpstack`, `serper`, `Serply`, `DuckDuckGo`, `SearchApi`, `SerpApi`, `Bing`, `Jina`, `Exa`, `Sougou`, `Azure AI Search`, and `Ollama Cloud`, injecting results directly into your chat experience. diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index d2c88cb2fb..06178d385c 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2827,6 +2827,18 @@ MISTRAL_OCR_API_KEY = PersistentConfig( os.getenv('MISTRAL_OCR_API_KEY', ''), ) +PADDLEOCR_VL_BASE_URL = PersistentConfig( + 'PADDLEOCR_VL_BASE_URL', + 'rag.paddleocr_vl_base_url', + os.getenv('PADDLEOCR_VL_BASE_URL', 'http://localhost:8080'), +) + +PADDLEOCR_VL_TOKEN = PersistentConfig( + 'PADDLEOCR_VL_TOKEN', + 'rag.paddleocr_vl_token', + os.getenv('PADDLEOCR_VL_TOKEN', ''), +) + BYPASS_EMBEDDING_AND_RETRIEVAL = PersistentConfig( 'BYPASS_EMBEDDING_AND_RETRIEVAL', 'rag.bypass_embedding_and_retrieval', diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index ba7f74c830..d6f4f4c7af 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -303,6 +303,8 @@ from open_webui.config import ( DOCUMENT_INTELLIGENCE_MODEL, MISTRAL_OCR_API_BASE_URL, MISTRAL_OCR_API_KEY, + PADDLEOCR_VL_BASE_URL, + PADDLEOCR_VL_TOKEN, RAG_TEXT_SPLITTER, ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER, TIKTOKEN_ENCODING_NAME, @@ -1023,6 +1025,8 @@ 
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY app.state.config.DOCUMENT_INTELLIGENCE_MODEL = DOCUMENT_INTELLIGENCE_MODEL app.state.config.MISTRAL_OCR_API_BASE_URL = MISTRAL_OCR_API_BASE_URL app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY +app.state.config.PADDLEOCR_VL_BASE_URL = PADDLEOCR_VL_BASE_URL +app.state.config.PADDLEOCR_VL_TOKEN = PADDLEOCR_VL_TOKEN app.state.config.MINERU_API_MODE = MINERU_API_MODE app.state.config.MINERU_API_URL = MINERU_API_URL app.state.config.MINERU_API_KEY = MINERU_API_KEY diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 7dc9df37ce..27c81f7f81 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -23,7 +23,7 @@ from open_webui.retrieval.loaders.external_document import ExternalDocumentLoade from open_webui.retrieval.loaders.mistral import MistralLoader from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader from open_webui.retrieval.loaders.mineru import MinerULoader - +from open_webui.retrieval.loaders.paddleocr_vl import PaddleOCRVLLoader from open_webui.env import GLOBAL_LOG_LEVEL, REQUESTS_VERIFY @@ -399,6 +399,15 @@ class Loader: api_key=self.kwargs.get('MISTRAL_OCR_API_KEY'), file_path=file_path, ) + elif ( + self.engine == 'paddleocr_vl' + and self.kwargs.get('PADDLEOCR_VL_TOKEN') != '' + ): + loader = PaddleOCRVLLoader( + api_url=self.kwargs.get('PADDLEOCR_VL_BASE_URL'), + token=self.kwargs.get('PADDLEOCR_VL_TOKEN'), + file_path=file_path, + ) else: if file_ext == 'pdf': loader = PyPDFLoader( diff --git a/backend/open_webui/retrieval/loaders/paddleocr_vl.py b/backend/open_webui/retrieval/loaders/paddleocr_vl.py new file mode 100644 index 0000000000..ab7632b3f8 --- /dev/null +++ b/backend/open_webui/retrieval/loaders/paddleocr_vl.py @@ -0,0 +1,127 @@ +import base64 +import os +import requests +import logging +import sys +from typing import List + +from 
# Module-level logger used by the loader below.
log = logging.getLogger(__name__)


class PaddleOCRVLLoader:
    """Loader that uses the PaddleOCR-vl API to extract text from PDF/images.

    The file is read from disk, base64-encoded and POSTed as JSON to
    ``{api_url}/layout-parsing``. Every entry of the response's
    ``result.layoutParsingResults`` list that carries non-empty markdown
    text becomes one ``Document``.
    """

    # Extensions submitted with ``fileType=1``; anything else is sent with
    # ``fileType=0``.
    # NOTE(review): presumably 1 = image and 0 = PDF in the PaddleOCR
    # serving API - confirm against the deployed server version.
    IMAGE_EXTENSIONS = frozenset(
        {'png', 'jpg', 'jpeg', 'bmp', 'tif', 'tiff', 'webp'}
    )

    def __init__(
        self,
        api_url: str,
        token: str,
        file_path: str,
        timeout: float = 300,
    ):
        """Validate the configuration and remember the target file.

        Args:
            api_url: Base URL of the PaddleOCR-vl service; trailing slashes
                are stripped.
            token: Token sent in the ``Authorization`` header.
            file_path: Path of the file to process; must exist.
            timeout: Seconds passed to ``requests.post`` as ``timeout`` so a
                stalled server cannot block the caller indefinitely.

        Raises:
            ValueError: If ``api_url`` or ``token`` is empty or ``None``.
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not api_url or not token:
            raise ValueError("PaddleOCR-vl API URL and Token are required.")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found at {file_path}")

        self.api_url = api_url.rstrip('/')
        self.token = token
        self.file_path = file_path
        self.file_name = os.path.basename(file_path)
        self.timeout = timeout

    @classmethod
    def _detect_file_type(cls, file_path: str) -> int:
        """Return 1 when the extension is in ``IMAGE_EXTENSIONS``, else 0."""
        # splitext only inspects the final path component, so a dot in a
        # directory name is never mistaken for a file extension.
        ext = os.path.splitext(file_path)[1].lstrip('.').lower()
        return 1 if ext in cls.IMAGE_EXTENSIONS else 0

    @staticmethod
    def _page_text(page_result: dict) -> str:
        """Return the stripped markdown text of one layout result.

        A missing or ``null`` ``markdown``/``text`` entry yields ``""`` so
        the page is skipped instead of being indexed as the string "None".
        """
        markdown = page_result.get("markdown") or {}
        text = markdown.get("text")
        if text is None:
            return ""
        if not isinstance(text, str):
            text = str(text)
        return text.strip()

    def load(self) -> List["Document"]:
        """Send the file to PaddleOCR-vl and return one Document per page.

        Returns:
            The non-empty pages as ``Document`` objects. If no page has
            text, or the request/response handling fails, a single
            placeholder ``Document`` whose metadata carries an ``error`` key
            (``no_valid_pages`` or ``processing_failed``) is returned.

        Raises:
            Exception: Whatever reading ``file_path`` raises is logged and
                re-raised; request/response errors are not raised.
        """
        log.info(f"Processing with PaddleOCR-vl: {self.file_path}")

        try:
            with open(self.file_path, "rb") as file:
                file_data = base64.b64encode(file.read()).decode("ascii")
        except Exception as e:
            log.error(f"Failed to read file {self.file_path}: {e}")
            raise

        headers = {
            "Authorization": f"token {self.token}",
            "Content-Type": "application/json",
        }
        payload = {
            "file": file_data,
            "fileType": self._detect_file_type(self.file_path),
            "useDocOrientationClassify": False,
            "useDocUnwarping": False,
            "useChartRecognition": False,
        }

        try:
            response = requests.post(
                f"{self.api_url}/layout-parsing",
                json=payload,
                headers=headers,
                timeout=self.timeout,
            )
            response.raise_for_status()

            # ``or`` guards against explicit JSON nulls as well as missing keys.
            result = response.json().get("result") or {}
            layout_results = result.get("layoutParsingResults") or []

            documents = []
            total_pages = len(layout_results)
            skipped_pages = 0

            for i, res in enumerate(layout_results):
                cleaned_content = self._page_text(res)
                if not cleaned_content:
                    skipped_pages += 1
                    continue

                documents.append(
                    Document(
                        page_content=cleaned_content,
                        metadata={
                            # Index within the response, counting skipped pages.
                            "page": i,
                            "page_label": i + 1,
                            "total_pages": total_pages,
                            "file_name": self.file_name,
                            "processing_engine": "paddleocr-vl",
                        },
                    )
                )

            if skipped_pages > 0:
                log.info(f"PaddleOCR-vl: Processed {len(documents)} pages, skipped {skipped_pages} empty pages.")

            if not documents:
                log.warning("No valid text content found by PaddleOCR-vl.")
                return [
                    Document(
                        page_content="No valid text content found in document",
                        metadata={
                            "error": "no_valid_pages",
                            "file_name": self.file_name,
                            "processing_engine": "paddleocr-vl",
                        },
                    )
                ]

            return documents

        except Exception as e:
            # Best-effort boundary kept from the original: failures are
            # reported through a placeholder Document rather than raised.
            log.error(f"Error calling PaddleOCR-vl: {e}")
            return [
                Document(
                    page_content=f"Error during OCR processing: {e}",
                    metadata={
                        "error": "processing_failed",
                        "file_name": self.file_name,
                        "processing_engine": "paddleocr-vl",
                    },
                )
            ]
user=Depends(get_admin_user)): 'DOCUMENT_INTELLIGENCE_MODEL': request.app.state.config.DOCUMENT_INTELLIGENCE_MODEL, 'MISTRAL_OCR_API_BASE_URL': request.app.state.config.MISTRAL_OCR_API_BASE_URL, 'MISTRAL_OCR_API_KEY': request.app.state.config.MISTRAL_OCR_API_KEY, + 'PADDLEOCR_VL_BASE_URL': request.app.state.config.PADDLEOCR_VL_BASE_URL, + 'PADDLEOCR_VL_TOKEN': request.app.state.config.PADDLEOCR_VL_TOKEN, # MinerU settings 'MINERU_API_MODE': request.app.state.config.MINERU_API_MODE, 'MINERU_API_URL': request.app.state.config.MINERU_API_URL, @@ -686,6 +688,8 @@ class ConfigForm(BaseModel): DOCUMENT_INTELLIGENCE_MODEL: Optional[str] = None MISTRAL_OCR_API_BASE_URL: Optional[str] = None MISTRAL_OCR_API_KEY: Optional[str] = None + PADDLEOCR_VL_BASE_URL: Optional[str] = None + PADDLEOCR_VL_TOKEN: Optional[str] = None # MinerU settings MINERU_API_MODE: Optional[str] = None @@ -887,6 +891,16 @@ async def update_rag_config(request: Request, form_data: ConfigForm, user=Depend if form_data.MISTRAL_OCR_API_KEY is not None else request.app.state.config.MISTRAL_OCR_API_KEY ) + request.app.state.config.PADDLEOCR_VL_BASE_URL = ( + form_data.PADDLEOCR_VL_BASE_URL + if form_data.PADDLEOCR_VL_BASE_URL is not None + else request.app.state.config.PADDLEOCR_VL_BASE_URL + ) + request.app.state.config.PADDLEOCR_VL_TOKEN = ( + form_data.PADDLEOCR_VL_TOKEN + if form_data.PADDLEOCR_VL_TOKEN is not None + else request.app.state.config.PADDLEOCR_VL_TOKEN + ) # MinerU settings request.app.state.config.MINERU_API_MODE = ( @@ -1152,6 +1166,8 @@ async def update_rag_config(request: Request, form_data: ConfigForm, user=Depend 'DOCUMENT_INTELLIGENCE_MODEL': request.app.state.config.DOCUMENT_INTELLIGENCE_MODEL, 'MISTRAL_OCR_API_BASE_URL': request.app.state.config.MISTRAL_OCR_API_BASE_URL, 'MISTRAL_OCR_API_KEY': request.app.state.config.MISTRAL_OCR_API_KEY, + 'PADDLEOCR_VL_BASE_URL': request.app.state.config.PADDLEOCR_VL_BASE_URL, + 'PADDLEOCR_VL_TOKEN': request.app.state.config.PADDLEOCR_VL_TOKEN, # 
MinerU settings 'MINERU_API_MODE': request.app.state.config.MINERU_API_MODE, 'MINERU_API_URL': request.app.state.config.MINERU_API_URL, diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index eeb6b18b10..a2349e78e5 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -184,6 +184,13 @@ toast.error($i18n.t('Mistral OCR API Key required.')); return; } + if ( + RAGConfig.CONTENT_EXTRACTION_ENGINE === 'paddleocr_vl' && + RAGConfig.PADDLEOCR_VL_BASE_URL === '' + ) { + toast.error($i18n.t('PaddleOCR-vl API URL required.')); + return; + } if ( RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru' && @@ -356,6 +363,7 @@ + @@ -657,6 +665,19 @@ bind:value={RAGConfig.MISTRAL_OCR_API_KEY} /> + {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'paddleocr_vl'} +
+ + +
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru'}
diff --git a/src/lib/i18n/locales/en-US/translation.json b/src/lib/i18n/locales/en-US/translation.json index ad0f42f733..36ad93ad61 100644 --- a/src/lib/i18n/locales/en-US/translation.json +++ b/src/lib/i18n/locales/en-US/translation.json @@ -775,6 +775,8 @@ "Enter prompt here.": "", "Enter proxy URL (e.g. https://user:password@host:port)": "", "Enter reasoning effort": "", + "Enter PaddleOCR-vl API Token": "", + "Enter PaddleOCR-vl API Base URL": "", "Enter Score": "", "Enter SearchApi API Key": "", "Enter SearchApi Engine": "", @@ -1518,6 +1520,7 @@ "Output format": "", "Output Format": "", "Overview": "", + "PaddleOCR-vl": "", "page": "", "Page": "", "Page mode creates one document per page. Single mode combines all pages into one document for better chunking across page boundaries.": "", diff --git a/src/lib/i18n/locales/zh-CN/translation.json b/src/lib/i18n/locales/zh-CN/translation.json index ce09ad948c..9678d24eeb 100644 --- a/src/lib/i18n/locales/zh-CN/translation.json +++ b/src/lib/i18n/locales/zh-CN/translation.json @@ -774,6 +774,8 @@ "Enter prompt here.": "在此输入提示词。", "Enter proxy URL (e.g. https://user:password@host:port)": "输入代理地址(例如:https://用户名:密码@主机名:端口)", "Enter reasoning effort": "输入推理努力", + "Enter PaddleOCR-vl API Token": "输入 PaddleOCR-vl 接口密钥", + "Enter PaddleOCR-vl API Base URL": "输入 PaddleOCR-vl API 基础地址", "Enter Score": "输入评分", "Enter SearchApi API Key": "输入 SearchApi 接口密钥", "Enter SearchApi Engine": "输入 SearchApi 引擎", @@ -1517,6 +1519,7 @@ "Output format": "输出格式", "Output Format": "输出格式", "Overview": "概述", + "PaddleOCR-vl": "PaddleOCR-vl", "page": "页", "Page": "页模式", "Page mode creates one document per page. Single mode combines all pages into one document for better chunking across page boundaries.": "页模式将为每个页面创建一个文档;单文档模式则将所有页面合并为一个文档,以便更好地进行跨页分块。",