feat(ocr): add support for Docling OCR engine and language configuration

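This commit adds support for configuring the OCR engine and language(s) for Docling. Configuration can be set via the environment variables `DOCLING_OCR_ENGINE` and `DOCLING_OCR_LANG`, or through the UI. `DOCLING_OCR_LANG` accepts a comma-separated list of languages (surrounding whitespace and empty entries are ignored). The OCR settings are only sent to the Docling server when both the engine and the language are set; if either is missing, neither is sent. Fixes #13133.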
2026-04-26 01:25:34 +02:00 · 2025-05-03 00:31:00 +03:00
parent 7d184c3a14
commit 657162e96d
5 changed files with 67 additions and 2 deletions
--- a/backend/open_webui/retrieval/loaders/main.py
+++ b/backend/open_webui/retrieval/loaders/main.py
@@ -100,7 +100,7 @@ class TikaLoader:
            headers = {}

        if self.kwargs.get("PDF_EXTRACT_IMAGES") == True:
-            headers['X-Tika-PDFextractInlineImages'] = 'true'
+            headers["X-Tika-PDFextractInlineImages"] = "true"

        endpoint = self.url
        if not endpoint.endswith("/"):
@@ -124,10 +124,14 @@ class TikaLoader:


 class DoclingLoader:
-    def __init__(self, url, file_path=None, mime_type=None):
+    def __init__(
+        self, url, file_path=None, mime_type=None, ocr_engine=None, ocr_lang=None
+    ):
        self.url = url.rstrip("/")
        self.file_path = file_path
        self.mime_type = mime_type
+        self.ocr_engine = ocr_engine
+        self.ocr_lang = ocr_lang

    def load(self) -> list[Document]:
        with open(self.file_path, "rb") as f:
@@ -144,6 +148,12 @@ class DoclingLoader:
                "table_mode": "accurate",
            }

+            if self.ocr_engine and self.ocr_lang:
+                params["ocr_engine"] = self.ocr_engine
+                params["ocr_lang"] = [
+                    lang.strip() for lang in self.ocr_lang.split(",") if lang.strip()
+                ]
+
            endpoint = f"{self.url}/v1alpha/convert/file"
            r = requests.post(endpoint, files=files, data=params)

@@ -212,6 +222,8 @@ class Loader:
                    url=self.kwargs.get("DOCLING_SERVER_URL"),
                    file_path=file_path,
                    mime_type=file_content_type,
+                    ocr_engine=self.kwargs.get("DOCLING_OCR_ENGINE"),
+                    ocr_lang=self.kwargs.get("DOCLING_OCR_LANG"),
                )
        elif (
            self.engine == "document_intelligence"