feat(ocr): add support for Docling OCR engine and language configuration

This commit adds support for configuring the OCR engine and language(s) for Docling.
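Configuration can be set via the environment variables `DOCLING_OCR_ENGINE` and `DOCLING_OCR_LANG`, or through the UI. Both values must be set for the OCR settings to be sent to Docling; `DOCLING_OCR_LANG` accepts a comma-separated list of languages.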

Fixes #13133
This commit is contained in:
Athanasios Oikonomou
2025-05-03 00:31:00 +03:00
committed by Athanasios Oikonomou
parent 7d184c3a14
commit 657162e96d
5 changed files with 67 additions and 2 deletions

View File

@@ -100,7 +100,7 @@ class TikaLoader:
headers = {}
if self.kwargs.get("PDF_EXTRACT_IMAGES") == True:
headers['X-Tika-PDFextractInlineImages'] = 'true'
headers["X-Tika-PDFextractInlineImages"] = "true"
endpoint = self.url
if not endpoint.endswith("/"):
@@ -124,10 +124,14 @@ class TikaLoader:
class DoclingLoader:
def __init__(self, url, file_path=None, mime_type=None):
def __init__(
self, url, file_path=None, mime_type=None, ocr_engine=None, ocr_lang=None
):
self.url = url.rstrip("/")
self.file_path = file_path
self.mime_type = mime_type
self.ocr_engine = ocr_engine
self.ocr_lang = ocr_lang
def load(self) -> list[Document]:
with open(self.file_path, "rb") as f:
@@ -144,6 +148,12 @@ class DoclingLoader:
"table_mode": "accurate",
}
if self.ocr_engine and self.ocr_lang:
params["ocr_engine"] = self.ocr_engine
params["ocr_lang"] = [
lang.strip() for lang in self.ocr_lang.split(",") if lang.strip()
]
endpoint = f"{self.url}/v1alpha/convert/file"
r = requests.post(endpoint, files=files, data=params)
@@ -212,6 +222,8 @@ class Loader:
url=self.kwargs.get("DOCLING_SERVER_URL"),
file_path=file_path,
mime_type=file_content_type,
ocr_engine=self.kwargs.get("DOCLING_OCR_ENGINE"),
ocr_lang=self.kwargs.get("DOCLING_OCR_LANG"),
)
elif (
self.engine == "document_intelligence"