mirror of
https://github.com/open-webui/open-webui.git
synced 2026-04-26 01:25:34 +02:00
refac/breaking: docling params
This commit is contained in:
@@ -132,8 +132,9 @@ class TikaLoader:
|
||||
|
||||
|
||||
class DoclingLoader:
|
||||
def __init__(self, url, file_path=None, mime_type=None, params=None):
|
||||
def __init__(self, url, api_key=None, file_path=None, mime_type=None, params=None):
|
||||
self.url = url.rstrip("/")
|
||||
self.api_key = api_key
|
||||
self.file_path = file_path
|
||||
self.mime_type = mime_type
|
||||
|
||||
@@ -141,6 +142,10 @@ class DoclingLoader:
|
||||
|
||||
def load(self) -> list[Document]:
|
||||
with open(self.file_path, "rb") as f:
|
||||
headers = {}
|
||||
if self.api_key:
|
||||
headers["Authorization"] = f"Bearer {self.api_key}"
|
||||
|
||||
files = {
|
||||
"files": (
|
||||
self.file_path,
|
||||
@@ -149,60 +154,15 @@ class DoclingLoader:
|
||||
)
|
||||
}
|
||||
|
||||
params = {"image_export_mode": "placeholder"}
|
||||
|
||||
if self.params:
|
||||
if self.params.get("do_picture_description"):
|
||||
params["do_picture_description"] = self.params.get(
|
||||
"do_picture_description"
|
||||
)
|
||||
|
||||
picture_description_mode = self.params.get(
|
||||
"picture_description_mode", ""
|
||||
).lower()
|
||||
|
||||
if picture_description_mode == "local" and self.params.get(
|
||||
"picture_description_local", {}
|
||||
):
|
||||
params["picture_description_local"] = json.dumps(
|
||||
self.params.get("picture_description_local", {})
|
||||
)
|
||||
|
||||
elif picture_description_mode == "api" and self.params.get(
|
||||
"picture_description_api", {}
|
||||
):
|
||||
params["picture_description_api"] = json.dumps(
|
||||
self.params.get("picture_description_api", {})
|
||||
)
|
||||
|
||||
params["do_ocr"] = self.params.get("do_ocr")
|
||||
|
||||
params["force_ocr"] = self.params.get("force_ocr")
|
||||
|
||||
if (
|
||||
self.params.get("do_ocr")
|
||||
and self.params.get("ocr_engine")
|
||||
and self.params.get("ocr_lang")
|
||||
):
|
||||
params["ocr_engine"] = self.params.get("ocr_engine")
|
||||
params["ocr_lang"] = [
|
||||
lang.strip()
|
||||
for lang in self.params.get("ocr_lang").split(",")
|
||||
if lang.strip()
|
||||
]
|
||||
|
||||
if self.params.get("pdf_backend"):
|
||||
params["pdf_backend"] = self.params.get("pdf_backend")
|
||||
|
||||
if self.params.get("table_mode"):
|
||||
params["table_mode"] = self.params.get("table_mode")
|
||||
|
||||
if self.params.get("pipeline"):
|
||||
params["pipeline"] = self.params.get("pipeline")
|
||||
|
||||
endpoint = f"{self.url}/v1/convert/file"
|
||||
r = requests.post(endpoint, files=files, data=params)
|
||||
|
||||
r = requests.post(
|
||||
f"{self.url}/v1/convert/file",
|
||||
files=files,
|
||||
data={
|
||||
"image_export_mode": "placeholder",
|
||||
**self.params,
|
||||
},
|
||||
headers=headers,
|
||||
)
|
||||
if r.ok:
|
||||
result = r.json()
|
||||
document_data = result.get("document", {})
|
||||
@@ -211,7 +171,6 @@ class DoclingLoader:
|
||||
metadata = {"Content-Type": self.mime_type} if self.mime_type else {}
|
||||
|
||||
log.debug("Docling extracted text: %s", text)
|
||||
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
else:
|
||||
error_msg = f"Error calling Docling API: {r.reason}"
|
||||
@@ -340,6 +299,7 @@ class Loader:
|
||||
|
||||
loader = DoclingLoader(
|
||||
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
||||
api_key=self.kwargs.get("DOCLING_API_KEY", None),
|
||||
file_path=file_path,
|
||||
mime_type=file_content_type,
|
||||
params=params,
|
||||
|
||||
Reference in New Issue
Block a user