Paperless ngx data connector (#4121)

* paperless ngx data connector

* wip resync paperless ngx

* fix generateChunkSource for resyncing paperless ngx

* lint

* Refactor Paperless-NGX connector
Fix issue with date rendering in tooltip + extended width
Move tooltip details to be column for more space

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
Sean Hatfield
2025-11-20 11:27:38 -08:00
committed by GitHub
parent 2e1c1ff891
commit 05df4ac72b
16 changed files with 521 additions and 12 deletions

View File

@@ -202,7 +202,26 @@ function extensions(app) {
return;
}
);
// Collector endpoint that imports documents from a Paperless-ngx instance.
// Replies 200 with whatever loadPaperlessNgx returns, or 400 with the error
// message when the import throws.
app.post(
  "/ext/paperless-ngx",
  [verifyPayloadIntegrity, setDataSigner],
  async (request, response) => {
    try {
      // Required lazily, inside the handler, exactly like the original route.
      const {
        loadPaperlessNgx: importFromPaperless,
      } = require("../utils/extensions/PaperlessNgx");
      const outcome = await importFromPaperless(reqBody(request), response);
      response.status(200).json(outcome);
    } catch (error) {
      console.error(error);
      const failure = {
        success: false,
        reason: error.message,
        data: null,
      };
      response.status(400).json(failure);
    }
  }
);
}
module.exports = extensions;

View File

@@ -144,10 +144,40 @@ async function resyncDrupalWiki({ chunkSource }, response) {
}
}
/**
* Fetches the content of a specific Paperless-ngx document via its chunkSource.
* Returns the content as a text string of the document.
* @param {object} data - metadata from document (eg: chunkSource)
* @param {import("../../middleware/setDataSigner").ResponseWithSigner} response
*/
async function resyncPaperlessNgx({ chunkSource }, response) {
if (!chunkSource) throw new Error('Invalid source property provided');
try {
const source = response.locals.encryptionWorker.expandPayload(chunkSource);
const { PaperlessNgxLoader } = require("../../utils/extensions/PaperlessNgx/PaperlessNgxLoader");
const loader = new PaperlessNgxLoader({
baseUrl: source.searchParams.get('baseUrl'),
apiToken: source.searchParams.get('token'),
});
const documentId = source.pathname.split('//')[1];
const content = await loader.fetchDocumentContent(documentId);
if (!content) throw new Error('Failed to fetch document content');
response.status(200).json({ success: true, content });
} catch (e) {
console.error(e);
response.status(200).json({
success: false,
content: null,
});
}
}
module.exports = {
link: resyncLink,
youtube: resyncYouTube,
confluence: resyncConfluence,
github: resyncGithub,
drupalwiki: resyncDrupalWiki,
"paperless-ngx": resyncPaperlessNgx,
}

View File

@@ -0,0 +1,128 @@
const { htmlToText } = require("html-to-text");
const pdf = require("pdf-parse");
/**
 * Small client that pulls documents out of a Paperless-ngx instance over HTTP
 * and converts them into `{ pageContent, metadata }` records.
 */
class PaperlessNgxLoader {
  /**
   * @param {object} config
   * @param {string} config.baseUrl - any URL on the instance; only its origin is kept
   * @param {string} config.apiToken - token sent as `Authorization: Token <apiToken>`
   */
  constructor({ baseUrl, apiToken }) {
    this.baseUrl = new URL(baseUrl).origin;
    this.apiToken = apiToken;
    this.baseHeaders = {
      Authorization: `Token ${this.apiToken}`,
    };
  }

  /**
   * Fetches every document that has content and maps each to a page record.
   * @returns {Promise<{pageContent: string, metadata: object}[]>}
   * @throws {Error} rethrows (after logging) any failure from fetchAllDocuments
   */
  async load() {
    try {
      const documents = await this.fetchAllDocuments();
      return documents.map((doc) => this.createDocumentFromPage(doc));
    } catch (error) {
      console.error("Error:", error);
      throw error;
    }
  }

  /**
   * Fetches all documents from Paperless-ngx, following the listing's `next`
   * links so instances with more than one page of documents are fully imported.
   * Documents whose content could not be fetched (empty string) are dropped.
   * @returns {Promise<Array<{[key: string]: any, content: string}>>} The documents with their content
   * @throws {Error} when a listing page cannot be fetched or parsed
   */
  async fetchAllDocuments() {
    const documentsWithContent = [];
    // Guards against a listing whose `next` links loop back on themselves.
    const visitedPages = new Set();
    let pageUrl = `${this.baseUrl}/api/documents/`;

    try {
      while (pageUrl && !visitedPages.has(pageUrl)) {
        visitedPages.add(pageUrl);
        const { results, next } = await this.fetchDocumentPage(pageUrl);

        // Downloads run in parallel within one page only, which keeps the
        // number of simultaneous requests bounded by the page size.
        const pageDocuments = await Promise.all(
          results.map(async (doc) => {
            const content = await this.fetchDocumentContent(doc.id);
            return { ...doc, content };
          })
        );
        documentsWithContent.push(...pageDocuments);
        pageUrl = this.resolveNextPageUrl(next);
      }
    } catch (error) {
      throw new Error(
        `Failed to fetch documents from Paperless-ngx: ${error.message}`,
        { cause: error }
      );
    }

    return documentsWithContent.filter((doc) => !!doc.content);
  }

  /**
   * Fetches a single page of the document listing. Internal helper.
   * @param {string} pageUrl - absolute URL of the listing page
   * @returns {Promise<{results: object[], next: string|null}>}
   * @throws {Error} when the response status is not OK or the body is not JSON
   */
  async fetchDocumentPage(pageUrl) {
    const res = await fetch(pageUrl, {
      headers: {
        "Content-Type": "application/json",
        ...this.baseHeaders,
      },
    });
    // Without this check an auth failure (e.g. 401) would parse as a body with
    // no `results` and be reported as an empty instance instead of an error.
    if (!res.ok)
      throw new Error(`document listing request returned status ${res.status}`);

    const data = await res.json();
    return {
      results: Array.isArray(data?.results) ? data.results : [],
      next: typeof data?.next === "string" ? data.next : null,
    };
  }

  /**
   * Rebuilds a listing `next` link on top of the configured origin, keeping only
   * its path and query. This way the API token is never sent to a different
   * origin than the one the user configured. Internal helper.
   * @param {string|null} next - the `next` value from a listing page
   * @returns {string|null} URL of the next page, or null when there is none
   */
  resolveNextPageUrl(next) {
    if (!next) return null;
    try {
      const { pathname, search } = new URL(next, this.baseUrl);
      return `${this.baseUrl}${pathname}${search}`;
    } catch {
      return null;
    }
  }

  /**
   * Fetches the content of a document from Paperless-ngx.
   * PDF responses are parsed to text; every other content type is read as text.
   * Failures are logged and reported as an empty string rather than thrown.
   * @param {string} documentId - The ID of the document to fetch
   * @returns {Promise<string>} The content of the document, or "" on failure
   */
  async fetchDocumentContent(documentId) {
    try {
      const response = await fetch(
        `${this.baseUrl}/api/documents/${documentId}/download/`,
        {
          headers: this.baseHeaders,
        }
      );
      if (!response.ok)
        throw new Error(`Failed to fetch document content: ${response.status}`);

      // Drop parameters such as "; charset=utf-8" before comparing the type.
      const mimeType = (response.headers.get("content-type") || "")
        .split(";")[0]
        .trim()
        .toLowerCase();

      if (mimeType === "application/pdf") {
        const buffer = await response.arrayBuffer();
        return await this.parsePdfContent(buffer);
      }
      return await response.text();
    } catch (error) {
      console.error(
        `Failed to fetch content for document ${documentId}:`,
        error
      );
      return "";
    }
  }

  /**
   * Extracts the text of a PDF.
   * @param {ArrayBuffer} buffer - raw PDF bytes
   * @returns {Promise<string>} extracted text, or "" when parsing fails
   */
  async parsePdfContent(buffer) {
    try {
      const data = await pdf(Buffer.from(buffer));
      return data.text;
    } catch (error) {
      console.error("Failed to parse PDF content:", error);
      return "";
    }
  }

  /**
   * Converts a fetched document into a `{ pageContent, metadata }` record.
   * @param {{[key: string]: any, content?: string}} doc - listing entry plus its content
   * @returns {{pageContent: string, metadata: object}}
   */
  createDocumentFromPage(doc) {
    const content = doc.content || "";
    const plainTextContent = htmlToText(content, {
      wordwrap: false,
      preserveNewlines: true,
    });
    return {
      pageContent: plainTextContent,
      metadata: {
        id: doc.id,
        // NOTE(review): original_file_name can be missing for some documents;
        // fall back to the listing's `title` field, then to a name derived from
        // the id, so callers always get a string. Confirm `title` against the API.
        title: doc.original_file_name || doc.title || `document-${doc.id}`,
        created: doc.created,
        modified: doc.modified,
        added: doc.added,
        tags: doc.tags,
        correspondent: doc.correspondent,
        documentType: doc.document_type,
        url: `${this.baseUrl}/documents/${doc.id}`,
      },
    };
  }
}
module.exports = PaperlessNgxLoader;

View File

@@ -0,0 +1,128 @@
const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const {
writeToServerDocuments,
sanitizeFileName,
documentsFolder,
} = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const { validBaseUrl } = require("../../http");
const PaperlessNgxLoader = require("./PaperlessNgxLoader");
/**
 * Load documents from a Paperless-ngx instance and write each one that has text
 * content into a freshly named folder under the server documents folder.
 * @param {object} args - forwarded request body params
 * @param {string|null} [args.baseUrl] - URL of the Paperless-ngx instance (only the origin is used)
 * @param {string|null} [args.apiToken] - API token for the instance
 * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
 * @returns {Promise<{success: boolean, reason: string|null, data?: {files: number, destination: string}|null}>}
 *   `success: false` with a `reason` on invalid input, loader failure, or when no
 *   document has text content; otherwise the number of files written and the folder name.
 */
async function loadPaperlessNgx({ baseUrl = null, apiToken = null }, response) {
  if (!baseUrl || !validBaseUrl(baseUrl)) {
    return {
      success: false,
      reason: "Provided base URL is not a valid URL.",
    };
  }

  if (!apiToken) {
    return {
      success: false,
      reason:
        "You need to provide an API token to use the Paperless-ngx connector.",
    };
  }

  const { origin, hostname } = new URL(baseUrl);
  console.log(`-- Working Paperless-ngx ${origin} --`);
  const loader = new PaperlessNgxLoader({
    baseUrl: origin,
    apiToken,
  });

  const { docs, error } = await loader
    .load()
    .then((docs) => ({ docs, error: null }))
    .catch((e) => ({
      docs: [],
      error: e.message?.split("Error:")?.[1] || e.message,
    }));

  // Only documents with text are written, so only those count. Filtering up
  // front keeps the reported `files` total accurate and avoids creating an
  // empty output folder when nothing is parseable.
  const parseableDocs = docs.filter((doc) => !!doc.pageContent);
  if (!parseableDocs.length || !!error) {
    return {
      success: false,
      reason:
        error ?? "No parseable documents found in that Paperless-ngx instance.",
      data: null,
    };
  }

  const outFolder = slugify(
    `paperless-${hostname}-${v4().slice(0, 4)}`
  ).toLowerCase();
  const outFolderPath = path.resolve(documentsFolder, outFolder);
  if (!fs.existsSync(outFolderPath))
    fs.mkdirSync(outFolderPath, { recursive: true });

  for (const doc of parseableDocs) {
    // slugify rejects non-string input, so a document without a title would
    // otherwise abort the import part-way through.
    const title =
      doc.metadata.title || `paperless-document-${doc.metadata.id}`;

    const data = {
      id: v4(),
      url: doc.metadata.url,
      title,
      docAuthor: doc.metadata.correspondent || "Unknown",
      description: `A document from the Paperless-ngx instance at ${origin}`,
      docSource: `paperless-ngx`,
      chunkSource: generateChunkSource(
        { doc, baseUrl: origin, apiToken },
        response.locals.encryptionWorker
      ),
      published: doc.metadata.created,
      wordCount: doc.pageContent.split(" ").length,
      pageContent: doc.pageContent,
      token_count_estimate: tokenizeString(doc.pageContent),
    };

    console.log(`[Paperless-ngx Loader]: Saving ${title} to ${outFolder}`);

    const fileName = sanitizeFileName(`${slugify(title)}-${data.id}`);
    writeToServerDocuments({
      data,
      filename: fileName,
      destinationOverride: outFolderPath,
    });
  }

  return {
    success: true,
    reason: null,
    data: {
      files: parseableDocs.length,
      destination: outFolder,
    },
  };
}
/**
 * Builds the chunkSource string stored with a Paperless-ngx document so it can be
 * resynced later: `paperless-ngx://<document id>?payload=<encrypted JSON>`, where
 * the JSON holds the instance base URL and the API token.
 * @param {object} chunkSourceInformation
 * @param {{metadata: {id: any}}} chunkSourceInformation.doc - document whose id goes into the URL
 * @param {string} chunkSourceInformation.baseUrl - stored in the payload as `baseUrl`
 * @param {string} chunkSourceInformation.apiToken - stored in the payload as `token`
 * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
 * @returns {string}
 */
function generateChunkSource({ doc, baseUrl, apiToken }, encryptionWorker) {
  const documentId = doc.metadata.id;
  const serializedCredentials = JSON.stringify({ baseUrl, token: apiToken });
  const encryptedPayload = encryptionWorker.encrypt(serializedCredentials);
  return `paperless-ngx://${documentId}?payload=${encryptedPayload}`;
}
module.exports = {
loadPaperlessNgx,
};

View File

@@ -5,6 +5,7 @@ import Link from "./link.svg";
import Confluence from "./confluence.jpeg";
import DrupalWiki from "./drupalwiki.jpg";
import Obsidian from "./obsidian.png";
import PaperlessNgx from "./paperless-ngx.jpeg";
const ConnectorImages = {
github: GitHub,
@@ -14,6 +15,7 @@ const ConnectorImages = {
confluence: Confluence,
drupalwiki: DrupalWiki,
obsidian: Obsidian,
paperlessNgx: PaperlessNgx,
};
export default ConnectorImages;

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.6 KiB

View File

@@ -0,0 +1,124 @@
import React, { useState } from "react";
import System from "@/models/system";
import showToast from "@/utils/toast";
import { Info } from "@phosphor-icons/react";
export default function PaperlessNgxOptions() {
const [loading, setLoading] = useState(false);
const handleSubmit = async (e) => {
e.preventDefault();
const form = new FormData(e.target);
try {
setLoading(true);
showToast(
"Fetching documents from Paperless-ngx - this may take a while.",
"info",
{ clear: true, autoClose: false }
);
const { data, error } = await System.dataConnectors.paperlessNgx.collect({
baseUrl: form.get("baseUrl"),
apiToken: form.get("apiToken"),
});
if (!!error) {
showToast(error, "error", { clear: true });
setLoading(false);
return;
}
showToast(
`Successfully imported ${data.files} documents from Paperless-ngx. Output folder is ${data.destination}.`,
"success",
{ clear: true }
);
e.target.reset();
setLoading(false);
} catch (e) {
console.error(e);
showToast(e.message, "error", { clear: true });
setLoading(false);
}
};
return (
<div className="flex w-full">
<div className="flex flex-col w-full px-1 md:pb-6 pb-16">
<form className="w-full" onSubmit={handleSubmit}>
<div className="w-full flex flex-col py-2">
<div className="w-full flex flex-col gap-4">
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold">
Base URL
</label>
<p className="text-xs font-normal text-theme-text-secondary">
The URL where your Paperless-ngx instance is running (e.g.,
http://localhost:8000)
</p>
</div>
<input
type="url"
name="baseUrl"
className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="http://localhost:8000"
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold flex gap-x-2 items-center">
<p className="font-bold text-white">API Token</p>
</label>
<p className="text-xs font-normal text-theme-text-secondary">
Your Paperless-ngx API token. You can find this under
&apos;My Profile&apos; and then &apos;API Auth Token&apos;.
</p>
</div>
<input
type="password"
name="apiToken"
className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="Enter your API token"
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
</div>
</div>
<div className="flex flex-col gap-y-2 w-full pr-10">
<div className="flex flex-col md:flex-row md:items-center gap-x-2 text-white mb-4 bg-blue-800/30 w-fit rounded-lg px-4 py-2">
<div className="gap-x-2 flex items-center">
<Info className="shrink-0" size={25} />
<p className="text-sm">
Make sure your Paperless-ngx instance is running and
accessible from this machine.
</p>
</div>
</div>
<button
type="submit"
disabled={loading}
className="mt-2 w-full justify-center border-none px-4 py-2 rounded-lg text-dark-text light:text-white text-sm font-bold items-center flex gap-x-2 bg-theme-home-button-primary hover:bg-theme-home-button-primary-hover disabled:bg-theme-home-button-primary-hover disabled:cursor-not-allowed"
>
{loading ? "Importing documents..." : "Submit"}
</button>
{loading && (
<p className="text-xs text-white/50">
Once complete, all documents will be available for embedding
into workspaces.
</p>
)}
</div>
</form>
</div>
</div>
);
}

View File

@@ -10,6 +10,7 @@ import { useState } from "react";
import ConnectorOption from "./ConnectorOption";
import WebsiteDepthOptions from "./Connectors/WebsiteDepth";
import ObsidianOptions from "./Connectors/Obsidian";
import PaperlessNgxOptions from "./Connectors/PaperlessNgx";
export const getDataConnectors = (t) => ({
github: {
@@ -54,6 +55,12 @@ export const getDataConnectors = (t) => ({
description: "Import Obsidian vault in a single click.",
options: <ObsidianOptions />,
},
"paperless-ngx": {
name: "Paperless-ngx",
image: ConnectorImages.paperlessNgx,
description: "Import documents from your Paperless-ngx instance.",
options: <PaperlessNgxOptions />,
},
});
export default function DataConnectors() {

View File

@@ -1,6 +1,6 @@
import React from "react";
import {
formatDate,
formatDateTimeAsMoment,
getFileExtension,
middleTruncate,
} from "@/utils/directories";
@@ -15,12 +15,12 @@ export default function FileRow({ item, selected, toggleSelection }) {
}`}
>
<div
data-tooltip-id={`directory-item`}
data-tooltip-id="directory-item"
className="col-span-10 w-fit flex gap-x-[4px] items-center relative"
data-tooltip-content={JSON.stringify({
title: item.title,
date: formatDate(item?.published),
extension: getFileExtension(item.url).toUpperCase(),
date: formatDateTimeAsMoment(item?.published),
extension: getFileExtension(item.url),
})}
>
<div

View File

@@ -350,7 +350,7 @@ function DirectoryTooltips() {
id="directory-item"
place="bottom"
delayShow={800}
className="tooltip invert light:invert-0 z-99 max-w-[200px]"
className="tooltip invert light:invert-0 z-99 max-w-[300px]"
render={({ content }) => {
const data = safeJsonParse(content, null);
if (!data) return null;
@@ -359,7 +359,7 @@ function DirectoryTooltips() {
<p className="text-white light:invert font-medium break-all">
{data.title}
</p>
<div className="flex mt-1 gap-x-2">
<div className="flex flex-col mt-1">
<p className="">
Date: <b>{data.date}</b>
</p>

View File

@@ -1,6 +1,6 @@
import { memo, useState } from "react";
import {
formatDate,
formatDateTimeAsMoment,
getFileExtension,
middleTruncate,
} from "@/utils/directories";
@@ -70,8 +70,8 @@ export default function WorkspaceFileRow({
data-tooltip-id="ws-directory-item"
data-tooltip-content={JSON.stringify({
title: item.title,
date: formatDate(item?.published),
extension: getFileExtension(item.url).toUpperCase(),
date: formatDateTimeAsMoment(item?.published),
extension: getFileExtension(item.url),
})}
>
<div className="shrink-0 w-3 h-3">

View File

@@ -18,6 +18,7 @@ import {
import ConfluenceLogo from "@/media/dataConnectors/confluence.png";
import DrupalWikiLogo from "@/media/dataConnectors/drupalwiki.png";
import ObsidianLogo from "@/media/dataConnectors/obsidian.png";
import PaperlessNgxLogo from "@/media/dataConnectors/paperlessngx.png";
import { toPercentString } from "@/utils/numbers";
import { useTranslation } from "react-i18next";
import pluralize from "pluralize";
@@ -216,6 +217,7 @@ const supportedSources = [
"drupalwiki://",
"youtube://",
"obsidian://",
"paperless-ngx://",
];
/**
@@ -291,6 +293,11 @@ function parseChunkSource({ title = "", chunks = [] }) {
icon = "obsidian";
break;
case "paperless-ngx://":
text = title;
icon = "paperlessNgx";
break;
default:
text = url.host + url.pathname;
icon = "link";
@@ -318,7 +325,15 @@ const DrupalWikiIcon = ({ size = 16, ...props }) => (
const ObsidianIcon = ({ size = 16, ...props }) => (
<img src={ObsidianLogo} {...props} width={size} height={size} />
);
const PaperlessNgxIcon = ({ size = 16, ...props }) => (
<img
src={PaperlessNgxLogo}
{...props}
width={size}
height={size}
className="rounded-sm bg-white"
/>
);
const ICONS = {
file: FileText,
link: LinkSimple,
@@ -328,4 +343,5 @@ const ICONS = {
confluence: ConfluenceIcon,
drupalwiki: DrupalWikiIcon,
obsidian: ObsidianIcon,
paperlessNgx: PaperlessNgxIcon,
};

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

View File

@@ -207,6 +207,25 @@ const DataConnector = {
});
},
},
paperlessNgx: {
collect: async function ({ baseUrl, apiToken }) {
return await fetch(`${API_BASE}/ext/paperless-ngx`, {
method: "POST",
headers: baseHeaders(),
body: JSON.stringify({ baseUrl, apiToken }),
})
.then((res) => res.json())
.then((res) => {
if (!res.success) throw new Error(res.reason);
return { data: res.data, error: null };
})
.catch((e) => {
console.error(e);
return { data: null, error: e.message };
});
},
},
};
export default DataConnector;

View File

@@ -1,3 +1,5 @@
import moment from "moment";
export function formatDate(dateString) {
const date = isNaN(new Date(dateString).getTime())
? new Date()
@@ -7,8 +9,20 @@ export function formatDate(dateString) {
return formattedDate;
}
/**
 * Formats a date string with moment, falling back to the current date/time when
 * the input is missing or cannot be parsed into a valid date.
 * @param {string} dateString - the date to format
 * @param {string} [format="LLL"] - moment format string
 * @returns {string} the formatted date
 */
export function formatDateTimeAsMoment(dateString, format = "LLL") {
  if (!dateString) return moment().format(format);
  try {
    const parsed = moment(dateString);
    // moment does not throw on unparseable input; it returns an invalid moment
    // whose format() yields "Invalid date", so validity must be checked explicitly.
    if (!parsed.isValid()) return moment().format(format);
    return parsed.format(format);
  } catch (error) {
    return moment().format(format);
  }
}
/**
 * Returns the upper-cased extension of a file path or URL, or "FILE" when there
 * is none. Only the last path segment is inspected, so dots earlier in the
 * string (for example in a URL host) are not mistaken for an extension.
 * @param {string} path - file path or URL
 * @returns {string} e.g. "PDF", or "FILE" when no extension can be determined
 */
export function getFileExtension(path) {
  if (typeof path !== "string") return "FILE";
  const lastSegment = path.split("/").pop();
  const dotIndex = lastSegment.lastIndexOf(".");
  if (dotIndex === -1) return "FILE";
  // A trailing dot leaves an empty extension; treat that as "no extension".
  return lastSegment.slice(dotIndex + 1).toUpperCase() || "FILE";
}
export function middleTruncate(str, n) {

View File

@@ -170,6 +170,28 @@ function extensionEndpoints(app) {
}
}
);
// Server endpoint for the Paperless-ngx connector. The request body is forwarded
// unchanged to the collector's "/ext/paperless-ngx" endpoint, an
// "extension_invoked" telemetry event is sent, and the collector's reply is
// returned with status 200. Any thrown error results in a 500.
app.post(
  "/ext/paperless-ngx",
  [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],
  async (request, response) => {
    try {
      const collector = new CollectorApi();
      const collectorReply = await collector.forwardExtensionRequest({
        endpoint: "/ext/paperless-ngx",
        method: "POST",
        body: request.body,
      });

      const telemetryDetails = { type: "paperless_ngx" };
      await Telemetry.sendTelemetry("extension_invoked", telemetryDetails);

      response.status(200).json(collectorReply);
    } catch (error) {
      console.error(error);
      response.sendStatus(500).end();
    }
  }
);
}
module.exports = { extensionEndpoints };