mirror of
https://github.com/Mintplex-Labs/anything-llm
synced 2026-04-25 17:15:37 +02:00
Feature/drupalwiki collector (#3693)
* Implement DrupalWiki collector * Add attachment downloading and processing functionality (#3) * linting * Linting Add citation image small refactors add URL for citation identifier --------- Co-authored-by: em <eugen.mayer@kontextwork.de> Co-authored-by: rexjohannes <53578137+rexjohannes@users.noreply.github.com> Co-authored-by: Eugen Mayer <136934+EugenMayer@users.noreply.github.com>
This commit is contained in:
@@ -154,6 +154,32 @@ function extensions(app) {
|
||||
return;
|
||||
}
|
||||
);
|
||||
|
||||
// Collector endpoint that imports every page of the requested DrupalWiki spaces.
// The request body is handed to `loadAndStoreSpaces` unchanged; its result object
// is relayed to the caller with HTTP 200 even when `success` is false.
app.post(
  "/ext/drupalwiki",
  [verifyPayloadIntegrity, setDataSigner],
  async function (request, response) {
    try {
      // Loaded lazily so the extension is only required when the route is hit.
      const { loadAndStoreSpaces } = require("../utils/extensions/DrupalWiki");
      const outcome = await loadAndStoreSpaces(reqBody(request), response);
      const { success, reason, data } = outcome;
      response.status(200).json({ success, reason, data });
    } catch (error) {
      // Anything thrown by the loader is reported as a 400 with the error message.
      console.error(error);
      const failure = {
        success: false,
        reason: error.message,
        data: {
          title: null,
          author: null,
        },
      };
      response.status(400).json(failure);
    }
  }
);
|
||||
}
|
||||
|
||||
module.exports = extensions;
|
||||
|
||||
@@ -2,7 +2,7 @@ const { getLinkText } = require("../../processLink");
|
||||
|
||||
/**
|
||||
* Fetches the content of a raw link. Returns the content as a text string of the link in question.
|
||||
* @param {object} data - metadata from document (eg: link)
|
||||
* @param {object} data - metadata from document (eg: link)
|
||||
* @param {import("../../middleware/setDataSigner").ResponseWithSigner} response
|
||||
*/
|
||||
async function resyncLink({ link }, response) {
|
||||
@@ -24,7 +24,7 @@ async function resyncLink({ link }, response) {
|
||||
* Fetches the content of a YouTube link. Returns the content as a text string of the video in question.
|
||||
* We offer this as there may be some videos where a transcription could be manually edited after initial scraping
|
||||
* but in general - transcriptions often never change.
|
||||
* @param {object} data - metadata from document (eg: link)
|
||||
* @param {object} data - metadata from document (eg: link)
|
||||
* @param {import("../../middleware/setDataSigner").ResponseWithSigner} response
|
||||
*/
|
||||
async function resyncYouTube({ link }, response) {
|
||||
@@ -44,9 +44,9 @@ async function resyncYouTube({ link }, response) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches the content of a specific confluence page via its chunkSource.
|
||||
* Fetches the content of a specific confluence page via its chunkSource.
|
||||
* Returns the content as a text string of the page in question and only that page.
|
||||
* @param {object} data - metadata from document (eg: chunkSource)
|
||||
* @param {object} data - metadata from document (eg: chunkSource)
|
||||
* @param {import("../../middleware/setDataSigner").ResponseWithSigner} response
|
||||
*/
|
||||
async function resyncConfluence({ chunkSource }, response) {
|
||||
@@ -76,9 +76,9 @@ async function resyncConfluence({ chunkSource }, response) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches the content of a specific confluence page via its chunkSource.
|
||||
* Fetches the content of a specific confluence page via its chunkSource.
|
||||
* Returns the content as a text string of the page in question and only that page.
|
||||
* @param {object} data - metadata from document (eg: chunkSource)
|
||||
* @param {object} data - metadata from document (eg: chunkSource)
|
||||
* @param {import("../../middleware/setDataSigner").ResponseWithSigner} response
|
||||
*/
|
||||
async function resyncGithub({ chunkSource }, response) {
|
||||
@@ -106,9 +106,48 @@ async function resyncGithub({ chunkSource }, response) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Fetches the content of a specific DrupalWiki page via its chunkSource.
|
||||
* Returns the content as a text string of the page in question and only that page.
|
||||
* @param {object} data - metadata from document (eg: chunkSource)
|
||||
* @param {import("../../middleware/setDataSigner").ResponseWithSigner} response
|
||||
*/
|
||||
async function resyncDrupalWiki({ chunkSource }, response) {
|
||||
if (!chunkSource) throw new Error('Invalid source property provided');
|
||||
try {
|
||||
// DrupalWiki data is `payload` encrypted. So we need to expand its
|
||||
// encrypted payload back into query params so we can reFetch the page with same access token/params.
|
||||
const source = response.locals.encryptionWorker.expandPayload(chunkSource);
|
||||
const { loadPage } = require("../../utils/extensions/DrupalWiki");
|
||||
const { success, reason, content } = await loadPage({
|
||||
baseUrl: source.searchParams.get('baseUrl'),
|
||||
pageId: source.searchParams.get('pageId'),
|
||||
accessToken: source.searchParams.get('accessToken'),
|
||||
});
|
||||
|
||||
if (!success) {
|
||||
console.error(`Failed to sync DrupalWiki page content. ${reason}`);
|
||||
response.status(200).json({
|
||||
success: false,
|
||||
content: null,
|
||||
});
|
||||
} else {
|
||||
response.status(200).json({ success, content });
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
response.status(200).json({
|
||||
success: false,
|
||||
content: null,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
link: resyncLink,
|
||||
youtube: resyncYouTube,
|
||||
confluence: resyncConfluence,
|
||||
github: resyncGithub,
|
||||
}
|
||||
drupalwiki: resyncDrupalWiki,
|
||||
}
|
||||
|
||||
320
collector/utils/extensions/DrupalWiki/DrupalWiki/index.js
Normal file
320
collector/utils/extensions/DrupalWiki/DrupalWiki/index.js
Normal file
@@ -0,0 +1,320 @@
|
||||
/**
|
||||
* Copyright 2024
|
||||
*
|
||||
* Authors:
|
||||
* - Eugen Mayer (KontextWork)
|
||||
*/
|
||||
|
||||
const { htmlToText } = require("html-to-text");
|
||||
const { tokenizeString } = require("../../../tokenizer");
|
||||
const { sanitizeFileName, writeToServerDocuments } = require("../../../files");
|
||||
const { default: slugify } = require("slugify");
|
||||
const path = require("path");
|
||||
const fs = require("fs");
|
||||
const { processSingleFile } = require("../../../../processSingleFile");
|
||||
const {
|
||||
WATCH_DIRECTORY,
|
||||
SUPPORTED_FILETYPE_CONVERTERS,
|
||||
} = require("../../../constants");
|
||||
|
||||
/**
 * Plain value object holding the fields of one DrupalWiki page that the
 * loader works with.
 */
class Page {
  /**
   * Copies the known fields from `fields` onto the instance; any other keys
   * on `fields` are ignored. Fields that are not supplied are stored as
   * `undefined`.
   *
   * @param {object} fields
   * @param {number} fields.id
   * @param {string} fields.title
   * @param {string} fields.created
   * @param {string} fields.type
   * @param {string} fields.processedBody
   * @param {string} fields.url
   * @param {number} fields.spaceId
   */
  constructor(fields) {
    const { id, title, created, type, processedBody, url, spaceId } = fields;
    Object.assign(this, {
      id,
      title,
      url,
      created,
      type,
      processedBody,
      spaceId,
    });
  }
}
|
||||
|
||||
class DrupalWiki {
|
||||
/**
|
||||
*
|
||||
* @param baseUrl
|
||||
* @param spaceId
|
||||
* @param accessToken
|
||||
*/
|
||||
constructor({ baseUrl, accessToken }) {
|
||||
this.baseUrl = baseUrl;
|
||||
this.accessToken = accessToken;
|
||||
this.storagePath = this.#prepareStoragePath(baseUrl);
|
||||
}
|
||||
|
||||
/**
|
||||
* Load all pages for the given space, fetching storing each page one by one
|
||||
* to minimize the memory usage
|
||||
*
|
||||
* @param {number} spaceId
|
||||
* @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async loadAndStoreAllPagesForSpace(spaceId, encryptionWorker) {
|
||||
const pageIndex = await this.#getPageIndexForSpace(spaceId);
|
||||
for (const pageId of pageIndex) {
|
||||
try {
|
||||
const page = await this.loadPage(pageId);
|
||||
|
||||
// Pages with an empty body will lead to embedding issues / exceptions
|
||||
if (page.processedBody.trim() !== "") {
|
||||
this.#storePage(page, encryptionWorker);
|
||||
await this.#downloadAndProcessAttachments(page.id);
|
||||
} else {
|
||||
console.log(`Skipping page (${page.id}) since it has no content`);
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(
|
||||
`Could not process DrupalWiki page ${pageId} (skipping and continuing): `
|
||||
);
|
||||
console.error(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {number} pageId
|
||||
* @returns {Promise<Page>}
|
||||
*/
|
||||
async loadPage(pageId) {
|
||||
return this.#fetchPage(pageId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches the page ids for the configured space
|
||||
* @param {number} spaceId
|
||||
* @returns{Promise<number[]>} array of pageIds
|
||||
*/
|
||||
async #getPageIndexForSpace(spaceId) {
|
||||
// errors on fetching the pageIndex is fatal, no error handling
|
||||
let hasNext = true;
|
||||
let pageIds = [];
|
||||
let pageNr = 0;
|
||||
do {
|
||||
let { isLast, pageIdsForPage } = await this.#getPagesForSpacePaginated(
|
||||
spaceId,
|
||||
pageNr
|
||||
);
|
||||
hasNext = !isLast;
|
||||
pageNr++;
|
||||
if (pageIdsForPage.length) {
|
||||
pageIds = pageIds.concat(pageIdsForPage);
|
||||
}
|
||||
} while (hasNext);
|
||||
|
||||
return pageIds;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {number} pageNr
|
||||
* @param {number} spaceId
|
||||
* @returns {Promise<{isLast,pageIds}>}
|
||||
*/
|
||||
async #getPagesForSpacePaginated(spaceId, pageNr) {
|
||||
/*
|
||||
* {
|
||||
* content: Page[],
|
||||
* last: boolean,
|
||||
* pageable: {
|
||||
* pageNumber: number
|
||||
* }
|
||||
* }
|
||||
*/
|
||||
const data = await this._doFetch(
|
||||
`${this.baseUrl}/api/rest/scope/api/page?size=100&space=${spaceId}&page=${pageNr}`
|
||||
);
|
||||
|
||||
const pageIds = data.content.map((page) => {
|
||||
return Number(page.id);
|
||||
});
|
||||
|
||||
return {
|
||||
isLast: data.last,
|
||||
pageIdsForPage: pageIds,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pageId
|
||||
* @returns {Promise<Page>}
|
||||
*/
|
||||
async #fetchPage(pageId) {
|
||||
const data = await this._doFetch(
|
||||
`${this.baseUrl}/api/rest/scope/api/page/${pageId}`
|
||||
);
|
||||
const url = `${this.baseUrl}/node/${data.id}`;
|
||||
return new Page({
|
||||
id: data.id,
|
||||
title: data.title,
|
||||
created: data.lastModified,
|
||||
type: data.type,
|
||||
processedBody: this.#processPageBody({
|
||||
body: data.body,
|
||||
title: data.title,
|
||||
lastModified: data.lastModified,
|
||||
url: url,
|
||||
}),
|
||||
url: url,
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Page} page
|
||||
* @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
|
||||
*/
|
||||
#storePage(page, encryptionWorker) {
|
||||
const { hostname } = new URL(this.baseUrl);
|
||||
|
||||
// This UUID will ensure that re-importing the same page without any changes will not
|
||||
// show up (deduplication).
|
||||
const targetUUID = `${hostname}.${page.spaceId}.${page.id}.${page.created}`;
|
||||
const wordCount = page.processedBody.split(" ").length;
|
||||
const tokenCount =
|
||||
page.processedBody.length > 0
|
||||
? tokenizeString(page.processedBody).length
|
||||
: 0;
|
||||
const data = {
|
||||
id: targetUUID,
|
||||
url: `drupalwiki://${page.url}`,
|
||||
title: page.title,
|
||||
docAuthor: this.baseUrl,
|
||||
description: page.title,
|
||||
docSource: `${this.baseUrl} DrupalWiki`,
|
||||
chunkSource: this.#generateChunkSource(page.id, encryptionWorker),
|
||||
published: new Date().toLocaleString(),
|
||||
wordCount: wordCount,
|
||||
pageContent: page.processedBody,
|
||||
token_count_estimate: tokenCount,
|
||||
};
|
||||
|
||||
const fileName = sanitizeFileName(`${slugify(page.title)}-${data.id}`);
|
||||
console.log(
|
||||
`[DrupalWiki Loader]: Saving page '${page.title}' (${page.id}) to '${this.storagePath}/${fileName}'`
|
||||
);
|
||||
writeToServerDocuments(data, fileName, this.storagePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate the full chunkSource for a specific Confluence page so that we can resync it later.
|
||||
* This data is encrypted into a single `payload` query param so we can replay credentials later
|
||||
* since this was encrypted with the systems persistent password and salt.
|
||||
* @param {number} pageId
|
||||
* @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
|
||||
* @returns {string}
|
||||
*/
|
||||
#generateChunkSource(pageId, encryptionWorker) {
|
||||
const payload = {
|
||||
baseUrl: this.baseUrl,
|
||||
pageId: pageId,
|
||||
accessToken: this.accessToken,
|
||||
};
|
||||
return `drupalwiki://${this.baseUrl}?payload=${encryptionWorker.encrypt(
|
||||
JSON.stringify(payload)
|
||||
)}`;
|
||||
}
|
||||
|
||||
async _doFetch(url) {
|
||||
const response = await fetch(url, {
|
||||
headers: this.#getHeaders(),
|
||||
});
|
||||
if (!response.ok) {
|
||||
throw new Error(`Failed to fetch ${url}: ${response.status}`);
|
||||
}
|
||||
return response.json();
|
||||
}
|
||||
|
||||
#getHeaders() {
|
||||
return {
|
||||
"Content-Type": "application/json",
|
||||
Accept: "application/json",
|
||||
Authorization: `Bearer ${this.accessToken}`,
|
||||
};
|
||||
}
|
||||
|
||||
#prepareStoragePath(baseUrl) {
|
||||
const { hostname } = new URL(baseUrl);
|
||||
const subFolder = slugify(`drupalwiki-${hostname}`).toLowerCase();
|
||||
|
||||
const outFolder =
|
||||
process.env.NODE_ENV === "development"
|
||||
? path.resolve(
|
||||
__dirname,
|
||||
`../../../../server/storage/documents/${subFolder}`
|
||||
)
|
||||
: path.resolve(process.env.STORAGE_DIR, `documents/${subFolder}`);
|
||||
|
||||
if (!fs.existsSync(outFolder)) {
|
||||
fs.mkdirSync(outFolder, { recursive: true });
|
||||
}
|
||||
return outFolder;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} body
|
||||
* @param {string} url
|
||||
* @param {string} title
|
||||
* @param {string} lastModified
|
||||
* @returns {string}
|
||||
* @private
|
||||
*/
|
||||
#processPageBody({ body, url, title, lastModified }) {
|
||||
// use the title as content if there is none
|
||||
const textContent = body.trim() !== "" ? body : title;
|
||||
|
||||
const plainTextContent = htmlToText(textContent, {
|
||||
wordwrap: false,
|
||||
preserveNewlines: true,
|
||||
});
|
||||
// preserve structure
|
||||
const plainBody = plainTextContent.replace(/\n{3,}/g, "\n\n");
|
||||
// add the link to the document
|
||||
return `Link/URL: ${url}\n\n${plainBody}`;
|
||||
}
|
||||
|
||||
async #downloadAndProcessAttachments(pageId) {
|
||||
try {
|
||||
const data = await this._doFetch(
|
||||
`${this.baseUrl}/api/rest/scope/api/attachment?pageId=${pageId}&size=2000`
|
||||
);
|
||||
|
||||
const extensionsList = Object.keys(SUPPORTED_FILETYPE_CONVERTERS);
|
||||
for (const attachment of data.content || data) {
|
||||
const { fileName, id: attachId } = attachment;
|
||||
const lowerName = fileName.toLowerCase();
|
||||
if (!extensionsList.some((ext) => lowerName.endsWith(ext))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const downloadUrl = `${this.baseUrl}/api/rest/scope/api/attachment/${attachId}/download`;
|
||||
const attachmentResponse = await fetch(downloadUrl, {
|
||||
headers: this.#getHeaders(),
|
||||
});
|
||||
if (!attachmentResponse.ok) {
|
||||
console.log(`Skipping attachment: ${fileName} - Download failed`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const buffer = await attachmentResponse.arrayBuffer();
|
||||
const localFilePath = `${WATCH_DIRECTORY}/${fileName}`;
|
||||
require("fs").writeFileSync(localFilePath, Buffer.from(buffer));
|
||||
|
||||
await processSingleFile(fileName);
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`Fetching/processing attachments failed:`, err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = { DrupalWiki };
|
||||
102
collector/utils/extensions/DrupalWiki/index.js
Normal file
102
collector/utils/extensions/DrupalWiki/index.js
Normal file
@@ -0,0 +1,102 @@
|
||||
/**
|
||||
* Copyright 2024
|
||||
*
|
||||
* Authors:
|
||||
* - Eugen Mayer (KontextWork)
|
||||
*/
|
||||
|
||||
const { DrupalWiki } = require("./DrupalWiki");
|
||||
const { validBaseUrl } = require("../../../utils/http");
|
||||
|
||||
/**
 * Imports all pages of the given DrupalWiki spaces into the documents storage.
 *
 * Input problems are reported through the returned `reason` instead of throwing.
 * Processing stops at the first space that fails.
 *
 * @param {object} args
 * @param {string|null} [args.baseUrl] - base URL of the wiki
 * @param {string|Array<string|number>|null} [args.spaceIds] - comma separated list
 *   (or array) of numeric space ids; empty entries are ignored
 * @param {string|null} [args.accessToken] - REST API token
 * @param {object} response - response object carrying `locals.encryptionWorker`
 * @returns {Promise<{success: boolean, reason: string|null, data?: object}>}
 */
async function loadAndStoreSpaces(
  { baseUrl = null, spaceIds = null, accessToken = null },
  response
) {
  if (!baseUrl) {
    return {
      success: false,
      reason:
        "Please provide your baseUrl like https://mywiki.drupal-wiki.net.",
    };
  } else if (!validBaseUrl(baseUrl)) {
    return {
      success: false,
      reason: "Provided base URL is not a valid URL.",
    };
  }

  if (!spaceIds) {
    return {
      success: false,
      reason:
        "Please provide a list of spaceIds like 21,56,67 you want to extract",
    };
  }

  if (!accessToken) {
    return {
      success: false,
      reason: "Please provide a REST API-Token.",
    };
  }

  // Parse the ids before touching the wiki or the file system. `Number("")` is 0
  // and `Number("abc")` is NaN, so empty entries (e.g. a trailing comma) are
  // dropped and anything that is not a non-negative integer is rejected.
  const rawIds = Array.isArray(spaceIds) ? spaceIds : String(spaceIds).split(",");
  const spaceIdsArr = [];
  for (const rawId of rawIds) {
    const trimmed = String(rawId).trim();
    if (trimmed === "") continue;
    const id = Number(trimmed);
    if (!Number.isInteger(id) || id < 0) {
      return {
        success: false,
        reason: `Invalid spaceId "${trimmed}". Please provide a list of spaceIds like 21,56,67 you want to extract`,
      };
    }
    spaceIdsArr.push(id);
  }
  if (spaceIdsArr.length === 0) {
    return {
      success: false,
      reason:
        "Please provide a list of spaceIds like 21,56,67 you want to extract",
    };
  }

  console.log(`-- Working Drupal Wiki ${baseUrl} for spaceIds: ${spaceIds} --`);
  const drupalWiki = new DrupalWiki({ baseUrl, accessToken });

  const encryptionWorker = response.locals.encryptionWorker;

  for (const spaceId of spaceIdsArr) {
    try {
      await drupalWiki.loadAndStoreAllPagesForSpace(spaceId, encryptionWorker);
      console.log(`--- Finished space ${spaceId} ---`);
    } catch (e) {
      console.error(e);
      return {
        success: false,
        reason: e.message,
        data: {},
      };
    }
  }
  console.log(`-- Finished all spaces--`);

  return {
    success: true,
    reason: null,
    data: {
      spaceIds,
      destination: drupalWiki.storagePath,
    },
  };
}
|
||||
|
||||
/**
 * Gets the page content from a specific DrupalWiki page, not all pages in a space.
 * Never rejects: every failure (including an unusable baseUrl) is logged and
 * reported through the returned `reason`.
 *
 * @param {object} args
 * @param {string} args.baseUrl - base URL of the wiki
 * @param {number|string} args.pageId - id of the page to fetch
 * @param {string} args.accessToken - REST API token
 * @returns {Promise<{success: boolean, reason: string|null, content: string|null}>}
 */
async function loadPage({ baseUrl, pageId, accessToken }) {
  console.log(`-- Working Drupal Wiki Page ${pageId} of ${baseUrl} --`);
  try {
    // Constructed inside the try block because the constructor itself can throw.
    const drupalWiki = new DrupalWiki({ baseUrl, accessToken });
    const page = await drupalWiki.loadPage(pageId);
    return {
      success: true,
      reason: null,
      content: page.processedBody,
    };
  } catch (e) {
    // Keep the underlying cause visible instead of silently discarding it.
    console.error(e);
    return {
      success: false,
      reason: `Failed (re)-fetching DrupalWiki page ${pageId} from ${baseUrl}: ${e.message}`,
      content: null,
    };
  }
}
|
||||
|
||||
module.exports = {
|
||||
loadAndStoreSpaces,
|
||||
loadPage,
|
||||
};
|
||||
@@ -12,7 +12,24 @@ function queryParams(request) {
|
||||
return request.query;
|
||||
}
|
||||
|
||||
/**
 * Tells whether `baseUrl` can be parsed by the `URL` constructor.
 * - Reachability or accessibility of the URL is not checked.
 * - No further checks are applied, unlike `validURL` in `utils/url/index.js`.
 * @param {string} baseUrl
 * @returns {boolean} true when parsing succeeds, false when it throws
 */
function validBaseUrl(baseUrl) {
  let parsable = true;
  try {
    new URL(baseUrl);
  } catch (_err) {
    parsable = false;
  }
  return parsable;
}
|
||||
|
||||
module.exports = {
|
||||
reqBody,
|
||||
queryParams,
|
||||
validBaseUrl,
|
||||
};
|
||||
|
||||
BIN
frontend/src/components/DataConnectorOption/media/drupalwiki.jpg
Normal file
BIN
frontend/src/components/DataConnectorOption/media/drupalwiki.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 7.1 KiB |
@@ -3,6 +3,7 @@ import GitLab from "./gitlab.svg";
|
||||
import YouTube from "./youtube.svg";
|
||||
import Link from "./link.svg";
|
||||
import Confluence from "./confluence.jpeg";
|
||||
import DrupalWiki from "./drupalwiki.jpg";
|
||||
|
||||
const ConnectorImages = {
|
||||
github: GitHub,
|
||||
@@ -10,6 +11,7 @@ const ConnectorImages = {
|
||||
youtube: YouTube,
|
||||
websiteDepth: Link,
|
||||
confluence: Confluence,
|
||||
drupalwiki: DrupalWiki,
|
||||
};
|
||||
|
||||
export default ConnectorImages;
|
||||
|
||||
@@ -0,0 +1,190 @@
|
||||
/**
|
||||
* Copyright 2024
|
||||
*
|
||||
* Authors:
|
||||
* - Eugen Mayer (KontextWork)
|
||||
*/
|
||||
|
||||
import { useState } from "react";
|
||||
import System from "@/models/system";
|
||||
import showToast from "@/utils/toast";
|
||||
import { Warning } from "@phosphor-icons/react";
|
||||
import { Tooltip } from "react-tooltip";
|
||||
|
||||
export default function DrupalWikiOptions() {
|
||||
const [loading, setLoading] = useState(false);
|
||||
|
||||
// Submits the connector form: reads baseUrl, spaceIds and accessToken from the
// form, asks the backend to collect the pages and reports the outcome as a toast.
// The form is only reset after a successful collection.
const handleSubmit = async (e) => {
  e.preventDefault();
  const form = new FormData(e.target);

  try {
    setLoading(true);
    // The info toast stays open until it is replaced by the result toast.
    showToast(
      "Fetching all pages for the given Drupal Wiki spaces - this may take a while.",
      "info",
      {
        clear: true,
        autoClose: false,
      }
    );

    const credentials = {
      baseUrl: form.get("baseUrl"),
      spaceIds: form.get("spaceIds"),
      accessToken: form.get("accessToken"),
    };
    const { data, error } =
      await System.dataConnectors.drupalwiki.collect(credentials);

    if (error) {
      showToast(error, "error", { clear: true });
    } else {
      showToast(
        `Pages collected from Drupal Wiki spaces ${data.spaceIds}. Output folder is ${data.destination}.`,
        "success",
        { clear: true }
      );
      e.target.reset();
    }
    setLoading(false);
  } catch (err) {
    console.error(err);
    showToast(err.message, "error", { clear: true });
    setLoading(false);
  }
};
|
||||
|
||||
return (
|
||||
<div className="flex w-full">
|
||||
<div className="flex flex-col w-full px-1 md:pb-6 pb-16">
|
||||
<form className="w-full" onSubmit={handleSubmit}>
|
||||
<div className="w-full flex flex-col py-2">
|
||||
<div className="w-full flex flex-col gap-4">
|
||||
<div className="flex flex-col pr-10">
|
||||
<div className="flex flex-col gap-y-1 mb-4">
|
||||
<label className="text-white text-sm font-bold flex gap-x-2 items-center">
|
||||
<p className="font-bold text-white">Drupal Wiki base URL</p>
|
||||
</label>
|
||||
<p className="text-xs font-normal text-theme-text-secondary">
|
||||
This is the base URL of your
|
||||
<a
|
||||
href="https://drupal-wiki.com"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="underline"
|
||||
>
|
||||
Drupal Wiki
|
||||
</a>
|
||||
.
|
||||
</p>
|
||||
</div>
|
||||
<input
|
||||
type="url"
|
||||
name="baseUrl"
|
||||
className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
|
||||
placeholder="eg: https://mywiki.drupal-wiki.net, https://drupalwiki.mycompany.tld, etc..."
|
||||
required={true}
|
||||
autoComplete="off"
|
||||
spellCheck={false}
|
||||
/>
|
||||
</div>
|
||||
<div className="flex flex-col pr-10">
|
||||
<div className="flex flex-col gap-y-1 mb-4">
|
||||
<label className="text-white text-sm font-bold">
|
||||
Drupal Wiki Space IDs
|
||||
</label>
|
||||
<p className="text-xs font-normal text-theme-text-secondary">
|
||||
Comma seperated Space IDs you want to extract. See the
|
||||
<a
|
||||
href="https://help.drupal-wiki.com/node/606"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="underline"
|
||||
onClick={(e) => e.stopPropagation()}
|
||||
>
|
||||
manual
|
||||
</a>
|
||||
on how to retrieve the Space IDs. Be sure that your
|
||||
'API-Token User' has access to those spaces.
|
||||
</p>
|
||||
</div>
|
||||
<input
|
||||
type="text"
|
||||
name="spaceIds"
|
||||
className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
|
||||
placeholder="eg: 12,34,69"
|
||||
required={true}
|
||||
autoComplete="off"
|
||||
spellCheck={false}
|
||||
/>
|
||||
</div>
|
||||
<div className="flex flex-col pr-10">
|
||||
<div className="flex flex-col gap-y-1 mb-4">
|
||||
<label className="text-white text-sm font-bold flex gap-x-2 items-center">
|
||||
<p className="font-bold text-white">
|
||||
Drupal Wiki API Token
|
||||
</p>
|
||||
<Warning
|
||||
size={14}
|
||||
className="ml-1 text-orange-500 cursor-pointer"
|
||||
data-tooltip-id="access-token-tooltip"
|
||||
data-tooltip-place="right"
|
||||
/>
|
||||
<Tooltip
|
||||
delayHide={300}
|
||||
id="access-token-tooltip"
|
||||
className="max-w-xs z-99"
|
||||
clickable={true}
|
||||
>
|
||||
<p className="text-sm font-light text-theme-text-primary">
|
||||
You need to provide an API token for authentication. See
|
||||
the Drupal Wiki
|
||||
<a
|
||||
href="https://help.drupal-wiki.com/node/605#2-Zugriffs-Token-generieren"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="underline"
|
||||
>
|
||||
manual
|
||||
</a>
|
||||
on how to generate an API-Token for your user.
|
||||
</p>
|
||||
</Tooltip>
|
||||
</label>
|
||||
<p className="text-xs font-normal text-theme-text-secondary">
|
||||
Access token for authentication.
|
||||
</p>
|
||||
</div>
|
||||
<input
|
||||
type="password"
|
||||
name="accessToken"
|
||||
className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
|
||||
placeholder="pat:123"
|
||||
required={true}
|
||||
autoComplete="off"
|
||||
spellCheck={false}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="flex flex-col gap-y-2 w-full pr-10">
|
||||
<button
|
||||
type="submit"
|
||||
disabled={loading}
|
||||
className="mt-2 w-full justify-center border border-slate-200 px-4 py-2 rounded-lg text-dark-text text-sm font-bold items-center flex gap-x-2 bg-slate-200 hover:bg-slate-300 hover:text-slate-800 disabled:bg-slate-300 disabled:cursor-not-allowed"
|
||||
>
|
||||
{loading ? "Collecting pages..." : "Submit"}
|
||||
</button>
|
||||
{loading && (
|
||||
<p className="text-xs text-theme-text-secondary">
|
||||
Once complete, all pages will be available for embedding into
|
||||
workspaces.
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -5,6 +5,7 @@ import GithubOptions from "./Connectors/Github";
|
||||
import GitlabOptions from "./Connectors/Gitlab";
|
||||
import YoutubeOptions from "./Connectors/Youtube";
|
||||
import ConfluenceOptions from "./Connectors/Confluence";
|
||||
import DrupalWikiOptions from "./Connectors/DrupalWiki";
|
||||
import { useState } from "react";
|
||||
import ConnectorOption from "./ConnectorOption";
|
||||
import WebsiteDepthOptions from "./Connectors/WebsiteDepth";
|
||||
@@ -40,6 +41,12 @@ export const getDataConnectors = (t) => ({
|
||||
description: t("connectors.confluence.description"),
|
||||
options: <ConfluenceOptions />,
|
||||
},
|
||||
drupalwiki: {
|
||||
name: "Drupal Wiki",
|
||||
image: ConnectorImages.drupalwiki,
|
||||
description: "Import Drupal Wiki spaces in a single click.",
|
||||
options: <DrupalWikiOptions />,
|
||||
},
|
||||
});
|
||||
|
||||
export default function DataConnectors() {
|
||||
|
||||
@@ -15,6 +15,7 @@ import {
|
||||
YoutubeLogo,
|
||||
} from "@phosphor-icons/react";
|
||||
import ConfluenceLogo from "@/media/dataConnectors/confluence.png";
|
||||
import DrupalWikiLogo from "@/media/dataConnectors/drupalwiki.png";
|
||||
import { toPercentString } from "@/utils/numbers";
|
||||
|
||||
function combineLikeSources(sources) {
|
||||
@@ -197,14 +198,17 @@ function parseChunkSource({ title = "", chunks = [] }) {
|
||||
!chunks.length ||
|
||||
(!chunks[0].chunkSource?.startsWith("link://") &&
|
||||
!chunks[0].chunkSource?.startsWith("confluence://") &&
|
||||
!chunks[0].chunkSource?.startsWith("github://"))
|
||||
!chunks[0].chunkSource?.startsWith("github://") &&
|
||||
!chunks[0].chunkSource?.startsWith("drupalwiki://"))
|
||||
)
|
||||
return nullResponse;
|
||||
|
||||
try {
|
||||
const url = new URL(
|
||||
chunks[0].chunkSource.split("link://")[1] ||
|
||||
chunks[0].chunkSource.split("confluence://")[1] ||
|
||||
chunks[0].chunkSource.split("github://")[1]
|
||||
chunks[0].chunkSource.split("github://")[1] ||
|
||||
chunks[0].chunkSource.split("drupalwiki://")[1]
|
||||
);
|
||||
let text = url.host + url.pathname;
|
||||
let icon = "link";
|
||||
@@ -224,6 +228,11 @@ function parseChunkSource({ title = "", chunks = [] }) {
|
||||
icon = "confluence";
|
||||
}
|
||||
|
||||
if (url.host.includes("drupal-wiki.net")) {
|
||||
text = title;
|
||||
icon = "drupalwiki";
|
||||
}
|
||||
|
||||
return {
|
||||
isUrl: true,
|
||||
href: url.toString(),
|
||||
@@ -239,10 +248,16 @@ const ConfluenceIcon = ({ ...props }) => (
|
||||
<img src={ConfluenceLogo} {...props} />
|
||||
);
|
||||
|
||||
// Patch to render DrupalWiki icon as a element like we do with Phosphor
|
||||
const DrupalWikiIcon = ({ ...props }) => (
|
||||
<img src={DrupalWikiLogo} {...props} />
|
||||
);
|
||||
|
||||
const ICONS = {
|
||||
file: FileText,
|
||||
link: Link,
|
||||
youtube: YoutubeLogo,
|
||||
github: GithubLogo,
|
||||
confluence: ConfluenceIcon,
|
||||
drupalwiki: DrupalWikiIcon,
|
||||
};
|
||||
|
||||
BIN
frontend/src/media/dataConnectors/drupalwiki.png
Normal file
BIN
frontend/src/media/dataConnectors/drupalwiki.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
@@ -162,6 +162,29 @@ const DataConnector = {
|
||||
});
|
||||
},
|
||||
},
|
||||
|
||||
drupalwiki: {
|
||||
collect: async function ({ baseUrl, spaceIds, accessToken }) {
|
||||
return await fetch(`${API_BASE}/ext/drupalwiki`, {
|
||||
method: "POST",
|
||||
headers: baseHeaders(),
|
||||
body: JSON.stringify({
|
||||
baseUrl,
|
||||
spaceIds,
|
||||
accessToken,
|
||||
}),
|
||||
})
|
||||
.then((res) => res.json())
|
||||
.then((res) => {
|
||||
if (!res.success) throw new Error(res.reason);
|
||||
return { data: res.data, error: null };
|
||||
})
|
||||
.catch((e) => {
|
||||
console.error(e);
|
||||
return { data: null, error: e.message };
|
||||
});
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
export default DataConnector;
|
||||
|
||||
@@ -127,6 +127,27 @@ function extensionEndpoints(app) {
|
||||
}
|
||||
}
|
||||
);
|
||||
// Server endpoint for admins and managers that forwards the request body to the
// collector's DrupalWiki extension and relays the collector's answer unchanged.
app.post(
  "/ext/drupalwiki",
  [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],
  async (request, response) => {
    try {
      const collector = new CollectorApi();
      const responseFromProcessor = await collector.forwardExtensionRequest({
        endpoint: "/ext/drupalwiki",
        method: "POST",
        body: request.body,
      });

      // Telemetry is awaited before replying, so a failure here ends up in the
      // catch block and the client receives a 500.
      await Telemetry.sendTelemetry("extension_invoked", {
        type: "drupalwiki",
      });

      response.status(200).json(responseFromProcessor);
    } catch (error) {
      console.error(error);
      response.sendStatus(500).end();
    }
  }
);
|
||||
}
|
||||
|
||||
module.exports = { extensionEndpoints };
|
||||
|
||||
@@ -34,7 +34,7 @@ const { DocumentSyncRun } = require('../models/documentSyncRun.js');
|
||||
continue;
|
||||
}
|
||||
|
||||
if (type === 'link' || type === 'youtube') {
|
||||
if (['link', 'youtube'].includes(type)) {
|
||||
const response = await collector.forwardExtensionRequest({
|
||||
endpoint: "/ext/resync-source-document",
|
||||
method: "POST",
|
||||
@@ -46,7 +46,7 @@ const { DocumentSyncRun } = require('../models/documentSyncRun.js');
|
||||
newContent = response?.content;
|
||||
}
|
||||
|
||||
if (type === 'confluence' || type === 'github' || type === 'gitlab') {
|
||||
if (['confluence', 'github', 'gitlab', 'drupalwiki'].includes(type)) {
|
||||
const response = await collector.forwardExtensionRequest({
|
||||
endpoint: "/ext/resync-source-document",
|
||||
method: "POST",
|
||||
|
||||
@@ -10,7 +10,14 @@ const { Telemetry } = require("./telemetry");
|
||||
const DocumentSyncQueue = {
|
||||
featureKey: "experimental_live_file_sync",
|
||||
// update the validFileTypes and .canWatch properties when adding elements here.
|
||||
validFileTypes: ["link", "youtube", "confluence", "github", "gitlab"],
|
||||
validFileTypes: [
|
||||
"link",
|
||||
"youtube",
|
||||
"confluence",
|
||||
"github",
|
||||
"gitlab",
|
||||
"drupalwiki",
|
||||
],
|
||||
defaultStaleAfter: 604800000,
|
||||
maxRepeatFailures: 5, // How many times a run can fail in a row before pruning.
|
||||
writable: [],
|
||||
@@ -52,6 +59,7 @@ const DocumentSyncQueue = {
|
||||
if (chunkSource.startsWith("confluence://")) return true; // If is a confluence document link
|
||||
if (chunkSource.startsWith("github://")) return true; // If is a GitHub file reference
|
||||
if (chunkSource.startsWith("gitlab://")) return true; // If is a GitLab file reference
|
||||
if (chunkSource.startsWith("drupalwiki://")) return true; // If is a DrupalWiki document link
|
||||
return false;
|
||||
},
|
||||
|
||||
|
||||
Reference in New Issue
Block a user