mirror of
https://github.com/Mintplex-Labs/anything-llm
synced 2026-04-25 17:15:37 +02:00
* Create parse endpoint in collector (#4212) * create parse endpoint in collector * revert cleanup temp util call * lint * remove unused cleanupTempDocuments function * revert slug change minor change for destinations --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * Add parsed files table and parse server endpoints (#4222) * add workspace_parsed_files table + parse endpoints/models * remove dev api parse endpoint * remove unneeded imports * iterate over all files + remove unneeded update function + update telemetry debounce * Upload UI/UX context window check + frontend alert (#4230) * prompt user to embed if exceeds prompt window + handle embed + handle cancel * add tokenCountEstimate to workspace_parsed_files + optimizations * use util for path locations + use safeJsonParse * add modal for user decision on overflow of context window * lint * dynamic fetching of provider/model combo + inject parsed documents * remove unneeded comments * popup ui for attaching/removing files + warning to embed + wip fetching states on update * remove prop drilling, fetch files/limits directly in attach files popup * rework ux of FE + BE optimizations * fix ux of FE + BE optimizations * Implement bidirectional sync for parsed file states linting small changes and comments * move parse support to another endpoint file simplify calls and loading of records * button borders * enable default users to upload parsed files but NOT embed * delete cascade on user/workspace/thread deletion to remove parsedFileRecord * enable bgworker with "always" jobs and optional document sync jobs orphan document job: Will find any broken reference files to prevent overpollution of the storage folder. 
This will run 10s after boot and every 12hr after * change run timeout for orphan job to 1m to allow settling before spawning a worker * linting and cleanup pr --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com> * dev build * fix tooltip hiding during embedding overflow files * prevent crash log from ERRNO on parse files * unused import * update docs link * Migrate parsed-files to GET endpoint patch logic for grabbing model names from utils better handling for undetermined context windows (null instead of POSITIVE_INFINITY) UI placeholder for null context windows * patch URL --------- Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
200 lines
6.4 KiB
JavaScript
200 lines
6.4 KiB
JavaScript
const { reqBody, multiUserMode, userFromSession } = require("../utils/http");
|
|
const { handleFileUpload } = require("../utils/files/multer");
|
|
const { validatedRequest } = require("../utils/middleware/validatedRequest");
|
|
const { Telemetry } = require("../models/telemetry");
|
|
const {
|
|
flexUserRoleValid,
|
|
ROLES,
|
|
} = require("../utils/middleware/multiUserProtected");
|
|
const { EventLogs } = require("../models/eventLogs");
|
|
const { validWorkspaceSlug } = require("../utils/middleware/validWorkspace");
|
|
const { CollectorApi } = require("../utils/collectorApi");
|
|
const { WorkspaceThread } = require("../models/workspaceThread");
|
|
const { WorkspaceParsedFiles } = require("../models/workspaceParsedFiles");
|
|
|
|
/**
 * Registers the HTTP routes that manage a workspace's parsed files
 * (listing, deleting, embedding and parsing uploads).
 *
 * Routes registered:
 *  - GET    /workspace/:slug/parsed-files
 *  - DELETE /workspace/:slug/delete-parsed-files
 *  - POST   /workspace/:slug/embed-parsed-file/:fileId
 *  - POST   /workspace/:slug/parse
 *
 * Every handler reads the workspace from `response.locals.workspace` and
 * answers unexpected errors with a bare 500 after logging them.
 *
 * @param {object} app - Express-style application exposing `get`, `post` and
 *   `delete`. When falsy the function returns without registering anything.
 * @returns {void}
 */
function workspaceParsedFilesEndpoints(app) {
  if (!app) return;

  // Returns the parsed files plus context-window numbers for the workspace
  // (and optionally a thread given via the `threadSlug` query parameter).
  app.get(
    "/workspace/:slug/parsed-files",
    [validatedRequest, flexUserRoleValid([ROLES.all]), validWorkspaceSlug],
    async (request, response) => {
      try {
        const threadSlug = request.query.threadSlug || null;
        const user = await userFromSession(request, response);
        const workspace = response.locals.workspace;
        // Scope the lookup to the workspace in the URL, matching how the
        // parse route resolves its thread, so a slug belonging to a
        // different workspace does not resolve here.
        const thread = threadSlug
          ? await WorkspaceThread.get({
              slug: String(threadSlug),
              workspace_id: workspace.id,
            })
          : null;
        const { files, contextWindow, currentContextTokenCount } =
          await WorkspaceParsedFiles.getContextMetadataAndLimits(
            workspace,
            thread || null,
            multiUserMode(response) ? user : null
          );

        return response
          .status(200)
          .json({ files, contextWindow, currentContextTokenCount });
      } catch (e) {
        console.error(e.message, e);
        return response.sendStatus(500).end();
      }
    }
  );

  // Deletes the parsed-file records whose ids are listed in `fileIds`.
  // NOTE(review): the delete clause is not restricted to the workspace in the
  // URL - confirm whether ownership should be enforced here.
  app.delete(
    "/workspace/:slug/delete-parsed-files",
    [validatedRequest, flexUserRoleValid([ROLES.all]), validWorkspaceSlug],
    async function (request, response) {
      try {
        const { fileIds = [] } = reqBody(request);
        // Malformed input is a client error, not a server failure.
        if (!Array.isArray(fileIds) || fileIds.length === 0)
          return response.sendStatus(400).end();

        const ids = fileIds.map((id) => Number.parseInt(id, 10));
        if (ids.some((id) => Number.isNaN(id)))
          return response.sendStatus(400).end();

        const success = await WorkspaceParsedFiles.delete({
          id: { in: ids },
        });
        return response.status(success ? 200 : 500).end();
      } catch (e) {
        console.error(e.message, e);
        return response.sendStatus(500).end();
      }
    }
  );

  // Moves a parsed file into the workspace documents and embeds it.
  // NOTE(review): `fileId` is not checked against the workspace in the URL -
  // confirm whether `moveToDocumentsAndEmbed` enforces that.
  app.post(
    "/workspace/:slug/embed-parsed-file/:fileId",
    [
      validatedRequest,
      // Embed is still an admin/manager only feature
      flexUserRoleValid([ROLES.admin, ROLES.manager]),
      validWorkspaceSlug,
    ],
    async function (request, response) {
      const { fileId = null } = request.params;
      const parsedFileId = Number.parseInt(fileId, 10);
      const hasValidFileId = Number.isInteger(parsedFileId);

      try {
        // Reject missing or non-numeric ids up front instead of letting NaN
        // reach the model layer.
        if (!hasValidFileId) return response.sendStatus(400).end();

        const user = await userFromSession(request, response);
        const workspace = response.locals.workspace;
        const { success, error, document } =
          await WorkspaceParsedFiles.moveToDocumentsAndEmbed(fileId, workspace);

        if (!success) {
          return response.status(500).json({
            success: false,
            error: error || "Failed to embed file",
          });
        }

        await Telemetry.sendTelemetry("document_embedded");
        await EventLogs.logEvent(
          "document_embedded",
          {
            documentName: document?.name || "unknown",
            workspaceId: workspace.id,
          },
          user?.id
        );

        return response.status(200).json({
          success: true,
          error: null,
          document,
        });
      } catch (e) {
        console.error(e.message, e);
        return response.sendStatus(500).end();
      } finally {
        // The parsed-file record is removed whether or not the embed
        // succeeded. By this point a response has already been sent, so a
        // cleanup failure is only logged rather than left as an unhandled
        // rejection.
        if (hasValidFileId) {
          try {
            await WorkspaceParsedFiles.delete({ id: parsedFileId });
          } catch (cleanupError) {
            console.error(cleanupError.message, cleanupError);
          }
        }
      }
    }
  );

  // Sends an uploaded file to the collector for parsing and stores one
  // parsed-file record per returned document.
  app.post(
    "/workspace/:slug/parse",
    [
      validatedRequest,
      flexUserRoleValid([ROLES.all]),
      handleFileUpload,
      validWorkspaceSlug,
    ],
    async function (request, response) {
      try {
        // Without an uploaded file there is nothing to parse; answer with a
        // client error instead of failing on the destructure below.
        if (!request.file?.originalname) {
          return response.status(400).json({
            success: false,
            error: "No file was uploaded.",
          });
        }

        const user = await userFromSession(request, response);
        const workspace = response.locals.workspace;
        const Collector = new CollectorApi();
        const { originalname } = request.file;
        const processingOnline = await Collector.online();

        if (!processingOnline) {
          return response.status(500).json({
            success: false,
            error: `Document processing API is not online. Document ${originalname} will not be parsed.`,
          });
        }

        const { success, reason, documents } =
          await Collector.parseDocument(originalname);
        if (!success || !documents?.[0]) {
          return response.status(500).json({
            success: false,
            error: reason || "No document returned from collector",
          });
        }

        // Get thread ID if we have a slug
        const { threadSlug = null } = reqBody(request);
        const thread = threadSlug
          ? await WorkspaceThread.get({
              slug: String(threadSlug),
              workspace_id: workspace.id,
              user_id: user?.id || null,
            })
          : null;

        const files = await Promise.all(
          documents.map(async (doc) => {
            // Store everything except the page content as metadata.
            const { pageContent: _pageContent, ...metadata } = doc;
            const filename = `${originalname}-${doc.id}.json`;
            const { file, error: dbError } = await WorkspaceParsedFiles.create({
              filename,
              workspaceId: workspace.id,
              userId: user?.id || null,
              threadId: thread?.id || null,
              metadata: JSON.stringify(metadata),
              tokenCountEstimate: doc.token_count_estimate || 0,
            });

            if (dbError) throw new Error(dbError);
            return file;
          })
        );

        Collector.log(`Document ${originalname} parsed successfully.`);
        await EventLogs.logEvent(
          "document_uploaded_to_chat",
          {
            documentName: originalname,
            workspace: workspace.slug,
            thread: thread?.name || null,
          },
          user?.id
        );

        return response.status(200).json({
          success: true,
          error: null,
          files,
        });
      } catch (e) {
        console.error(e.message, e);
        return response.sendStatus(500).end();
      }
    }
  );
}
|
|
|
|
module.exports = { workspaceParsedFilesEndpoints };
|