Files
anything-llm/server/endpoints/workspacesParsedFiles.js
Timothy Carambat 0fb33736da Workspace Chat with documents overhaul (#4261)
* Create parse endpoint in collector (#4212)

* create parse endpoint in collector

* revert cleanup temp util call

* lint

* remove unused cleanupTempDocuments function

* revert slug change
minor change for destinations

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>

* Add parsed files table and parse server endpoints (#4222)

* add workspace_parsed_files table + parse endpoints/models

* remove dev api parse endpoint

* remove unneeded imports

* iterate over all files + remove unneeded update function + update telemetry debounce

* Upload UI/UX context window check + frontend alert (#4230)

* prompt user to embed if exceeds prompt window + handle embed + handle cancel

* add tokenCountEstimate to workspace_parsed_files + optimizations

* use util for path locations + use safeJsonParse

* add modal for user decision on overflow of context window

* lint

* dynamic fetching of provider/model combo + inject parsed documents

* remove unneeded comments

* popup ui for attaching/removing files + warning to embed + wip fetching states on update

* remove prop drilling, fetch files/limits directly in attach files popup

* rework ux of FE + BE optimizations

* fix ux of FE + BE optimizations

* Implement bidirectional sync for parsed file states
linting
small changes and comments

* move parse support to another endpoint file
simplify calls and loading of records

* button borders

* enable default users to upload parsed files but NOT embed

* delete cascade on user/workspace/thread deletion to remove parsedFileRecord

* enable bgworker with "always" jobs and optional document sync jobs
orphan document job: finds any files with broken references to prevent over-pollution of the storage folder. It runs 10s after boot and every 12hr thereafter

* change run timeout for orphan job to 1m to allow settling before spawning a worker

* linting and cleanup pr

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>

* dev build

* fix tooltip hiding during embedding overflow files

* prevent crash log from ERRNO on parse files

* unused import

* update docs link

* Migrate parsed-files to GET endpoint
patch logic for grabbing models names from utils
better handling for undetermined context windows (null instead of Number.POSITIVE_INFINITY)
UI placeholder for null context windows

* patch URL

---------

Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
2025-08-11 09:26:19 -07:00

200 lines
6.4 KiB
JavaScript

const { reqBody, multiUserMode, userFromSession } = require("../utils/http");
const { handleFileUpload } = require("../utils/files/multer");
const { validatedRequest } = require("../utils/middleware/validatedRequest");
const { Telemetry } = require("../models/telemetry");
const {
flexUserRoleValid,
ROLES,
} = require("../utils/middleware/multiUserProtected");
const { EventLogs } = require("../models/eventLogs");
const { validWorkspaceSlug } = require("../utils/middleware/validWorkspace");
const { CollectorApi } = require("../utils/collectorApi");
const { WorkspaceThread } = require("../models/workspaceThread");
const { WorkspaceParsedFiles } = require("../models/workspaceParsedFiles");
/**
 * Converts a raw ID (route param or request-body value) into an integer.
 *
 * @param {unknown} value - Raw value supplied by the client.
 * @returns {number|null} The parsed integer, or null when the value does not
 *   parse to a safe integer.
 */
function toRecordId(value) {
  const id = Number.parseInt(value, 10);
  return Number.isSafeInteger(id) ? id : null;
}

/**
 * Registers the parsed-file routes (list, delete, embed, parse) on the app.
 *
 * @param {object} app - Express-style application exposing `get`, `post` and
 *   `delete` route registration methods. When falsy nothing is registered.
 * @returns {void}
 */
function workspaceParsedFilesEndpoints(app) {
  if (!app) return;

  // List the parsed files for a workspace (optionally a thread) together with
  // the context window size and the current context token count.
  app.get(
    "/workspace/:slug/parsed-files",
    [validatedRequest, flexUserRoleValid([ROLES.all]), validWorkspaceSlug],
    async (request, response) => {
      try {
        const threadSlug = request.query.threadSlug || null;
        const user = await userFromSession(request, response);
        const workspace = response.locals.workspace;
        // Scope the thread lookup to this workspace and user, matching the
        // lookup done by the parse endpoint, so a slug from another
        // workspace/user cannot be resolved here.
        const thread = threadSlug
          ? await WorkspaceThread.get({
              slug: String(threadSlug),
              workspace_id: workspace.id,
              user_id: user?.id || null,
            })
          : null;
        const { files, contextWindow, currentContextTokenCount } =
          await WorkspaceParsedFiles.getContextMetadataAndLimits(
            workspace,
            thread || null,
            multiUserMode(response) ? user : null
          );
        return response
          .status(200)
          .json({ files, contextWindow, currentContextTokenCount });
      } catch (e) {
        console.error(e.message, e);
        return response.sendStatus(500).end();
      }
    }
  );

  // Delete parsed files by ID. Responds 400 when `fileIds` is not a non-empty
  // array of integer-like values.
  app.delete(
    "/workspace/:slug/delete-parsed-files",
    [validatedRequest, flexUserRoleValid([ROLES.all]), validWorkspaceSlug],
    async function (request, response) {
      try {
        const { fileIds = [] } = reqBody(request);
        if (!Array.isArray(fileIds)) return response.sendStatus(400).end();

        const ids = fileIds.map((id) => toRecordId(id));
        if (ids.length === 0 || ids.includes(null))
          return response.sendStatus(400).end();

        // Restrict the deletion to the workspace resolved from the route so
        // IDs belonging to other workspaces are never removed.
        // NOTE(review): `workspaceId` mirrors the field name passed to
        // WorkspaceParsedFiles.create below - confirm it matches the model's
        // where-clause field.
        const workspace = response.locals.workspace;
        const success = await WorkspaceParsedFiles.delete({
          id: { in: ids },
          workspaceId: workspace.id,
        });
        return response.status(success ? 200 : 500).end();
      } catch (e) {
        console.error(e.message, e);
        return response.sendStatus(500).end();
      }
    }
  );

  // Move a parsed file into the workspace documents and embed it.
  app.post(
    "/workspace/:slug/embed-parsed-file/:fileId",
    [
      validatedRequest,
      // Embed is still an admin/manager only feature
      flexUserRoleValid([ROLES.admin, ROLES.manager]),
      validWorkspaceSlug,
    ],
    async function (request, response) {
      const rawFileId = request.params.fileId;
      const fileId = toRecordId(rawFileId);
      // Reject non-numeric IDs up front so nothing is embedded or deleted.
      if (fileId === null) return response.sendStatus(400).end();

      try {
        const user = await userFromSession(request, response);
        const workspace = response.locals.workspace;
        const { success, error, document } =
          await WorkspaceParsedFiles.moveToDocumentsAndEmbed(
            rawFileId,
            workspace
          );
        if (!success) {
          return response.status(500).json({
            success: false,
            error: error || "Failed to embed file",
          });
        }

        await Telemetry.sendTelemetry("document_embedded");
        await EventLogs.logEvent(
          "document_embedded",
          {
            documentName: document?.name || "unknown",
            workspaceId: workspace.id,
          },
          user?.id
        );
        return response.status(200).json({
          success: true,
          error: null,
          document,
        });
      } catch (e) {
        console.error(e.message, e);
        return response.sendStatus(500).end();
      } finally {
        // The parsed-file record is removed whether or not embedding
        // succeeded. A failure here is only logged: the response has already
        // been sent, and a rejection would otherwise escape the handler.
        try {
          await WorkspaceParsedFiles.delete({ id: fileId });
        } catch (cleanupError) {
          console.error(cleanupError.message, cleanupError);
        }
      }
    }
  );

  // Parse an uploaded file through the collector and record one parsed-file
  // entry per returned document.
  app.post(
    "/workspace/:slug/parse",
    [
      validatedRequest,
      flexUserRoleValid([ROLES.all]),
      handleFileUpload,
      validWorkspaceSlug,
    ],
    async function (request, response) {
      try {
        // Without an uploaded file there is nothing to parse; report it as a
        // client error instead of failing on the destructure below.
        if (!request.file?.originalname) {
          return response.status(400).json({
            success: false,
            error: "No file was uploaded.",
          });
        }

        const user = await userFromSession(request, response);
        const workspace = response.locals.workspace;
        const Collector = new CollectorApi();
        const { originalname } = request.file;
        const processingOnline = await Collector.online();
        if (!processingOnline) {
          return response.status(500).json({
            success: false,
            error: `Document processing API is not online. Document ${originalname} will not be parsed.`,
          });
        }

        const { success, reason, documents } =
          await Collector.parseDocument(originalname);
        if (!success || !documents?.[0]) {
          return response.status(500).json({
            success: false,
            error: reason || "No document returned from collector",
          });
        }

        // Get thread ID if we have a slug
        const { threadSlug = null } = reqBody(request);
        const thread = threadSlug
          ? await WorkspaceThread.get({
              slug: String(threadSlug),
              workspace_id: workspace.id,
              user_id: user?.id || null,
            })
          : null;

        const files = await Promise.all(
          documents.map(async (doc) => {
            // Store everything except the page content as metadata.
            const metadata = { ...doc };
            delete metadata.pageContent;
            const filename = `${originalname}-${doc.id}.json`;
            const { file, error: dbError } = await WorkspaceParsedFiles.create({
              filename,
              workspaceId: workspace.id,
              userId: user?.id || null,
              threadId: thread?.id || null,
              metadata: JSON.stringify(metadata),
              tokenCountEstimate: doc.token_count_estimate || 0,
            });
            if (dbError) throw new Error(dbError);
            return file;
          })
        );

        Collector.log(`Document ${originalname} parsed successfully.`);
        await EventLogs.logEvent(
          "document_uploaded_to_chat",
          {
            documentName: originalname,
            workspace: workspace.slug,
            thread: thread?.name || null,
          },
          user?.id
        );
        return response.status(200).json({
          success: true,
          error: null,
          files,
        });
      } catch (e) {
        console.error(e.message, e);
        return response.sendStatus(500).end();
      }
    }
  );
}
module.exports = { workspaceParsedFilesEndpoints };