diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index 64f17ec35..e2ec82a3a 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -132,8 +132,9 @@ function writeToServerDocuments({ if (!fs.existsSync(destination)) fs.mkdirSync(destination, { recursive: true }); + const safeFilename = sanitizeFileName(filename); const destinationFilePath = normalizePath( - path.resolve(destination, filename) + ".json" + path.resolve(destination, safeFilename) + ".json" ); fs.writeFileSync(destinationFilePath, JSON.stringify(data, null, 4), { @@ -210,10 +211,19 @@ function normalizePath(filepath = "") { return result; } +/** + * Strips characters that are illegal in Windows filenames, including Unicode + * quotation marks (U+201C, U+201D, etc.) that can get corrupted into ASCII + * double-quotes during charset conversion in the upload pipeline. + * @param {string} fileName - The filename to sanitize. + * @returns {string} - The sanitized filename. + */ function sanitizeFileName(fileName) { if (!fileName) return fileName; - //eslint-disable-next-line - return fileName.replace(/[<>:"\/\\|?*]/g, ""); + return fileName.replace( + /[<>:"/\\|?*\u201C\u201D\u201E\u201F\u2018\u2019\u201A\u201B]/g, + "" + ); } module.exports = { diff --git a/server/utils/chats/apiChatHandler.js b/server/utils/chats/apiChatHandler.js index a0dc13bd9..2770c8770 100644 --- a/server/utils/chats/apiChatHandler.js +++ b/server/utils/chats/apiChatHandler.js @@ -17,7 +17,12 @@ const { Telemetry } = require("../../models/telemetry"); const { CollectorApi } = require("../collectorApi"); const fs = require("fs"); const path = require("path"); -const { hotdirPath, normalizePath, isWithin } = require("../files"); +const { + hotdirPath, + normalizePath, + isWithin, + sanitizeFileName, +} = require("../files"); /** * @typedef ResponseObject * @property {string} id - uuid of response @@ -72,8 +77,8 @@ async function processDocumentAttachments(attachments = []) { if (dataUriMatch) base64Data = dataUriMatch[1]; const buffer = Buffer.from(base64Data, "base64"); - const filename = normalizePath( - attachment.name || `attachment-${uuidv4()}` + const filename = sanitizeFileName( + normalizePath(attachment.name || `attachment-${uuidv4()}`) ); const filePath = normalizePath(path.join(hotdirPath, filename)); if (!isWithin(hotdirPath, filePath)) diff --git a/server/utils/files/index.js b/server/utils/files/index.js index 37fcd4620..740a0869d 100644 --- a/server/utils/files/index.js +++ b/server/utils/files/index.js @@ -284,6 +284,21 @@ function normalizePath(filepath = "") { return result; } +/** + * Strips characters that are illegal in Windows filenames, including Unicode + * quotation marks (U+201C, U+201D, etc.) that can get corrupted into ASCII + * double-quotes during charset conversion in the upload pipeline. + * @param {string} fileName - The filename to sanitize. + * @returns {string} - The sanitized filename. + */ +function sanitizeFileName(fileName) { + if (!fileName) return fileName; + return fileName.replace( + /[<>:"/\\|?*\u201C\u201D\u201E\u201F\u2018\u2019\u201A\u201B]/g, + "" + ); +} + // Check if the vector-cache folder is empty or not // useful for it the user is changing embedders as this will // break the previous cache. @@ -500,4 +515,5 @@ module.exports = { purgeEntireVectorCache, getDocumentsByFolder, hotdirPath, + sanitizeFileName, }; diff --git a/server/utils/files/multer.js b/server/utils/files/multer.js index ee0de4b11..74c0704a8 100644 --- a/server/utils/files/multer.js +++ b/server/utils/files/multer.js @@ -2,7 +2,7 @@ const multer = require("multer"); const path = require("path"); const fs = require("fs"); const { v4 } = require("uuid"); -const { normalizePath } = require("."); +const { normalizePath, sanitizeFileName } = require("."); /** * Handle File uploads for auto-uploading. @@ -17,8 +17,8 @@ const fileUploadStorage = multer.diskStorage({ cb(null, uploadOutput); }, filename: function (_, file, cb) { - file.originalname = normalizePath( - Buffer.from(file.originalname, "latin1").toString("utf8") + file.originalname = sanitizeFileName( + normalizePath(Buffer.from(file.originalname, "latin1").toString("utf8")) ); cb(null, file.originalname); }, @@ -37,8 +37,8 @@ const fileAPIUploadStorage = multer.diskStorage({ cb(null, uploadOutput); }, filename: function (_, file, cb) { - file.originalname = normalizePath( - Buffer.from(file.originalname, "latin1").toString("utf8") + file.originalname = sanitizeFileName( + normalizePath(Buffer.from(file.originalname, "latin1").toString("utf8")) ); cb(null, file.originalname); }, @@ -55,8 +55,8 @@ const assetUploadStorage = multer.diskStorage({ return cb(null, uploadOutput); }, filename: function (_, file, cb) { - file.originalname = normalizePath( - Buffer.from(file.originalname, "latin1").toString("utf8") + file.originalname = sanitizeFileName( + normalizePath(Buffer.from(file.originalname, "latin1").toString("utf8")) ); cb(null, file.originalname); },