/*
 * Mirror of https://github.com/Mintplex-Labs/anything-llm
 * Synced 2026-04-25 17:15:37 +02:00
 * 191 lines, 6.4 KiB, JavaScript
 *
 * Latest commit:
 *
 *   fix(collector): infer file extension from Content-Type for URLs without
 *   explicit extensions
 *
 *   When downloading files from URLs like https://arxiv.org/pdf/2307.10265,
 *   the path has no recognizable file extension. The downloaded file gets
 *   saved without an extension (or with a nonsensical one like .10265),
 *   causing processSingleFile to reject it with
 *   'File extension .10265 not supported for parsing'.
 *
 *   Fix: after downloading, check if the filename has a supported file
 *   extension. If not, inspect the response Content-Type header and map it to
 *   the correct extension using the existing ACCEPTED_MIMES table. For
 *   example, a response with Content-Type: application/pdf will cause the
 *   file to be saved with a .pdf extension, allowing it to be processed
 *   correctly.
 *
 *   Fixes #4513
 *
 *   * small refactor
 *
 *   Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
 */
const path = require("path");
|
|
const { validURL } = require("../../utils/url");
|
|
const { processSingleFile } = require("../../processSingleFile");
|
|
const { downloadURIToFile } = require("../../utils/downloadURIToFile");
|
|
const { ACCEPTED_MIMES } = require("../../utils/constants");
|
|
const { validYoutubeVideoUrl } = require("../../utils/url");
|
|
|
|
/**
 * Extract the bare MIME type from a Content-Type header value.
 * Everything from the first `;` onward (charset and other parameters) is dropped,
 * and the remaining value is lower-cased and trimmed.
 * @param {string|null} contentTypeHeader - The raw Content-Type header value
 * @returns {string|null} - The MIME type (e.g., "application/pdf"), or null when the
 * header is missing, is not a string, or has an empty MIME portion
 */
function parseContentType(contentTypeHeader) {
  // Only a non-empty string can carry a MIME type. Rejecting other values here
  // keeps the documented `string|null` contract instead of throwing a TypeError
  // from calling string methods on e.g. an array of header values.
  if (typeof contentTypeHeader !== "string" || contentTypeHeader === "")
    return null;

  const [mimeType] = contentTypeHeader.toLowerCase().split(";");
  // A header such as "; charset=utf-8" leaves an empty MIME portion -> null.
  return mimeType.trim() || null;
}
|
|
|
|
/**
 * Get the content type of a resource
 * - Sends a HEAD request to the URL and reads the Content-Type header.
 * - The request is aborted, and the timeout is logged, if it has not settled within 5 seconds.
 * - Failures (invalid URL, non-OK response, missing Content-Type, errors thrown while
 *   fetching) are reported through `success: false` and `reason` instead of being thrown.
 * @param {string} url - The URL to get the content type of
 * @returns {Promise<{success: boolean, reason: string|null, contentType: string|null}>} - The content type of the resource
 */
async function getContentTypeFromURL(url) {
  // Every failure path shares the same result shape; only the reason differs.
  const failure = (reason) => ({ success: false, reason, contentType: null });

  try {
    if (!url || typeof url !== "string" || !validURL(url))
      return failure("Not a valid URL.");

    const abortController = new AbortController();
    const timeout = setTimeout(() => {
      abortController.abort();
      console.error("Timeout fetching content type for URL:", url);
    }, 5_000);

    let res;
    try {
      res = await fetch(url, {
        method: "HEAD",
        signal: abortController.signal,
      });
    } finally {
      // Always release the timer so a settled request can never trigger a late
      // abort or a misleading timeout log.
      clearTimeout(timeout);
    }

    if (!res.ok) return failure(`HTTP ${res.status}: ${res.statusText}`);

    const contentTypeWithoutCharset = parseContentType(
      res.headers.get("Content-Type")
    );
    if (!contentTypeWithoutCharset) return failure("No Content-Type found.");

    return {
      success: true,
      reason: null,
      contentType: contentTypeWithoutCharset,
    };
  } catch (error) {
    // Includes the rejection raised by the abort above when the timeout fires.
    return failure(`Error: ${error.message}`);
  }
}
|
|
|
|
/**
 * Normalize a result object based on the saveAsDocument flag.
 * - saveAsDocument truthy (default): returns `{ success, reason, documents }`
 * - saveAsDocument falsy: returns `{ success, content }` - `reason` and `documents` are omitted
 * @param {Object} result - The result object to normalize
 * @param {boolean} result.success - Whether the result is successful
 * @param {string|null} result.reason - The reason for the result
 * @param {Object[]} result.documents - The documents from the result
 * @param {string|null} result.content - The content of the result
 * @param {boolean} result.saveAsDocument - Whether to save the content as a document. Default is true
 * @returns {{success: boolean, reason: string|null, documents: Object[]}|{success: boolean, content: string|null}} - The normalized result object
 */
function returnResult({
  success,
  reason,
  documents,
  content,
  saveAsDocument = true,
} = {}) {
  if (saveAsDocument) return { success, reason, documents };

  // Content-only callers just need the extracted text.
  return { success, content };
}
|
|
|
|
/**
 * Determine the content type of a link - should be a URL
 * - YouTube video URLs are reported as "text/html" / "youtube" without any content type lookup.
 * - Otherwise the content type is looked up and the link is processed as a "file" only when
 *   the type is a key of ACCEPTED_MIMES and is neither text/html nor text/plain.
 * - Everything else, including lookup failures, is processed as "web".
 * @param {string} uri - The link to determine the content type of
 * @returns {Promise<{contentType: string|null, processVia: 'web' | 'file' | 'youtube'}>} - The content type of the link
 */
async function determineContentType(uri) {
  // Dont check for content type if it is a YouTube video URL
  if (validYoutubeVideoUrl(uri))
    return { contentType: "text/html", processVia: "youtube" };

  try {
    const result = await getContentTypeFromURL(uri);
    if (result.reason) console.error(result.reason);

    const { contentType } = result;
    // The content type comes from a remote server, so only own keys of
    // ACCEPTED_MIMES may match. The `in` operator would also accept inherited
    // names such as "constructor" or "__proto__".
    const isAcceptedFile =
      !!contentType &&
      !["text/html", "text/plain"].includes(contentType) &&
      Object.hasOwn(ACCEPTED_MIMES, contentType);

    return { contentType, processVia: isAcceptedFile ? "file" : "web" };
  } catch (error) {
    console.error("Error getting content type from URL", error);
    return { contentType: null, processVia: "web" };
  }
}
|
|
|
|
/**
 * Process a link as a file
 * - Downloads the link, then parses the downloaded file by its base filename.
 * - Download and parse failures are returned through `returnResult`.
 * - When saveAsDocument is falsy only the first parsed document's text is returned;
 *   otherwise the result of `processSingleFile` is returned as-is.
 * @param {Object} params
 * @param {string} params.uri - The link to process as a file
 * @param {boolean} params.saveAsDocument - Whether to save the content as a document. Default is true
 * @returns {Promise<{success: boolean, reason: string|null, documents: Object[]}|{success: boolean, content: string|null}>} - The processed file result, or only its text content when saveAsDocument is falsy
 */
async function processAsFile({ uri, saveAsDocument = true }) {
  // All failure exits share one normalized shape.
  const failure = (reason) =>
    returnResult({
      success: false,
      reason,
      documents: [],
      content: null,
      saveAsDocument,
    });

  const fileContentResult = await downloadURIToFile(uri);
  if (!fileContentResult.success) return failure(fileContentResult.reason);

  const targetFilename = path.basename(fileContentResult.fileLocation);

  /**
   * If saveAsDocument is false, we are only interested in the text content
   * and can ignore the file as a document by using `parseOnly` in the options.
   * This will send the file to the Direct Uploads folder instead of the Documents folder,
   * where it will be deleted by the cleanup-orphan-documents job that runs frequently.
   * The trade off is that since it is still in the FS we can debug its output or even
   * potentially reuse it for other purposes.
   *
   * TODO: Improve this process via a new option that will instantly delete the file after processing
   * if we find we dont need this file ever after processing.
   */
  const processSingleFileResult = await processSingleFile(targetFilename, {
    parseOnly: saveAsDocument === false,
  });
  if (!processSingleFileResult.success)
    return failure(processSingleFileResult.reason);

  // Saved as a document: hand back the parser result untouched.
  if (saveAsDocument) return processSingleFileResult;

  // Only the text content was requested. A successful parse that produced no
  // documents is reported as a failure rather than crashing on `documents[0]`.
  const firstDocument = processSingleFileResult.documents?.[0];
  if (!firstDocument)
    return failure("No content could be extracted from the file.");

  return returnResult({
    success: true,
    content: firstDocument.pageContent,
    saveAsDocument,
  });
}
|
|
|
|
module.exports = {
|
|
parseContentType,
|
|
returnResult,
|
|
getContentTypeFromURL,
|
|
determineContentType,
|
|
processAsFile,
|
|
};
|