Files
anything-llm/collector/processLink/helpers/index.js
Yitong Li 2f7a818744 fix(collector): infer file extension from Content-Type for URLs without explicit extensions (#5252)
* fix(collector): infer file extension from Content-Type for URLs without explicit extensions

When downloading files from URLs like https://arxiv.org/pdf/2307.10265,
the path has no recognizable file extension. The downloaded file gets
saved without an extension (or with a nonsensical one like .10265),
causing processSingleFile to reject it with 'File extension .10265
not supported for parsing'.

Fix: after downloading, check if the filename has a supported file
extension. If not, inspect the response Content-Type header and map
it to the correct extension using the existing ACCEPTED_MIMES table.

For example, a response with Content-Type: application/pdf will cause
the file to be saved with a .pdf extension, allowing it to be processed
correctly.

Fixes #4513

* small refactor

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2026-03-23 09:40:22 -07:00

191 lines
6.4 KiB
JavaScript

const path = require("path");
const { validURL } = require("../../utils/url");
const { processSingleFile } = require("../../processSingleFile");
const { downloadURIToFile } = require("../../utils/downloadURIToFile");
const { ACCEPTED_MIMES } = require("../../utils/constants");
const { validYoutubeVideoUrl } = require("../../utils/url");
/**
 * Reduce a raw Content-Type header to its bare, lower-cased MIME type.
 * Everything from the first ";" onwards (charset, boundary, ...) is discarded.
 * @param {string|null} contentTypeHeader - The raw Content-Type header value
 * @returns {string|null} - The MIME type (e.g., "application/pdf"), or null when the
 * header is missing or nothing is left once parameters and whitespace are removed
 */
function parseContentType(contentTypeHeader) {
  if (!contentTypeHeader) return null;

  // split() always yields at least one element, so the first segment is always a string.
  const [mimeSegment] = contentTypeHeader.toLowerCase().split(";");
  const mimeType = mimeSegment.trim();
  return mimeType === "" ? null : mimeType;
}
/**
 * Look up the MIME type a URL serves without downloading its body.
 * - Sends a HEAD request that is aborted if it has not settled within 5 seconds
 * - Failures (invalid URL, a response whose `ok` flag is false, missing header, or an
 *   error thrown while fetching) are reported through the result object, not thrown
 * @param {string} url - The URL to get the content type of
 * @returns {Promise<{success: boolean, reason: string|null, contentType: string|null}>} - On
 * success `contentType` holds the parsed MIME type; otherwise `reason` explains the failure
 */
async function getContentTypeFromURL(url) {
  // Every failure path shares this shape; only the reason differs.
  const failed = (reason) => ({ success: false, reason, contentType: null });

  try {
    const isUsableUrl = url && typeof url === "string" && validURL(url);
    if (!isUsableUrl) return failed("Not a valid URL.");

    const controller = new AbortController();
    const abortTimer = setTimeout(() => {
      controller.abort();
      console.error("Timeout fetching content type for URL:", url.toString());
    }, 5_000);

    const headRequest = fetch(url, {
      method: "HEAD",
      signal: controller.signal,
    });
    // Stop the timer as soon as the request settles, whether it succeeded or failed.
    const response = await headRequest.finally(() => clearTimeout(abortTimer));

    if (!response.ok)
      return failed(`HTTP ${response.status}: ${response.statusText}`);

    const mimeType = parseContentType(response.headers.get("Content-Type"));
    if (!mimeType) return failed("No Content-Type found.");

    return { success: true, reason: null, contentType: mimeType };
  } catch (error) {
    return failed(`Error: ${error.message}`);
  }
}
/**
 * Shape a processing result for the caller according to the saveAsDocument flag.
 * - saveAsDocument truthy: the caller receives `{ success, reason, documents }`
 * - saveAsDocument falsy: the caller receives `{ success, content }` only
 * @param {Object} result - The result object to normalize
 * @param {boolean} result.success - Whether the result is successful
 * @param {string|null} result.reason - The reason for the result
 * @param {Object[]} result.documents - The documents from the result
 * @param {string|null} result.content - The content of the result
 * @param {boolean} result.saveAsDocument - Whether to save the content as a document. Default is true
 * @returns {{success: boolean, reason: string|null, documents: Object[]}|{success: boolean, content: string|null}} - The normalized result object
 */
function returnResult({
  success,
  reason,
  documents,
  content,
  saveAsDocument = true,
} = {}) {
  return saveAsDocument
    ? { success, reason, documents }
    : { success, content };
}
/**
 * Determine the content type of a link and how it should be processed.
 * - YouTube video URLs are returned as "text/html" / "youtube" without any network request
 * - Otherwise the content type comes from getContentTypeFromURL; the link is processed as a
 *   "file" only when that type is a key of ACCEPTED_MIMES and is not text/html or text/plain
 * - Any other outcome (unknown type, lookup failure, thrown error) falls back to "web"
 * @param {string} uri - The link to determine the content type of - should be a URL
 * @returns {Promise<{contentType: string|null, processVia: 'web' | 'file' | 'youtube'}>} - The content type of the link
 */
async function determineContentType(uri) {
  // Don't check for content type if it is a YouTube video URL
  if (validYoutubeVideoUrl(uri))
    return { contentType: "text/html", processVia: "youtube" };

  try {
    const { reason, contentType } = await getContentTypeFromURL(uri);
    if (reason) console.error(reason);

    // The content type comes from a remote server, so only own keys of ACCEPTED_MIMES may
    // match. The `in` operator would also accept inherited names such as "constructor".
    const isAcceptedFileType =
      !!contentType &&
      !["text/html", "text/plain"].includes(contentType) &&
      Object.hasOwn(ACCEPTED_MIMES, contentType);

    return { contentType, processVia: isAcceptedFileType ? "file" : "web" };
  } catch (error) {
    console.error("Error getting content type from URL", error);
    return { contentType: null, processVia: "web" };
  }
}
/**
 * Process a link as a file: download it, then parse the downloaded file.
 * @param {Object} options
 * @param {string} options.uri - The link to process as a file
 * @param {boolean} options.saveAsDocument - Whether to save the content as a document. Default is true
 * @returns {Promise<Object>} - With saveAsDocument true: the processSingleFile result on success,
 * or `{success: false, reason, documents: []}` on failure. With saveAsDocument false:
 * `{success, content}` where content is the first document's pageContent, or null on failure.
 */
async function processAsFile({ uri, saveAsDocument = true }) {
  // All failure paths share one shape; returnResult trims it to match the requested mode.
  const fail = (reason) =>
    returnResult({
      success: false,
      reason,
      documents: [],
      content: null,
      saveAsDocument,
    });

  const fileContentResult = await downloadURIToFile(uri);
  if (!fileContentResult.success) return fail(fileContentResult.reason);

  // processSingleFile is given the file's name, not its full path.
  const targetFilename = path.basename(fileContentResult.fileLocation);

  /**
   * If the saveAsDocument is false, we are only interested in the text content
   * and can ignore the file as a document by using `parseOnly` in the options.
   * This will send the file to the Direct Uploads folder instead of the Documents folder.
   * that will be deleted by the cleanup-orphan-documents job that runs frequently. The trade off
   * is that since it still is in FS we can debug its output or even potentially reuse it for other purposes.
   *
   * TODO: Improve this process via a new option that will instantly delete the file after processing
   * if we find we dont need this file ever after processing.
   */
  const processSingleFileResult = await processSingleFile(targetFilename, {
    parseOnly: saveAsDocument === false,
  });
  if (!processSingleFileResult.success)
    return fail(processSingleFileResult.reason);

  // Document mode: hand back the parser's result untouched.
  if (saveAsDocument) return processSingleFileResult;

  // Content-only mode: return just the text of the first parsed document. A successful
  // parse with no documents is reported as a failure rather than throwing a TypeError.
  const [firstDocument] = processSingleFileResult.documents ?? [];
  if (!firstDocument)
    return fail("No content could be extracted from the file.");

  return returnResult({
    success: true,
    content: firstDocument.pageContent,
    saveAsDocument,
  });
}
// Functions exported from this helpers module (CommonJS).
module.exports = {
parseContentType,
returnResult,
getContentTypeFromURL,
determineContentType,
processAsFile,
};