mirror of
https://github.com/Mintplex-Labs/anything-llm
synced 2026-04-25 17:15:37 +02:00
fix(collector): infer file extension from Content-Type for URLs without explicit extensions (#5252)
* fix(collector): infer file extension from Content-Type for URLs without explicit extensions When downloading files from URLs like https://arxiv.org/pdf/2307.10265, the path has no recognizable file extension. The downloaded file gets saved without an extension (or with a nonsensical one like .10265), causing processSingleFile to reject it with 'File extension .10265 not supported for parsing'. Fix: after downloading, check if the filename has a supported file extension. If not, inspect the response Content-Type header and map it to the correct extension using the existing ACCEPTED_MIMES table. For example, a response with Content-Type: application/pdf will cause the file to be saved with a .pdf extension, allowing it to be processed correctly. Fixes #4513 * small refactor --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
96
collector/__tests__/utils/downloadURIToFile/index.test.js
Normal file
96
collector/__tests__/utils/downloadURIToFile/index.test.js
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
const path = require("path");
|
||||||
|
const { SUPPORTED_FILETYPE_CONVERTERS } = require("../../../utils/constants");
|
||||||
|
const { mimeToExtension } = require("../../../utils/downloadURIToFile");
|
||||||
|
|
||||||
|
/**
 * Test helper that mirrors the filename-building step of downloadURIToFile
 * so extension inference can be verified without performing a download.
 *
 * If `sluggedFilename` already ends in an extension that is a key of
 * SUPPORTED_FILETYPE_CONVERTERS it is returned untouched. Otherwise the
 * Content-Type value is reduced to its bare MIME type and mapped to an
 * extension via mimeToExtension; when a mapping exists it is appended.
 *
 * @param {string} sluggedFilename - The slugified filename derived from the URL
 * @param {string|null|undefined} contentType - Raw Content-Type header value
 * @returns {string} - The filename, with an inferred extension appended when applicable
 */
function buildFilenameWithExtension(sluggedFilename, contentType) {
  const existingExt = path.extname(sluggedFilename).toLowerCase();

  // Call hasOwnProperty through the prototype so the check does not depend
  // on the lookup table inheriting (or not shadowing) that method.
  const alreadySupported = Object.prototype.hasOwnProperty.call(
    SUPPORTED_FILETYPE_CONVERTERS,
    existingExt
  );
  if (alreadySupported) return sluggedFilename;

  // Drop parameters such as "; charset=utf-8" and normalize casing.
  // Non-string values are treated as "no content type" instead of throwing.
  const mimeType =
    typeof contentType === "string"
      ? contentType.toLowerCase().split(";")[0].trim() || null
      : null;

  const inferredExt = mimeToExtension(mimeType);
  return inferredExt ? `${sluggedFilename}${inferredExt}` : sluggedFilename;
}
|
||||||
|
|
||||||
|
describe("mimeToExtension", () => {
  test("returns null for invalid or unknown input", () => {
    // Nullish values and a MIME type expected to have no mapping must all
    // resolve to null.
    const rejectedInputs = [null, undefined, "application/octet-stream"];
    for (const input of rejectedInputs) {
      expect(mimeToExtension(input)).toBeNull();
    }
  });

  test("returns first extension from ACCEPTED_MIMES for known types", () => {
    const resolved = mimeToExtension("application/pdf");
    expect(resolved).toBe(".pdf");
  });
});
|
||||||
|
|
||||||
|
describe("buildFilenameWithExtension", () => {
  test("appends .pdf when URL path has no recognized extension (arxiv case)", () => {
    // Models https://arxiv.org/pdf/2307.10265, whose slug looks roughly
    // like "arxiv.org-pdf-230710265".
    const slug = "arxiv.org-pdf-230710265";
    expect(buildFilenameWithExtension(slug, "application/pdf")).toBe(
      "arxiv.org-pdf-230710265.pdf"
    );
  });

  test("appends .pdf when URL has numeric-looking extension", () => {
    // path.extname yields ".10265" for this slug, which is not a key of
    // SUPPORTED_FILETYPE_CONVERTERS, so inference must still kick in.
    const slug = "arxiv.org-pdf-2307.10265";
    const headerValue = "application/pdf; charset=utf-8";
    expect(buildFilenameWithExtension(slug, headerValue)).toBe(
      "arxiv.org-pdf-2307.10265.pdf"
    );
  });

  test("does NOT append extension when file already has a supported extension", () => {
    const slug = "example.com-document.pdf";
    expect(buildFilenameWithExtension(slug, "application/pdf")).toBe(
      "example.com-document.pdf"
    );
  });

  test("does NOT append extension when file has .txt extension", () => {
    const slug = "example.com-readme.txt";
    expect(buildFilenameWithExtension(slug, "text/plain")).toBe(
      "example.com-readme.txt"
    );
  });

  test("does not append extension for unknown content type", () => {
    const slug = "example.com-binary-blob";
    const headerValue = "application/octet-stream";
    expect(buildFilenameWithExtension(slug, headerValue)).toBe(
      "example.com-binary-blob"
    );
  });

  test("does not append extension when content type is null", () => {
    const slug = "example.com-unknown";
    expect(buildFilenameWithExtension(slug, null)).toBe("example.com-unknown");
  });

  test("appends .docx for word document MIME type", () => {
    const slug = "sharepoint.com-documents-report";
    const headerValue =
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    expect(buildFilenameWithExtension(slug, headerValue)).toBe(
      "sharepoint.com-documents-report.docx"
    );
  });

  test("handles content type with charset parameter correctly", () => {
    const slug = "api.example.com-export-data";
    const headerValue = "text/csv; charset=utf-8";
    expect(buildFilenameWithExtension(slug, headerValue)).toBe(
      "api.example.com-export-data.csv"
    );
  });
});
|
||||||
@@ -5,6 +5,16 @@ const { downloadURIToFile } = require("../../utils/downloadURIToFile");
|
|||||||
const { ACCEPTED_MIMES } = require("../../utils/constants");
|
const { ACCEPTED_MIMES } = require("../../utils/constants");
|
||||||
const { validYoutubeVideoUrl } = require("../../utils/url");
|
const { validYoutubeVideoUrl } = require("../../utils/url");
|
||||||
|
|
||||||
|
/**
 * Parse a Content-Type header value and return the bare MIME type,
 * lower-cased and stripped of charset or any other parameters.
 *
 * Anything that is not a non-empty string (null, undefined, numbers, arrays)
 * yields null instead of throwing, as does a header whose MIME portion is
 * empty after trimming (e.g. "; charset=utf-8").
 *
 * @param {string|null} contentTypeHeader - The raw Content-Type header value
 * @returns {string|null} - The MIME type (e.g., "application/pdf") or null
 */
function parseContentType(contentTypeHeader) {
  // Guard on type, not just truthiness: a truthy non-string has no
  // toLowerCase() and would otherwise raise a TypeError.
  if (typeof contentTypeHeader !== "string") return null;

  // Everything after the first ";" is a parameter list and is discarded.
  const [mimePortion] = contentTypeHeader.toLowerCase().split(";");
  return mimePortion.trim() || null;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the content type of a resource
|
* Get the content type of a resource
|
||||||
* - Sends a HEAD request to the URL and returns the Content-Type header with a 5 second timeout
|
* - Sends a HEAD request to the URL and returns the Content-Type header with a 5 second timeout
|
||||||
@@ -34,8 +44,9 @@ async function getContentTypeFromURL(url) {
|
|||||||
contentType: null,
|
contentType: null,
|
||||||
};
|
};
|
||||||
|
|
||||||
const contentType = res.headers.get("Content-Type")?.toLowerCase();
|
const contentTypeWithoutCharset = parseContentType(
|
||||||
const contentTypeWithoutCharset = contentType?.split(";")[0].trim();
|
res.headers.get("Content-Type")
|
||||||
|
);
|
||||||
if (!contentTypeWithoutCharset)
|
if (!contentTypeWithoutCharset)
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
@@ -171,6 +182,7 @@ async function processAsFile({ uri, saveAsDocument = true }) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
parseContentType,
|
||||||
returnResult,
|
returnResult,
|
||||||
getContentTypeFromURL,
|
getContentTypeFromURL,
|
||||||
determineContentType,
|
determineContentType,
|
||||||
|
|||||||
@@ -1,10 +1,26 @@
|
|||||||
const { WATCH_DIRECTORY } = require("../constants");
|
const { WATCH_DIRECTORY, ACCEPTED_MIMES } = require("../constants");
|
||||||
const fs = require("fs");
|
const fs = require("fs");
|
||||||
const path = require("path");
|
const path = require("path");
|
||||||
const { pipeline } = require("stream/promises");
|
const { pipeline } = require("stream/promises");
|
||||||
const { validURL } = require("../url");
|
const { validURL } = require("../url");
|
||||||
const { default: slugify } = require("slugify");
|
const { default: slugify } = require("slugify");
|
||||||
|
|
||||||
|
// Extend slugify so that "/" is replaced with "-", keeping URL path segments separated in the slug.
|
||||||
|
slugify.extend({ "/": "-" });
|
||||||
|
|
||||||
|
/**
 * Maps a MIME type to the preferred file extension using ACCEPTED_MIMES.
 *
 * The preferred extension is the first entry of the array stored under the
 * MIME type. Returns null when the MIME type is empty or not a string, is
 * not an own key of ACCEPTED_MIMES, or maps to something other than a
 * non-empty array.
 *
 * The value is looked up as given; callers are expected to pass an already
 * normalized MIME type (lower-cased, without parameters).
 *
 * @param {string} mimeType - The MIME type to resolve (e.g., "application/pdf")
 * @returns {string|null} - The file extension (e.g., ".pdf") or null
 */
function mimeToExtension(mimeType) {
  if (typeof mimeType !== "string" || mimeType.length === 0) return null;

  // Own-property check via the prototype: inherited names such as
  // "toString" must never match, and the table itself is not trusted to
  // expose hasOwnProperty.
  if (!Object.prototype.hasOwnProperty.call(ACCEPTED_MIMES, mimeType)) {
    return null;
  }

  const possibleExtensions = ACCEPTED_MIMES[mimeType];
  if (!Array.isArray(possibleExtensions) || possibleExtensions.length === 0) {
    return null;
  }
  return possibleExtensions[0];
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Download a file to the hotdir
|
* Download a file to the hotdir
|
||||||
* @param {string} url - The URL of the file to download
|
* @param {string} url - The URL of the file to download
|
||||||
@@ -33,10 +49,29 @@ async function downloadURIToFile(url, maxTimeout = 10_000) {
|
|||||||
.finally(() => clearTimeout(timeout));
|
.finally(() => clearTimeout(timeout));
|
||||||
|
|
||||||
const urlObj = new URL(url);
|
const urlObj = new URL(url);
|
||||||
const filename = `${urlObj.hostname}-${slugify(
|
const sluggedPath = slugify(urlObj.pathname, { lower: true });
|
||||||
urlObj.pathname.replace(/\//g, "-"),
|
let filename = `${urlObj.hostname}-${sluggedPath}`;
|
||||||
{ lower: true }
|
|
||||||
)}`;
|
const existingExt = path.extname(filename).toLowerCase();
|
||||||
|
const { SUPPORTED_FILETYPE_CONVERTERS } = require("../constants");
|
||||||
|
|
||||||
|
// If the filename does not already have a supported file extension,
|
||||||
|
// try to infer one from the response Content-Type header.
|
||||||
|
// This handles URLs like https://arxiv.org/pdf/2307.10265 where the
|
||||||
|
// path has no explicit extension but the server responds with
|
||||||
|
// Content-Type: application/pdf.
|
||||||
|
if (!SUPPORTED_FILETYPE_CONVERTERS.hasOwnProperty(existingExt)) {
|
||||||
|
const { parseContentType } = require("../../processLink/helpers");
|
||||||
|
const contentType = parseContentType(res.headers.get("Content-Type"));
|
||||||
|
const inferredExt = mimeToExtension(contentType);
|
||||||
|
if (inferredExt) {
|
||||||
|
console.log(
|
||||||
|
`[Collector] URL path has no recognized extension. Inferred ${inferredExt} from Content-Type: ${contentType}`
|
||||||
|
);
|
||||||
|
filename += inferredExt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const localFilePath = path.join(WATCH_DIRECTORY, filename);
|
const localFilePath = path.join(WATCH_DIRECTORY, filename);
|
||||||
const writeStream = fs.createWriteStream(localFilePath);
|
const writeStream = fs.createWriteStream(localFilePath);
|
||||||
await pipeline(res.body, writeStream);
|
await pipeline(res.body, writeStream);
|
||||||
@@ -51,4 +86,5 @@ async function downloadURIToFile(url, maxTimeout = 10_000) {
|
|||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
downloadURIToFile,
|
downloadURIToFile,
|
||||||
|
mimeToExtension,
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user