diff --git a/collector/__tests__/utils/downloadURIToFile/index.test.js b/collector/__tests__/utils/downloadURIToFile/index.test.js
new file mode 100644
index 000000000..167afb923
--- /dev/null
+++ b/collector/__tests__/utils/downloadURIToFile/index.test.js
@@ -0,0 +1,96 @@
+const path = require("path");
+const { SUPPORTED_FILETYPE_CONVERTERS } = require("../../../utils/constants");
+const { mimeToExtension } = require("../../../utils/downloadURIToFile");
+
+/**
+ * Simulates the filename-building logic from downloadURIToFile
+ * to verify extension inference works correctly.
+ */
+function buildFilenameWithExtension(sluggedFilename, contentType) {
+  const existingExt = path.extname(sluggedFilename).toLowerCase();
+  if (!SUPPORTED_FILETYPE_CONVERTERS.hasOwnProperty(existingExt)) {
+    const mimeType = contentType?.toLowerCase()?.split(";")[0]?.trim();
+    const inferredExt = mimeToExtension(mimeType);
+    if (inferredExt) {
+      return sluggedFilename + inferredExt;
+    }
+  }
+  return sluggedFilename;
+}
+
+describe("mimeToExtension", () => {
+  test("returns null for invalid or unknown input", () => {
+    expect(mimeToExtension(null)).toBeNull();
+    expect(mimeToExtension(undefined)).toBeNull();
+    expect(mimeToExtension("application/octet-stream")).toBeNull();
+  });
+
+  test("returns first extension from ACCEPTED_MIMES for known types", () => {
+    expect(mimeToExtension("application/pdf")).toBe(".pdf");
+  });
+});
+
+describe("buildFilenameWithExtension", () => {
+  test("appends .pdf when URL path has no recognized extension (arxiv case)", () => {
+    // Simulates: https://arxiv.org/pdf/2307.10265
+    // slugify produces something like "arxiv.org-pdf-230710265"
+    const filename = "arxiv.org-pdf-230710265";
+    const result = buildFilenameWithExtension(filename, "application/pdf");
+    expect(result).toBe("arxiv.org-pdf-230710265.pdf");
+  });
+
+  test("appends .pdf when URL has numeric-looking extension", () => {
+    // path.extname("arxiv.org-pdf-2307.10265") => ".10265" which is not in SUPPORTED_FILETYPE_CONVERTERS
+    const filename = "arxiv.org-pdf-2307.10265";
+    const result = buildFilenameWithExtension(
+      filename,
+      "application/pdf; charset=utf-8"
+    );
+    expect(result).toBe("arxiv.org-pdf-2307.10265.pdf");
+  });
+
+  test("does NOT append extension when file already has a supported extension", () => {
+    const filename = "example.com-document.pdf";
+    const result = buildFilenameWithExtension(filename, "application/pdf");
+    expect(result).toBe("example.com-document.pdf");
+  });
+
+  test("does NOT append extension when file has .txt extension", () => {
+    const filename = "example.com-readme.txt";
+    const result = buildFilenameWithExtension(filename, "text/plain");
+    expect(result).toBe("example.com-readme.txt");
+  });
+
+  test("does not append extension for unknown content type", () => {
+    const filename = "example.com-binary-blob";
+    const result = buildFilenameWithExtension(
+      filename,
+      "application/octet-stream"
+    );
+    expect(result).toBe("example.com-binary-blob");
+  });
+
+  test("does not append extension when content type is null", () => {
+    const filename = "example.com-unknown";
+    const result = buildFilenameWithExtension(filename, null);
+    expect(result).toBe("example.com-unknown");
+  });
+
+  test("appends .docx for word document MIME type", () => {
+    const filename = "sharepoint.com-documents-report";
+    const result = buildFilenameWithExtension(
+      filename,
+      "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    );
+    expect(result).toBe("sharepoint.com-documents-report.docx");
+  });
+
+  test("handles content type with charset parameter correctly", () => {
+    const filename = "api.example.com-export-data";
+    const result = buildFilenameWithExtension(
+      filename,
+      "text/csv; charset=utf-8"
+    );
+    expect(result).toBe("api.example.com-export-data.csv");
+  });
+});
diff --git a/collector/processLink/helpers/index.js b/collector/processLink/helpers/index.js
index 88b74b2c2..dc00f8f4d 100644
--- a/collector/processLink/helpers/index.js
+++ b/collector/processLink/helpers/index.js
@@ -5,6 +5,16 @@ const { downloadURIToFile } = require("../../utils/downloadURIToFile");
 const { ACCEPTED_MIMES } = require("../../utils/constants");
 const { validYoutubeVideoUrl } = require("../../utils/url");
 
+/**
+ * Parse a Content-Type header value and return the MIME type without charset or other parameters.
+ * @param {string|null} contentTypeHeader - The raw Content-Type header value
+ * @returns {string|null} - The MIME type (e.g., "application/pdf") or null
+ */
+function parseContentType(contentTypeHeader) {
+  if (!contentTypeHeader) return null;
+  return contentTypeHeader.toLowerCase().split(";")[0].trim() || null;
+}
+
 /**
  * Get the content type of a resource
  * - Sends a HEAD request to the URL and returns the Content-Type header with a 5 second timeout
@@ -34,8 +44,9 @@ async function getContentTypeFromURL(url) {
         contentType: null,
       };
 
-    const contentType = res.headers.get("Content-Type")?.toLowerCase();
-    const contentTypeWithoutCharset = contentType?.split(";")[0].trim();
+    const contentTypeWithoutCharset = parseContentType(
+      res.headers.get("Content-Type")
+    );
     if (!contentTypeWithoutCharset)
       return {
         success: false,
@@ -171,6 +182,7 @@ async function processAsFile({ uri, saveAsDocument = true }) {
 }
 
 module.exports = {
+  parseContentType,
   returnResult,
   getContentTypeFromURL,
   determineContentType,
diff --git a/collector/utils/downloadURIToFile/index.js b/collector/utils/downloadURIToFile/index.js
index f7326658e..ce912648a 100644
--- a/collector/utils/downloadURIToFile/index.js
+++ b/collector/utils/downloadURIToFile/index.js
@@ -1,10 +1,31 @@
-const { WATCH_DIRECTORY } = require("../constants");
+const {
+  WATCH_DIRECTORY,
+  ACCEPTED_MIMES,
+  SUPPORTED_FILETYPE_CONVERTERS,
+} = require("../constants");
 const fs = require("fs");
 const path = require("path");
 const { pipeline } = require("stream/promises");
 const { validURL } = require("../url");
 const { default: slugify } = require("slugify");
 
+// Map "/" to "-" so URL path separators are kept as dashes when slugifying.
+// NOTE: slugify.extend changes the character map for every slugify caller in this process.
+slugify.extend({ "/": "-" });
+
+/**
+ * Maps a MIME type to the preferred file extension using ACCEPTED_MIMES.
+ * Returns null if the MIME type is not recognized or if there are no possible extensions.
+ * @param {string} mimeType - The MIME type to resolve (e.g., "application/pdf")
+ * @returns {string|null} - The file extension (e.g., ".pdf") or null
+ */
+function mimeToExtension(mimeType) {
+  if (!mimeType || !ACCEPTED_MIMES.hasOwnProperty(mimeType)) return null;
+  const possibleExtensions = ACCEPTED_MIMES[mimeType] ?? [];
+  if (possibleExtensions.length === 0) return null;
+  return possibleExtensions[0];
+}
+
 /**
  * Download a file to the hotdir
  * @param {string} url - The URL of the file to download
@@ -33,10 +54,30 @@ async function downloadURIToFile(url, maxTimeout = 10_000) {
       .finally(() => clearTimeout(timeout));
 
     const urlObj = new URL(url);
-    const filename = `${urlObj.hostname}-${slugify(
-      urlObj.pathname.replace(/\//g, "-"),
-      { lower: true }
-    )}`;
+    const sluggedPath = slugify(urlObj.pathname, { lower: true });
+    let filename = `${urlObj.hostname}-${sluggedPath}`;
+
+    const existingExt = path.extname(filename).toLowerCase();
+
+    // If the filename does not already have a supported file extension,
+    // try to infer one from the response Content-Type header.
+    // This handles URLs like https://arxiv.org/pdf/2307.10265 where the
+    // path has no explicit extension but the server responds with
+    // Content-Type: application/pdf.
+    if (!SUPPORTED_FILETYPE_CONVERTERS.hasOwnProperty(existingExt)) {
+      // Required lazily: processLink/helpers requires this module at load time,
+      // so a top-level require here would create a circular dependency.
+      const { parseContentType } = require("../../processLink/helpers");
+      const contentType = parseContentType(res.headers.get("Content-Type"));
+      const inferredExt = mimeToExtension(contentType);
+      if (inferredExt) {
+        console.log(
+          `[Collector] URL path has no recognized extension. Inferred ${inferredExt} from Content-Type: ${contentType}`
+        );
+        filename += inferredExt;
+      }
+    }
+
     const localFilePath = path.join(WATCH_DIRECTORY, filename);
     const writeStream = fs.createWriteStream(localFilePath);
     await pipeline(res.body, writeStream);
@@ -51,4 +92,5 @@ async function downloadURIToFile(url, maxTimeout = 10_000) {
 
 module.exports = {
   downloadURIToFile,
+  mimeToExtension,
 };