// anything-llm/collector/processLink/helpers/index.js

const path = require("path");
const { validURL } = require("../../utils/url");
const { processSingleFile } = require("../../processSingleFile");
const { downloadURIToFile } = require("../../utils/downloadURIToFile");
const { ACCEPTED_MIMES } = require("../../utils/constants");
const { validYoutubeVideoUrl } = require("../../utils/url");

/**
 * Extract the bare MIME type from a Content-Type header value.
 * The value is lowercased, everything from the first ";" onward (charset and
 * other parameters) is dropped, and surrounding whitespace is trimmed.
 * @param {string|null} contentTypeHeader - The raw Content-Type header value
 * @returns {string|null} - The MIME type (e.g., "application/pdf"), or null when the header is empty or has no MIME part
 */
function parseContentType(contentTypeHeader) {
  if (!contentTypeHeader) return null;

  const [rawMimeType] = contentTypeHeader.toLowerCase().split(";");
  const mimeType = rawMimeType.trim();
  return mimeType === "" ? null : mimeType;
}

/**
 * Look up the content type a URL is served with.
 * - Sends a HEAD request and reads the Content-Type response header.
 * - The request is aborted (and a timeout message is logged) after 5 seconds.
 * - Failures (invalid URL, non-OK status, missing header, or an error thrown
 *   while requesting) are reported via `success: false` plus a `reason`.
 * @param {string} url - The URL to get the content type of
 * @returns {Promise<{success: boolean, reason: string|null, contentType: string|null}>} - The content type of the resource
 */
async function getContentTypeFromURL(url) {
  // All unsuccessful outcomes share the same shape; only the reason differs.
  const failure = (reason) => ({ success: false, reason, contentType: null });

  try {
    const isUsableURL = !!url && typeof url === "string" && validURL(url);
    if (!isUsableURL) return failure("Not a valid URL.");

    const controller = new AbortController();
    const timer = setTimeout(() => {
      controller.abort();
      console.error("Timeout fetching content type for URL:", url.toString());
    }, 5_000);

    let response;
    try {
      response = await fetch(url, {
        method: "HEAD",
        signal: controller.signal,
      });
    } finally {
      // Stop the timer whether the request settled successfully or not.
      clearTimeout(timer);
    }

    if (!response.ok)
      return failure(`HTTP ${response.status}: ${response.statusText}`);

    const mimeType = parseContentType(response.headers.get("Content-Type"));
    if (!mimeType) return failure("No Content-Type found.");

    return { success: true, reason: null, contentType: mimeType };
  } catch (error) {
    return failure(`Error: ${error.message}`);
  }
}

/**
 * Shape a processing result for the caller based on the saveAsDocument flag.
 * - saveAsDocument truthy: only `success`, `reason` and `documents` are returned.
 * - saveAsDocument falsy: only `success` and `content` are returned (the reason
 *   and documents are not included).
 * @param {Object} result - The result object to normalize
 * @param {boolean} result.success - Whether the result is successful
 * @param {string|null} result.reason - The reason for the result
 * @param {Object[]} result.documents - The documents from the result
 * @param {string|null} result.content - The content of the result
 * @param {boolean} result.saveAsDocument - Whether to save the content as a document. Default is true
 * @returns {{success: boolean, reason: string|null, documents: Object[]}|{success: boolean, content: string|null}} - The normalized result object
 */
function returnResult({
  success,
  reason,
  documents,
  content,
  saveAsDocument = true,
} = {}) {
  if (saveAsDocument) return { success, reason, documents };
  return { success, content };
}

/**
 * Determine the content type of a link and how it should be processed.
 * - YouTube video URLs are reported as "text/html" / "youtube" without any request being made.
 * - Otherwise the content type is looked up via getContentTypeFromURL. The link is
 *   processed as a "file" only when that type is not text/html or text/plain and is
 *   a key of ACCEPTED_MIMES; in every other case (including lookup failures) it is
 *   processed as "web".
 * @param {string} uri - The link to determine the content type of - should be a URL
 * @returns {Promise<{contentType: string|null, processVia: 'web' | 'file' | 'youtube'}>} - The content type of the link
 */
async function determineContentType(uri) {
  // Dont check for content type if it is a YouTube video URL
  if (validYoutubeVideoUrl(uri))
    return { contentType: "text/html", processVia: "youtube" };

  try {
    const { reason, contentType } = await getContentTypeFromURL(uri);
    if (reason) console.error(reason);

    // The content type comes from a remote server, so only own keys of ACCEPTED_MIMES
    // may match. The `in` operator would also accept inherited names such as
    // "constructor" or "__proto__" and wrongly route them to file processing.
    const isProcessableFile =
      !!contentType &&
      !["text/html", "text/plain"].includes(contentType) &&
      Object.hasOwn(ACCEPTED_MIMES, contentType);

    return { contentType, processVia: isProcessableFile ? "file" : "web" };
  } catch (error) {
    console.error("Error getting content type from URL", error);
    return { contentType: null, processVia: "web" };
  }
}

/**
 * Download a link to a local file and run it through processSingleFile.
 * - On any failure the result is shaped by returnResult for the given saveAsDocument mode.
 * - When saveAsDocument is truthy the processSingleFile result is returned as-is.
 * - When saveAsDocument is falsy only the first document's pageContent is returned as `content`.
 * @param {Object} params - The options object
 * @param {string} params.uri - The link to process as a file
 * @param {boolean} [params.saveAsDocument=true] - Whether to save the content as a document. Default is true
 * @returns {Promise<{success: boolean, reason: string|null, documents: Object[]}|{success: boolean, content: string|null}>} - The processed documents, or the text content of the file
 */
async function processAsFile({ uri, saveAsDocument = true }) {
  // Every failure path goes through returnResult so its shape matches the requested mode.
  const failure = (reason) =>
    returnResult({
      success: false,
      reason,
      documents: [],
      content: null,
      saveAsDocument,
    });

  const fileContentResult = await downloadURIToFile(uri);
  if (!fileContentResult.success) return failure(fileContentResult.reason);

  // processSingleFile is given the bare filename, not the full download path.
  const targetFilename = path.basename(fileContentResult.fileLocation);

  /**
   * If saveAsDocument is false, we are only interested in the text content
   * and can ignore the file as a document by using `parseOnly` in the options.
   * This will send the file to the Direct Uploads folder instead of the Documents folder,
   * where it will be deleted by the cleanup-orphan-documents job that runs frequently. The trade off
   * is that since it still is in FS we can debug its output or even potentially reuse it for other purposes.
   *
   * TODO: Improve this process via a new option that will instantly delete the file after processing
   * if we find we dont need this file ever after processing.
   */
  const processSingleFileResult = await processSingleFile(targetFilename, {
    parseOnly: saveAsDocument === false,
  });
  if (!processSingleFileResult.success)
    return failure(processSingleFileResult.reason);

  // Saving as a document: hand back the processor result untouched.
  if (saveAsDocument) return processSingleFileResult;

  // Text-only mode: return just the content of the first parsed document. The file
  // itself is not deleted here. Guard against a "successful" result that carries no
  // documents so callers get a failure result instead of a TypeError.
  const firstDocument = processSingleFileResult.documents?.[0];
  if (!firstDocument)
    return failure("No document content was returned for the processed file.");

  return returnResult({
    success: true,
    content: firstDocument.pageContent,
    saveAsDocument,
  });
}

// Helpers exported from this module for link processing.
module.exports = {
  parseContentType,
  returnResult,
  getContentTypeFromURL,
  determineContentType,
  processAsFile,
};