add jina embedding provider

2026-04-25 17:15:37 +02:00 · 2025-06-05 17:00:52 -07:00
parent ef0928993e
commit 2ef2419056
8 changed files with 292 additions and 7 deletions
--- a/frontend/src/components/EmbeddingSelection/JinaOptions/index.jsx
+++ b/frontend/src/components/EmbeddingSelection/JinaOptions/index.jsx
@@ -0,0 +1,104 @@
+import React, { useState } from "react";
+import { CaretDown, CaretUp } from "@phosphor-icons/react";
+
+/**
+ * Form fields for configuring the Jina AI embedding provider.
+ * @param {{settings: object}} props - saved settings used to prefill the fields.
+ */
+export default function JinaOptions({ settings }) {
+  const [showAdvancedControls, setShowAdvancedControls] = useState(false);
+  const Caret = showAdvancedControls ? CaretUp : CaretDown;
+  return (
+    <div className="w-full flex flex-col gap-y-7">
+      <div className="w-full flex items-center gap-[36px] mt-1.5 flex-wrap">
+        <div className="flex flex-col w-60">
+          <label className="text-white text-sm font-semibold block mb-3">
+            API Key
+          </label>
+          <input
+            type="password"
+            name="JinaApiKey"
+            className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
+            placeholder="Jina API Key"
+            defaultValue={settings?.JinaApiKey ? "*".repeat(20) : ""}
+            required={true}
+            autoComplete="off"
+            spellCheck={false}
+          />
+        </div>
+        <div className="flex flex-col w-60">
+          <label className="text-white text-sm font-semibold block mb-3">
+            Model Preference
+          </label>
+          <select
+            name="EmbeddingModelPref"
+            required={true}
+            defaultValue={settings?.EmbeddingModelPref}
+            className="border-none bg-theme-settings-input-bg border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+          >
+            <optgroup label="Available embedding models">
+              {[
+                "jina-embeddings-v3",
+                "jina-embeddings-v2-base-en",
+                "jina-embeddings-v2-base-zh",
+                "jina-embeddings-v2-base-de",
+                "jina-embeddings-v2-base-es",
+                "jina-embeddings-v2-base-code",
+                "jina-clip-v2",
+                "jina-clip-v1",
+              ].map((model) => (
+                <option key={model} value={model}>
+                  {model}
+                </option>
+              ))}
+            </optgroup>
+          </select>
+        </div>
+      </div>
+      <div className="flex items-center gap-x-3">
+        <button
+          type="button"
+          onClick={() => setShowAdvancedControls((shown) => !shown)}
+          className="flex items-center gap-x-2 text-white text-sm font-semibold"
+        >
+          Advanced Settings
+          <Caret size={16} weight="bold" />
+        </button>
+      </div>
+      {showAdvancedControls && (
+        <div className="flex flex-col gap-y-4">
+          <div className="flex flex-col w-60">
+            <label className="text-white text-sm font-semibold block mb-3">
+              Task Type
+            </label>
+            <input
+              type="text"
+              name="JinaTask"
+              className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
+              placeholder="e.g. retrieval.passage"
+              defaultValue={settings?.JinaTask}
+              autoComplete="off"
+              spellCheck={false}
+            />
+          </div>
+          <div className="flex flex-col w-60">
+            <label className="text-white text-sm font-semibold block mb-3">
+              Max Chunk Length
+            </label>
+            <input
+              type="number"
+              name="EmbeddingModelMaxChunkLength"
+              className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
+              placeholder="8192"
+              min={1}
+              defaultValue={settings?.EmbeddingModelMaxChunkLength || 8192}
+              required={true}
+              autoComplete="off"
+              spellCheck={false}
+            />
+          </div>
+        </div>
+      )}
+    </div>
+  );
+}
--- a/frontend/src/media/embeddingprovider/jina.png
+++ b/frontend/src/media/embeddingprovider/jina.png
--- a/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx
+++ b/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx
@@ -15,6 +15,7 @@ import VoyageAiLogo from "@/media/embeddingprovider/voyageai.png";
 import LiteLLMLogo from "@/media/llmprovider/litellm.png";
 import GenericOpenAiLogo from "@/media/llmprovider/generic-openai.png";
 import MistralAiLogo from "@/media/llmprovider/mistral.jpeg";
+import JinaAiLogo from "@/media/embeddingprovider/jina.png";

 import PreLoader from "@/components/Preloader";
 import ChangeWarningModal from "@/components/ChangeWarning";
@@ -29,6 +30,7 @@ import CohereEmbeddingOptions from "@/components/EmbeddingSelection/CohereOption
 import VoyageAiOptions from "@/components/EmbeddingSelection/VoyageAiOptions";
 import LiteLLMOptions from "@/components/EmbeddingSelection/LiteLLMOptions";
 import GenericOpenAiEmbeddingOptions from "@/components/EmbeddingSelection/GenericOpenAiOptions";
+import JinaOptions from "@/components/EmbeddingSelection/JinaOptions";

 import EmbedderItem from "@/components/EmbeddingSelection/EmbedderItem";
 import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react";
@@ -127,6 +129,13 @@ const EMBEDDERS = [
    ),
    description: "Run embedding models from any OpenAI compatible API service.",
  },
+  {
+    name: "Jina AI",
+    value: "jina",
+    logo: JinaAiLogo,
+    options: (settings) => <JinaOptions settings={settings} />,
+    description: "Run powerful multilingual embedding models from Jina AI.",
+  },
 ];

 export default function GeneralEmbeddingPreference() {
--- a/frontend/src/pages/OnboardingFlow/Steps/DataHandling/index.jsx
+++ b/frontend/src/pages/OnboardingFlow/Steps/DataHandling/index.jsx
@@ -38,6 +38,7 @@ import VoyageAiLogo from "@/media/embeddingprovider/voyageai.png";
 import PPIOLogo from "@/media/llmprovider/ppio.png";
 import PGVectorLogo from "@/media/vectordbs/pgvector.png";
 import DPAISLogo from "@/media/llmprovider/dpais.png";
+import JinaAiLogo from "@/media/embeddingprovider/jina.png";
 import React, { useState, useEffect } from "react";
 import paths from "@/utils/paths";
 import { useNavigate } from "react-router-dom";
@@ -187,6 +188,14 @@ export const LLM_SELECTION_PRIVACY = {
    ],
    logo: GenericOpenAiLogo,
  },
+  jina: {
+    name: "Jina AI",
+    description: [
+      "Your document text is sent to Jina AI's servers for processing",
+      "Your data is handled according to Jina AI's terms of service and privacy policy",
+    ],
+    logo: JinaAiLogo,
+  },
  cohere: {
    name: "Cohere",
    description: [
@@ -393,6 +402,14 @@ export const EMBEDDING_ENGINE_PRIVACY = {
    ],
    logo: GenericOpenAiLogo,
  },
+  jina: {
+    name: "Jina AI",
+    description: [
+      "Your document text is sent to Jina AI's servers for processing",
+      "Your data is handled according to Jina AI's terms of service and privacy policy",
+    ],
+    logo: JinaAiLogo,
+  },
  gemini: {
    name: "Google Gemini",
    description: [
@@ -493,8 +510,10 @@ export default function DataHandling({ setHeader, setForwardBtn, setBackBtn }) {
            </p>
          </div>
          <ul className="flex flex-col list-disc ml-4">
-            {LLMSelection.description.map((desc) => (
-              <li className="text-theme-text-primary text-sm">{desc}</li>
+            {LLMSelection.description.map((desc, index) => (
+              <li key={index} className="text-theme-text-secondary text-sm">
+                {desc}
+              </li>
            ))}
          </ul>
        </div>
@@ -513,8 +532,10 @@ export default function DataHandling({ setHeader, setForwardBtn, setBackBtn }) {
            </p>
          </div>
          <ul className="flex flex-col list-disc ml-4">
-            {EmbeddingEngine.description.map((desc) => (
-              <li className="text-theme-text-primary text-sm">{desc}</li>
+            {EmbeddingEngine.description.map((desc, index) => (
+              <li key={index} className="text-theme-text-secondary text-sm">
+                {desc}
+              </li>
            ))}
          </ul>
        </div>
@@ -534,8 +555,10 @@ export default function DataHandling({ setHeader, setForwardBtn, setBackBtn }) {
            </p>
          </div>
          <ul className="flex flex-col list-disc ml-4">
-            {VectorDb.description.map((desc) => (
-              <li className="text-theme-text-primary text-sm">{desc}</li>
+            {VectorDb.description.map((desc, index) => (
+              <li key={index} className="text-theme-text-secondary text-sm">
+                {desc}
+              </li>
            ))}
          </ul>
        </div>
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@@ -211,6 +211,8 @@ const SystemSettings = {
      EmbeddingModelMaxChunkLength:
        process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH,
      VoyageAiApiKey: !!process.env.VOYAGEAI_API_KEY,
+      JinaApiKey: !!process.env.JINA_API_KEY,
+      JinaTask: process.env.JINA_TASK,
      GenericOpenAiEmbeddingApiKey:
        !!process.env.GENERIC_OPEN_AI_EMBEDDING_API_KEY,
      GenericOpenAiEmbeddingMaxConcurrentChunks:
@@ -336,7 +338,7 @@ const SystemSettings = {
      const updatePromises = [];
      for (const key of Object.keys(updates)) {
        let validatedValue = updates[key];
-        if (this.validations.hasOwnProperty(key)) {
+        if (Object.prototype.hasOwnProperty.call(this.validations, key)) {
          if (this.validations[key].constructor.name === "AsyncFunction") {
            validatedValue = await this.validations[key](updates[key]);
          } else {
--- a/server/utils/EmbeddingEngines/jina/index.js
+++ b/server/utils/EmbeddingEngines/jina/index.js
@@ -0,0 +1,133 @@
+const { toChunks, maximumChunkLength } = require("../../helpers");
+
+/**
+ * Embedding provider that calls the Jina AI REST API (`/v1/embeddings`).
+ * Reads `JINA_API_KEY`, `EMBEDDING_MODEL_PREF` and `JINA_TASK` from the environment.
+ */
+class JinaEmbedder {
+  constructor() {
+    this.basePath = "https://api.jina.ai/v1";
+    this.apiKey = process.env.JINA_API_KEY ?? null;
+    this.model = process.env.EMBEDDING_MODEL_PREF ?? "jina-embeddings-v3";
+    // Optional `task` value; it is only added to request bodies when set.
+    this.task = process.env.JINA_TASK ?? null;
+    this.embeddingMaxChunkLength = maximumChunkLength();
+
+    // this.maxConcurrentChunks is delegated to the getter below.
+    // Refer to your specific model and provider you use this class with to determine a valid maxChunkLength
+    this.log(`Initialized ${this.model}`, {
+      baseURL: this.basePath,
+      maxConcurrentChunks: this.maxConcurrentChunks,
+      embeddingMaxChunkLength: this.embeddingMaxChunkLength,
+    });
+  }
+
+  /**
+   * Writes a message to stdout with a colored `[JinaEmbedder]` prefix.
+   */
+  log(text, ...args) {
+    console.log(`\x1b[36m[JinaEmbedder]\x1b[0m ${text}`, ...args);
+  }
+
+  /**
+   * Maximum number of chunks sent per request. Reads the
+   * `GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS` env variable and falls back
+   * to 500 when it is unset, not a number, or not a positive value.
+   * @returns {number}
+   */
+  get maxConcurrentChunks() {
+    const limit = Number(
+      process.env.GENERIC_OPEN_AI_EMBEDDING_MAX_CONCURRENT_CHUNKS
+    );
+    return limit > 0 ? limit : 500;
+  }
+
+  /**
+   * Embeds a single input and returns its vector.
+   * @param {string|string[]} textInput - for an array, only the first vector is returned.
+   * @returns {Promise<Array>} the embedding, or an empty array when none was produced.
+   */
+  async embedTextInput(textInput) {
+    const result = await this.embedChunks(
+      Array.isArray(textInput) ? textInput : [textInput]
+    );
+    return result?.[0] || [];
+  }
+
+  /**
+   * Embeds all chunks, sending them in batches of `maxConcurrentChunks`.
+   * @param {string[]} textChunks
+   * @returns {Promise<Array[]|null>} one embedding per returned item, or null when no usable data came back.
+   * @throws {Error} when any batch request fails, since the result would be incomplete.
+   */
+  async embedChunks(textChunks = []) {
+    // Each batch is sent as its own request and all batches run concurrently.
+    const batches = toChunks(textChunks, this.maxConcurrentChunks);
+    const embeddingRequests = Array.from(batches, async (chunk) => {
+      // Plain fetch is used because the openai library does not support the Jina API.
+      try {
+        const response = await fetch(`${this.basePath}/embeddings`, {
+          method: "POST",
+          headers: {
+            "Content-Type": "application/json",
+            Authorization: `Bearer ${this.apiKey}`,
+          },
+          body: JSON.stringify({
+            model: this.model,
+            input: chunk,
+            ...(this.task ? { task: this.task } : {}),
+          }),
+        });
+
+        if (!response.ok) {
+          // Jina reports failures as `{ detail }`; the OpenAI-style `{ error }` shape
+          // is kept as a fallback. A non-JSON body must not mask the HTTP status.
+          const body = await response.json().catch(() => null);
+          let message =
+            body?.error?.message || body?.detail || response.statusText;
+          // `detail` is not guaranteed to be a string, so serialize anything else.
+          if (typeof message !== "string") message = JSON.stringify(message);
+          // Caught just below and normalized into this batch's `error` field.
+          throw { type: body?.error?.code || response.status, message };
+        }
+
+        const result = await response.json();
+        return { data: result?.data, error: null };
+      } catch (e) {
+        return {
+          data: [],
+          error: {
+            type: e?.type || "failed_to_embed",
+            message: e?.message || "Failed to embed text",
+          },
+        };
+      }
+    });
+
+    // Any failed batch aborts the whole run; identical errors are reported once.
+    const results = await Promise.all(embeddingRequests);
+    const uniqueErrors = new Set(
+      results
+        .filter((res) => !!res.error)
+        .map(({ error }) => `[${error.type}]: ${error.message}`)
+    );
+    if (uniqueErrors.size > 0) {
+      const summary = Array.from(uniqueErrors).join(", ");
+      throw new Error(`Jina Failed to embed: ${summary}`);
+    }
+
+    // Vectors are only returned when every item carries an `embedding`;
+    // otherwise the result is reported as null.
+    const data = results.map((res) => res?.data || []).flat();
+    return data.length > 0 &&
+      data.every((embd) =>
+        Object.prototype.hasOwnProperty.call(embd, "embedding")
+      )
+      ? data.map((embd) => embd.embedding)
+      : null;
+  }
+}
+
+module.exports = {
+  JinaEmbedder,
+};
--- a/server/utils/helpers/index.js
+++ b/server/utils/helpers/index.js
@@ -260,6 +260,9 @@ function getEmbeddingEngineSelection() {
    case "gemini":
      const { GeminiEmbedder } = require("../EmbeddingEngines/gemini");
      return new GeminiEmbedder();
+    case "jina":
+      const { JinaEmbedder } = require("../EmbeddingEngines/jina");
+      return new JinaEmbedder();
    default:
      return new NativeEmbedder();
  }
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -301,6 +301,16 @@ const KEY_MAPPING = {
    checks: [isNotEmpty],
  },

+  // Jina Embedding Settings
+  JinaApiKey: {
+    envKey: "JINA_API_KEY",
+    checks: [isNotEmpty],
+  },
+  JinaTask: {
+    envKey: "JINA_TASK",
+    checks: [],
+  },
+
  // Generic OpenAI Embedding Settings
  GenericOpenAiEmbeddingApiKey: {
    envKey: "GENERIC_OPEN_AI_EMBEDDING_API_KEY",
@@ -817,6 +827,7 @@ function supportedEmbeddingModel(input = "") {
    "litellm",
    "generic-openai",
    "mistral",
+    "jina",
  ];
  return supported.includes(input)
    ? null