fix: Use eval_duration for output TPS calculations in Ollama LLM provider (#4568)

* fix: Use eval_duration for output TPS calculations and add as a metric field

* refactor usage of eval_duration from ollama metrics

* move eval_duration to usage

* overwrite duration in Ollama provider (WIP: make duration an optional param of measureAsyncFunction)

* allow the calculated duration to be overridden in measureAsyncFunction

* simplify flow for duration tracking

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
jonathanortega2023
2025-11-20 15:02:47 -06:00
committed by GitHub
parent cf76bad452
commit 7a0c149d2e
2 changed files with 13 additions and 6 deletions

View File

@@ -263,6 +263,7 @@ class OllamaAILLM {
prompt_tokens: res.prompt_eval_count,
completion_tokens: res.eval_count,
total_tokens: res.prompt_eval_count + res.eval_count,
duration: res.eval_duration / 1e9,
},
};
})
@@ -282,8 +283,9 @@ class OllamaAILLM {
prompt_tokens: result.output.usage.prompt_tokens,
completion_tokens: result.output.usage.completion_tokens,
total_tokens: result.output.usage.total_tokens,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
outputTps:
result.output.usage.completion_tokens / result.output.usage.duration,
duration: result.output.usage.duration,
},
};
}
@@ -349,6 +351,7 @@ class OllamaAILLM {
if (chunk.done) {
usage.prompt_tokens = chunk.prompt_eval_count;
usage.completion_tokens = chunk.eval_count;
usage.duration = chunk.eval_duration / 1e9;
writeResponseChunk(response, {
uuid,
sources,

View File

@@ -39,6 +39,8 @@ class LLMPerformanceMonitor {
}
/**
* Wraps a function and logs the duration (in seconds) of the function call.
* If the output contains a `usage.duration` property, it will be used instead of the calculated duration.
* This allows providers to supply more accurate timing information.
* @param {Function} func
* @returns {Promise<{output: any, duration: number}>}
*/
@@ -47,7 +49,8 @@ class LLMPerformanceMonitor {
const start = Date.now();
const output = await func; // is a promise
const end = Date.now();
return { output, duration: (end - start) / 1000 };
const duration = output?.usage?.duration ?? (end - start) / 1000;
return { output, duration };
})();
}
@@ -77,19 +80,20 @@ class LLMPerformanceMonitor {
stream.endMeasurement = (reportedUsage = {}) => {
const end = Date.now();
const duration = (end - stream.start) / 1000;
const estimatedDuration = (end - stream.start) / 1000;
// Merge the reported usage with the existing metrics
// so the math in the metrics object is correct when calculating
stream.metrics = {
...stream.metrics,
...reportedUsage,
duration: reportedUsage?.duration ?? estimatedDuration,
};
stream.metrics.total_tokens =
stream.metrics.prompt_tokens + (stream.metrics.completion_tokens || 0);
stream.metrics.outputTps = stream.metrics.completion_tokens / duration;
stream.metrics.duration = duration;
stream.metrics.outputTps =
stream.metrics.completion_tokens / stream.metrics.duration;
return stream.metrics;
};
return stream;