From 7a0c149d2e43b9ea0a44cd76a5b568e5a8fad1a7 Mon Sep 17 00:00:00 2001
From: jonathanortega2023
 <34843188+jonathanortega2023@users.noreply.github.com>
Date: Thu, 20 Nov 2025 15:02:47 -0600
Subject: [PATCH] fix: Use eval_duration for output TPS calculations in Ollama
 LLM provider (#4568)

* fix: Use eval_duration for output TPS calculations and add as a metric field

* refactor usage of eval_duration from ollama metrics

* move eval_duration to usage

* overwrite duration in ollama provider wip measureAsyncFunction optional param

* allow for overloaded duration in measureAsyncFunction

* simplify flow for duration tracking

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
---
 server/utils/AiProviders/ollama/index.js           |  7 +++++--
 server/utils/helpers/chat/LLMPerformanceMonitor.js | 12 ++++++++----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/server/utils/AiProviders/ollama/index.js b/server/utils/AiProviders/ollama/index.js
index b88a81218..c7614d717 100644
--- a/server/utils/AiProviders/ollama/index.js
+++ b/server/utils/AiProviders/ollama/index.js
@@ -263,6 +263,7 @@ class OllamaAILLM {
               prompt_tokens: res.prompt_eval_count,
               completion_tokens: res.eval_count,
               total_tokens: res.prompt_eval_count + res.eval_count,
+              duration: res.eval_duration / 1e9,
             },
           };
         })
@@ -282,8 +283,9 @@ class OllamaAILLM {
         prompt_tokens: result.output.usage.prompt_tokens,
         completion_tokens: result.output.usage.completion_tokens,
         total_tokens: result.output.usage.total_tokens,
-        outputTps: result.output.usage.completion_tokens / result.duration,
-        duration: result.duration,
+        outputTps:
+          result.output.usage.completion_tokens / result.output.usage.duration,
+        duration: result.output.usage.duration,
       },
     };
   }
@@ -349,6 +351,7 @@ class OllamaAILLM {
           if (chunk.done) {
             usage.prompt_tokens = chunk.prompt_eval_count;
             usage.completion_tokens = chunk.eval_count;
+            usage.duration = chunk.eval_duration / 1e9;
             writeResponseChunk(response, {
               uuid,
               sources,
diff --git a/server/utils/helpers/chat/LLMPerformanceMonitor.js b/server/utils/helpers/chat/LLMPerformanceMonitor.js
index bd02863ed..070df5907 100644
--- a/server/utils/helpers/chat/LLMPerformanceMonitor.js
+++ b/server/utils/helpers/chat/LLMPerformanceMonitor.js
@@ -39,6 +39,8 @@ class LLMPerformanceMonitor {
   }
   /**
    * Wraps a function and logs the duration (in seconds) of the function call.
+   * If the output contains a non-nullish `usage.duration` (in seconds), it is returned instead of the measured wall-clock duration.
+   * This allows providers to supply more accurate timing information.
    * @param {Function} func
    * @returns {Promise<{output: any, duration: number}>}
    */
@@ -47,7 +49,8 @@ class LLMPerformanceMonitor {
       const start = Date.now();
       const output = await func; // is a promise
       const end = Date.now();
-      return { output, duration: (end - start) / 1000 };
+      const duration = output?.usage?.duration ?? (end - start) / 1000;
+      return { output, duration };
     })();
   }
 
@@ -77,19 +80,20 @@ class LLMPerformanceMonitor {
 
     stream.endMeasurement = (reportedUsage = {}) => {
       const end = Date.now();
-      const duration = (end - stream.start) / 1000;
+      const estimatedDuration = (end - stream.start) / 1000;
 
       // Merge the reported usage with the existing metrics
       // so the math in the metrics object is correct when calculating
       stream.metrics = {
         ...stream.metrics,
         ...reportedUsage,
+        duration: reportedUsage?.duration ?? estimatedDuration,
       };
 
       stream.metrics.total_tokens =
         stream.metrics.prompt_tokens + (stream.metrics.completion_tokens || 0);
-      stream.metrics.outputTps = stream.metrics.completion_tokens / duration;
-      stream.metrics.duration = duration;
+      stream.metrics.outputTps =
+        stream.metrics.completion_tokens / stream.metrics.duration;
       return stream.metrics;
     };
     return stream;