From 7a0c149d2e43b9ea0a44cd76a5b568e5a8fad1a7 Mon Sep 17 00:00:00 2001 From: jonathanortega2023 <34843188+jonathanortega2023@users.noreply.github.com> Date: Thu, 20 Nov 2025 15:02:47 -0600 Subject: [PATCH] fix: Use eval_duration for output TPS calculations in Ollama LLM provider (#4568) * fix: Use eval_duration for output TPS calculations and add as a metric field * refactor usage of eval_duration from ollama metrics * move eval_duration to usage * overwrite duration in ollama provider wip measureAsyncFunction optional param * allow for overloaded duration in measureAsyncFunction * simplify flow for duration tracking --------- Co-authored-by: shatfield4 Co-authored-by: Timothy Carambat --- server/utils/AiProviders/ollama/index.js | 7 +++++-- server/utils/helpers/chat/LLMPerformanceMonitor.js | 12 ++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/server/utils/AiProviders/ollama/index.js b/server/utils/AiProviders/ollama/index.js index b88a81218..c7614d717 100644 --- a/server/utils/AiProviders/ollama/index.js +++ b/server/utils/AiProviders/ollama/index.js @@ -263,6 +263,7 @@ class OllamaAILLM { prompt_tokens: res.prompt_eval_count, completion_tokens: res.eval_count, total_tokens: res.prompt_eval_count + res.eval_count, + duration: res.eval_duration / 1e9, }, }; }) @@ -282,8 +283,9 @@ class OllamaAILLM { prompt_tokens: result.output.usage.prompt_tokens, completion_tokens: result.output.usage.completion_tokens, total_tokens: result.output.usage.total_tokens, - outputTps: result.output.usage.completion_tokens / result.duration, - duration: result.duration, + outputTps: + result.output.usage.completion_tokens / result.output.usage.duration, + duration: result.output.usage.duration, }, }; } @@ -349,6 +351,7 @@ class OllamaAILLM { if (chunk.done) { usage.prompt_tokens = chunk.prompt_eval_count; usage.completion_tokens = chunk.eval_count; + usage.duration = chunk.eval_duration / 1e9; writeResponseChunk(response, { uuid, sources, diff --git a/server/utils/helpers/chat/LLMPerformanceMonitor.js b/server/utils/helpers/chat/LLMPerformanceMonitor.js index bd02863ed..070df5907 100644 --- a/server/utils/helpers/chat/LLMPerformanceMonitor.js +++ b/server/utils/helpers/chat/LLMPerformanceMonitor.js @@ -39,6 +39,8 @@ class LLMPerformanceMonitor { } /** * Wraps a function and logs the duration (in seconds) of the function call. + * If the output contains a `usage.duration` property, it will be used instead of the calculated duration. + * This allows providers to supply more accurate timing information. * @param {Function} func * @returns {Promise<{output: any, duration: number}>} */ @@ -47,7 +49,8 @@ class LLMPerformanceMonitor { const start = Date.now(); const output = await func; // is a promise const end = Date.now(); - return { output, duration: (end - start) / 1000 }; + const duration = output?.usage?.duration ?? (end - start) / 1000; + return { output, duration }; })(); } @@ -77,19 +80,20 @@ class LLMPerformanceMonitor { stream.endMeasurement = (reportedUsage = {}) => { const end = Date.now(); - const duration = (end - stream.start) / 1000; + const estimatedDuration = (end - stream.start) / 1000; // Merge the reported usage with the existing metrics // so the math in the metrics object is correct when calculating stream.metrics = { ...stream.metrics, ...reportedUsage, + duration: reportedUsage?.duration ?? estimatedDuration, }; stream.metrics.total_tokens = stream.metrics.prompt_tokens + (stream.metrics.completion_tokens || 0); - stream.metrics.outputTps = stream.metrics.completion_tokens / duration; - stream.metrics.duration = duration; + stream.metrics.outputTps = + stream.metrics.completion_tokens / stream.metrics.duration; return stream.metrics; }; return stream;