From 1ccf46815809d0765cc35e9696dc983fb4e388db Mon Sep 17 00:00:00 2001
From: Marcello Fitton <106866560+angelplusultra@users.noreply.github.com>
Date: Thu, 12 Feb 2026 14:40:35 -0800
Subject: [PATCH] fix: correct TPS calculation for Generic OpenAI provider with
 llama.cpp (#4981)

* add check for timings field on final chunk to override usage data

* refactor: extract llama.cpp timings into reusable private method

Move timings extraction into #extractLlamaCppTimings so it can be
shared by both streaming (handleStream) and non-streaming
(getChatCompletion) code paths.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* lint and cleanup

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
---
 .../utils/AiProviders/genericOpenAi/index.js  | 35 +++++++++++++++----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/server/utils/AiProviders/genericOpenAi/index.js b/server/utils/AiProviders/genericOpenAi/index.js
index 7572a535b..1451b7e88 100644
--- a/server/utils/AiProviders/genericOpenAi/index.js
+++ b/server/utils/AiProviders/genericOpenAi/index.js
@@ -149,6 +149,23 @@ class GenericOpenAiLLM {
     ];
   }
 
+  /**
+   * Overrides `usage` in place with llama.cpp's generation-only `timings`
+   * (predicted_n tokens, predicted_ms milliseconds) when they are present.
+   * Missing or non-numeric values, and non-positive durations, are skipped so
+   * callers keep their measured values and TPS never divides by zero or NaN.
+   * @param {Object} response - the API response or final streaming chunk
+   * @param {Object} usage - the usage object to mutate
+   */
+  #extractLlamaCppTimings(response, usage) {
+    const timings = response?.timings;
+    if (!timings) return;
+    const count = Number(timings.predicted_n);
+    const seconds = Number(timings.predicted_ms) / 1000;
+    if (Number.isFinite(count) && count >= 0) usage.completion_tokens = count;
+    if (Number.isFinite(seconds) && seconds > 0) usage.duration = seconds;
+  }
+
   /**
    * Parses and prepends reasoning from the response and returns the full text response.
    * @param {Object} response
@@ -184,15 +201,19 @@ class GenericOpenAiLLM {
     )
       return null;
 
+    const usage = {
+      prompt_tokens: result.output?.usage?.prompt_tokens || 0,
+      completion_tokens: result.output?.usage?.completion_tokens || 0,
+      total_tokens: result.output?.usage?.total_tokens || 0,
+      duration: result.duration,
+    };
+    this.#extractLlamaCppTimings(result.output, usage);
+
     return {
       textResponse: this.#parseReasoningFromResponse(result.output.choices[0]),
       metrics: {
-        prompt_tokens: result.output?.usage?.prompt_tokens || 0,
-        completion_tokens: result.output?.usage?.completion_tokens || 0,
-        total_tokens: result.output?.usage?.total_tokens || 0,
-        outputTps:
-          (result.output?.usage?.completion_tokens || 0) / result.duration,
-        duration: result.duration,
+        ...usage,
+        outputTps: usage.completion_tokens / usage.duration,
         model: this.model,
         provider: this.className,
         timestamp: new Date(),
@@ -332,6 +353,8 @@ class GenericOpenAiLLM {
               close: true,
               error: false,
             });
+            this.#extractLlamaCppTimings(chunk, usage);
+
             response.removeListener("close", handleAbort);
             stream?.endMeasurement(usage);
             resolve(fullText);