From 1ccf46815809d0765cc35e9696dc983fb4e388db Mon Sep 17 00:00:00 2001 From: Marcello Fitton <106866560+angelplusultra@users.noreply.github.com> Date: Thu, 12 Feb 2026 14:40:35 -0800 Subject: [PATCH] fix: correct TPS calculation for Generic OpenAI provider with llama.cpp (#4981) * add check for timings field on final chunk to override usage data * refactor: extract llama.cpp timings into reusable private method Move timings extraction into #extractLlamaCppTimings so it can be shared by both streaming (handleStream) and non-streaming (getChatCompletion) code paths. Co-Authored-By: Claude Opus 4.6 * lint and cleanup --------- Co-authored-by: Claude Opus 4.6 Co-authored-by: Timothy Carambat --- .../utils/AiProviders/genericOpenAi/index.js | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/server/utils/AiProviders/genericOpenAi/index.js b/server/utils/AiProviders/genericOpenAi/index.js index 7572a535b..1451b7e88 100644 --- a/server/utils/AiProviders/genericOpenAi/index.js +++ b/server/utils/AiProviders/genericOpenAi/index.js @@ -149,6 +149,23 @@ class GenericOpenAiLLM { ]; } + /** + * Extracts accurate generation-only timing and token count from a llama.cpp + * response or streaming chunk. Mutates the provided usage object in place + * so it can be used by both streaming and non-streaming code paths. + * @param {Object} response - the API response or final streaming chunk + * @param {Object} usage - the usage object to mutate + */ + #extractLlamaCppTimings(response, usage) { + if (!response || !response.timings) return; + + if (response.timings.hasOwnProperty("predicted_n")) + usage.completion_tokens = Number(response.timings.predicted_n); + + if (response.timings.hasOwnProperty("predicted_ms")) + usage.duration = Number(response.timings.predicted_ms) / 1000; + } + /** * Parses and prepends reasoning from the response and returns the full text response. 
* @param {Object} response @@ -184,15 +201,19 @@ class GenericOpenAiLLM { ) return null; + const usage = { + prompt_tokens: result.output?.usage?.prompt_tokens || 0, + completion_tokens: result.output?.usage?.completion_tokens || 0, + total_tokens: result.output?.usage?.total_tokens || 0, + duration: result.duration, + }; + this.#extractLlamaCppTimings(result.output, usage); + return { textResponse: this.#parseReasoningFromResponse(result.output.choices[0]), metrics: { - prompt_tokens: result.output?.usage?.prompt_tokens || 0, - completion_tokens: result.output?.usage?.completion_tokens || 0, - total_tokens: result.output?.usage?.total_tokens || 0, - outputTps: - (result.output?.usage?.completion_tokens || 0) / result.duration, - duration: result.duration, + ...usage, + outputTps: usage.completion_tokens / usage.duration, model: this.model, provider: this.className, timestamp: new Date(), @@ -332,6 +353,8 @@ class GenericOpenAiLLM { close: true, error: false, }); + this.#extractLlamaCppTimings(chunk, usage); + response.removeListener("close", handleAbort); stream?.endMeasurement(usage); resolve(fullText);