fix: correct TPS calculation for Generic OpenAI provider with llama.cpp (#4981)

* add check for timings field on final chunk to override usage data

* refactor: extract llama.cpp timings into reusable private method

Move timings extraction into #extractLlamaCppTimings so it can be shared
by both streaming (handleStream) and non-streaming (getChatCompletion)
code paths.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* lint and cleanup

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
Marcello Fitton
2026-02-12 14:40:35 -08:00
committed by GitHub
parent 5fb1281891
commit 1ccf468158

View File

@@ -149,6 +149,23 @@ class GenericOpenAiLLM {
];
}
/**
* Extracts accurate generation-only timing and token count from a llama.cpp
* response or streaming chunk. Mutates the provided usage object in place
* so it can be used by both streaming and non-streaming code paths.
* @param {Object} response - the API response or final streaming chunk
* @param {Object} usage - the usage object to mutate
*/
#extractLlamaCppTimings(response, usage) {
if (!response || !response.timings) return;
if (response.timings.hasOwnProperty("predicted_n"))
usage.completion_tokens = Number(response.timings.predicted_n);
if (response.timings.hasOwnProperty("predicted_ms"))
usage.duration = Number(response.timings.predicted_ms) / 1000;
}
/**
* Parses and prepends reasoning from the response and returns the full text response.
* @param {Object} response
@@ -184,15 +201,19 @@ class GenericOpenAiLLM {
)
return null;
const usage = {
prompt_tokens: result.output?.usage?.prompt_tokens || 0,
completion_tokens: result.output?.usage?.completion_tokens || 0,
total_tokens: result.output?.usage?.total_tokens || 0,
duration: result.duration,
};
this.#extractLlamaCppTimings(result.output, usage);
return {
textResponse: this.#parseReasoningFromResponse(result.output.choices[0]),
metrics: {
prompt_tokens: result.output?.usage?.prompt_tokens || 0,
completion_tokens: result.output?.usage?.completion_tokens || 0,
total_tokens: result.output?.usage?.total_tokens || 0,
outputTps:
(result.output?.usage?.completion_tokens || 0) / result.duration,
duration: result.duration,
...usage,
outputTps: usage.completion_tokens / usage.duration,
model: this.model,
provider: this.className,
timestamp: new Date(),
@@ -332,6 +353,8 @@ class GenericOpenAiLLM {
close: true,
error: false,
});
this.#extractLlamaCppTimings(chunk, usage);
response.removeListener("close", handleAbort);
stream?.endMeasurement(usage);
resolve(fullText);