fix: Use eval_duration for output TPS calculations in Ollama LLM provider (#4568)

* fix: Use eval_duration for output TPS calculations and add as a metric field

* refactor usage of eval_duration from ollama metrics

* move eval_duration to usage

* overwrite duration in Ollama provider (WIP: make duration an optional param of measureAsyncFunction)

* allow the calculated duration to be overridden in measureAsyncFunction

* simplify flow for duration tracking

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
jonathanortega2023
2025-11-20 15:02:47 -06:00
committed by GitHub
parent cf76bad452
commit 7a0c149d2e
2 changed files with 13 additions and 6 deletions

View File

@@ -263,6 +263,7 @@ class OllamaAILLM {
prompt_tokens: res.prompt_eval_count,
completion_tokens: res.eval_count,
total_tokens: res.prompt_eval_count + res.eval_count,
duration: res.eval_duration / 1e9,
},
};
})
@@ -282,8 +283,9 @@ class OllamaAILLM {
prompt_tokens: result.output.usage.prompt_tokens,
completion_tokens: result.output.usage.completion_tokens,
total_tokens: result.output.usage.total_tokens,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
outputTps:
result.output.usage.completion_tokens / result.output.usage.duration,
duration: result.output.usage.duration,
},
};
}
@@ -349,6 +351,7 @@ class OllamaAILLM {
if (chunk.done) {
usage.prompt_tokens = chunk.prompt_eval_count;
usage.completion_tokens = chunk.eval_count;
usage.duration = chunk.eval_duration / 1e9;
writeResponseChunk(response, {
uuid,
sources,

View File

@@ -39,6 +39,8 @@ class LLMPerformanceMonitor {
}
/**
* Wraps a function and logs the duration (in seconds) of the function call.
* If the output contains a `usage.duration` property, it will be used instead of the calculated duration.
* This allows providers to supply more accurate timing information.
* @param {Function} func
* @returns {Promise<{output: any, duration: number}>}
*/
@@ -47,7 +49,8 @@ class LLMPerformanceMonitor {
const start = Date.now();
const output = await func; // is a promise
const end = Date.now();
return { output, duration: (end - start) / 1000 };
const duration = output?.usage?.duration ?? (end - start) / 1000;
return { output, duration };
})();
}
@@ -77,19 +80,20 @@ class LLMPerformanceMonitor {
stream.endMeasurement = (reportedUsage = {}) => {
const end = Date.now();
const duration = (end - stream.start) / 1000;
const estimatedDuration = (end - stream.start) / 1000;
// Merge the reported usage with the existing metrics
// so the math in the metrics object is correct when calculating
stream.metrics = {
...stream.metrics,
...reportedUsage,
duration: reportedUsage?.duration ?? estimatedDuration,
};
stream.metrics.total_tokens =
stream.metrics.prompt_tokens + (stream.metrics.completion_tokens || 0);
stream.metrics.outputTps = stream.metrics.completion_tokens / duration;
stream.metrics.duration = duration;
stream.metrics.outputTps =
stream.metrics.completion_tokens / stream.metrics.duration;
return stream.metrics;
};
return stream;