mirror of
https://github.com/Mintplex-Labs/anything-llm
synced 2026-04-25 17:15:37 +02:00
fix: Use eval_duration for output TPS calculations in Ollama LLM provider (#4568)
* fix: Use eval_duration for output TPS calculations and add it as a metric field
* refactor: use eval_duration from Ollama metrics
* move eval_duration into the usage object
* overwrite duration in the Ollama provider (WIP: optional param for measureAsyncFunction)
* allow an overloaded duration in measureAsyncFunction
* simplify the flow for duration tracking

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in the branches listed on GitHub; it was committed by GitHub.
parent
cf76bad452
commit
7a0c149d2e
@@ -263,6 +263,7 @@ class OllamaAILLM {
|
||||
prompt_tokens: res.prompt_eval_count,
|
||||
completion_tokens: res.eval_count,
|
||||
total_tokens: res.prompt_eval_count + res.eval_count,
|
||||
duration: res.eval_duration / 1e9,
|
||||
},
|
||||
};
|
||||
})
|
||||
@@ -282,8 +283,9 @@ class OllamaAILLM {
|
||||
prompt_tokens: result.output.usage.prompt_tokens,
|
||||
completion_tokens: result.output.usage.completion_tokens,
|
||||
total_tokens: result.output.usage.total_tokens,
|
||||
outputTps: result.output.usage.completion_tokens / result.duration,
|
||||
duration: result.duration,
|
||||
outputTps:
|
||||
result.output.usage.completion_tokens / result.output.usage.duration,
|
||||
duration: result.output.usage.duration,
|
||||
},
|
||||
};
|
||||
}
|
||||
@@ -349,6 +351,7 @@ class OllamaAILLM {
|
||||
if (chunk.done) {
|
||||
usage.prompt_tokens = chunk.prompt_eval_count;
|
||||
usage.completion_tokens = chunk.eval_count;
|
||||
usage.duration = chunk.eval_duration / 1e9;
|
||||
writeResponseChunk(response, {
|
||||
uuid,
|
||||
sources,
|
||||
|
||||
@@ -39,6 +39,8 @@ class LLMPerformanceMonitor {
|
||||
}
|
||||
/**
|
||||
* Wraps a function and logs the duration (in seconds) of the function call.
|
||||
* If the output contains a `usage.duration` property, it will be used instead of the calculated duration.
|
||||
* This allows providers to supply more accurate timing information.
|
||||
* @param {Function} func
|
||||
* @returns {Promise<{output: any, duration: number}>}
|
||||
*/
|
||||
@@ -47,7 +49,8 @@ class LLMPerformanceMonitor {
|
||||
const start = Date.now();
|
||||
const output = await func; // is a promise
|
||||
const end = Date.now();
|
||||
return { output, duration: (end - start) / 1000 };
|
||||
const duration = output?.usage?.duration ?? (end - start) / 1000;
|
||||
return { output, duration };
|
||||
})();
|
||||
}
|
||||
|
||||
@@ -77,19 +80,20 @@ class LLMPerformanceMonitor {
|
||||
|
||||
stream.endMeasurement = (reportedUsage = {}) => {
|
||||
const end = Date.now();
|
||||
const duration = (end - stream.start) / 1000;
|
||||
const estimatedDuration = (end - stream.start) / 1000;
|
||||
|
||||
// Merge the reported usage with the existing metrics
|
||||
// so the math in the metrics object is correct when calculating
|
||||
stream.metrics = {
|
||||
...stream.metrics,
|
||||
...reportedUsage,
|
||||
duration: reportedUsage?.duration ?? estimatedDuration,
|
||||
};
|
||||
|
||||
stream.metrics.total_tokens =
|
||||
stream.metrics.prompt_tokens + (stream.metrics.completion_tokens || 0);
|
||||
stream.metrics.outputTps = stream.metrics.completion_tokens / duration;
|
||||
stream.metrics.duration = duration;
|
||||
stream.metrics.outputTps =
|
||||
stream.metrics.completion_tokens / stream.metrics.duration;
|
||||
return stream.metrics;
|
||||
};
|
||||
return stream;
|
||||
|
||||
Reference in New Issue
Block a user