mirror of
https://github.com/Mintplex-Labs/anything-llm
synced 2026-04-25 17:15:37 +02:00
#5112: OpenRouter (OR) stream metrics and finish reason (#5117)
* Update metric tracking for OR (OpenRouter) + fix finish_reason missing from transitive chunks. * Linting + comments. Closes #5113, resolves #5112.
This commit is contained in:
@@ -304,13 +304,9 @@ class OpenRouterLLM {
|
||||
user: user?.id ? `user_${user.id}` : "",
|
||||
}),
|
||||
messages,
|
||||
// We have to manually count the tokens
|
||||
// OpenRouter has a ton of providers and they all can return slightly differently
|
||||
// some return chunk.usage on STOP, some do it after stop; it's inconsistent.
|
||||
// So it is possible reported metrics are inaccurate since we cannot reliably
|
||||
// catch the metrics before resolving the stream - so we just pretend this functionality
|
||||
// is not available.
|
||||
runPromptTokenCalculation: true,
|
||||
// OpenRouter returns the usage in the stream as the very last chunk **after** the finish reason.
|
||||
// so we don't need to run the prompt token calculation.
|
||||
runPromptTokenCalculation: false,
|
||||
modelTag: this.model,
|
||||
provider: this.className,
|
||||
});
|
||||
@@ -320,6 +316,8 @@ class OpenRouterLLM {
|
||||
|
||||
/**
|
||||
* Handles the default stream response for a chat.
|
||||
* - Handle weird OR timeout behavior where the stream never self-closes.
|
||||
* - Handle the usage metrics being returned in the stream as the very last chunk **after** the finish reason.
|
||||
* @param {import("express").Response} response
|
||||
* @param {import('../../helpers/chat/LLMPerformanceMonitor').MonitoredStream} stream
|
||||
* @param {Object} responseProps
|
||||
@@ -328,6 +326,8 @@ class OpenRouterLLM {
|
||||
handleStream(response, stream, responseProps) {
|
||||
const timeoutThresholdMs = this.timeout;
|
||||
const { uuid = uuidv4(), sources = [] } = responseProps;
|
||||
let hasUsageMetrics = false;
|
||||
let usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
|
||||
|
||||
return new Promise(async (resolve) => {
|
||||
let fullText = "";
|
||||
@@ -336,14 +336,8 @@ class OpenRouterLLM {
|
||||
let pplxCitations = []; // Array of inline citations for Perplexity models (if applicable)
|
||||
let isPerplexity = this.isPerplexityModel;
|
||||
|
||||
// Establish listener to early-abort a streaming response
|
||||
// in case things go sideways or the user does not like the response.
|
||||
// We preserve the generated text but continue as if chat was completed
|
||||
// to preserve previously generated content.
|
||||
const handleAbort = () => {
|
||||
stream?.endMeasurement({
|
||||
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
|
||||
});
|
||||
stream?.endMeasurement(usage);
|
||||
clientAbortedHandler(resolve, fullText);
|
||||
};
|
||||
response.on("close", handleAbort);
|
||||
@@ -375,9 +369,7 @@ class OpenRouterLLM {
|
||||
});
|
||||
clearInterval(timeoutCheck);
|
||||
response.removeListener("close", handleAbort);
|
||||
stream?.endMeasurement({
|
||||
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
|
||||
});
|
||||
stream?.endMeasurement(usage);
|
||||
resolve(fullText);
|
||||
}
|
||||
}, 500);
|
||||
@@ -389,6 +381,15 @@ class OpenRouterLLM {
|
||||
const reasoningToken = message?.delta?.reasoning;
|
||||
lastChunkTime = Number(new Date());
|
||||
|
||||
if (chunk.hasOwnProperty("usage") && !hasUsageMetrics) {
|
||||
hasUsageMetrics = true;
|
||||
usage = {
|
||||
prompt_tokens: chunk.usage.prompt_tokens,
|
||||
completion_tokens: chunk.usage.completion_tokens,
|
||||
total_tokens: chunk.usage.total_tokens,
|
||||
};
|
||||
}
|
||||
|
||||
// Some models will return citations (e.g. Perplexity) - we should preserve them for inline citations if applicable.
|
||||
if (
|
||||
isPerplexity &&
|
||||
@@ -464,7 +465,7 @@ class OpenRouterLLM {
|
||||
});
|
||||
}
|
||||
|
||||
if (message.finish_reason !== null) {
|
||||
if (message?.finish_reason) {
|
||||
writeResponseChunk(response, {
|
||||
uuid,
|
||||
sources,
|
||||
@@ -473,14 +474,14 @@ class OpenRouterLLM {
|
||||
close: true,
|
||||
error: false,
|
||||
});
|
||||
response.removeListener("close", handleAbort);
|
||||
clearInterval(timeoutCheck);
|
||||
stream?.endMeasurement({
|
||||
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
|
||||
});
|
||||
resolve(fullText);
|
||||
}
|
||||
}
|
||||
|
||||
// Stream completed naturally - resolve with final metrics
|
||||
response.removeListener("close", handleAbort);
|
||||
clearInterval(timeoutCheck);
|
||||
stream?.endMeasurement(usage);
|
||||
resolve(fullText);
|
||||
} catch (e) {
|
||||
writeResponseChunk(response, {
|
||||
uuid,
|
||||
@@ -492,9 +493,7 @@ class OpenRouterLLM {
|
||||
});
|
||||
response.removeListener("close", handleAbort);
|
||||
clearInterval(timeoutCheck);
|
||||
stream?.endMeasurement({
|
||||
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
|
||||
});
|
||||
stream?.endMeasurement(usage);
|
||||
resolve(fullText);
|
||||
}
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user