Fix #5112: OR stream metrics and finish reason (#5117)

* update metric tracking for OR + fix finish_reason missing from transitive chunks

* linting + comments
closes #5113
resolves #5112
This commit is contained in:
Timothy Carambat
2026-03-02 18:53:29 -08:00
committed by GitHub
parent e145c391e9
commit 86431c6833

View File

@@ -304,13 +304,9 @@ class OpenRouterLLM {
user: user?.id ? `user_${user.id}` : "",
}),
messages,
// We have to manually count the tokens
// OpenRouter has a ton of providers and they all can return slightly differently
// some return chunk.usage on STOP, some do it after stop, its inconsistent.
// So it is possible reported metrics are inaccurate since we cannot reliably
// catch the metrics before resolving the stream - so we just pretend this functionality
// is not available.
runPromptTokenCalculation: true,
// OpenRouter returns the usage in the stream as the very last chunk **after** the finish reason.
// so we don't need to run the prompt token calculation.
runPromptTokenCalculation: false,
modelTag: this.model,
provider: this.className,
});
@@ -320,6 +316,8 @@ class OpenRouterLLM {
/**
* Handles the default stream response for a chat.
* - Handle weird OR timeout behavior where the stream never self-closes.
* - Handle the usage metrics being returned in the stream as the very last chunk **after** the finish reason.
* @param {import("express").Response} response
* @param {import('../../helpers/chat/LLMPerformanceMonitor').MonitoredStream} stream
* @param {Object} responseProps
@@ -328,6 +326,8 @@ class OpenRouterLLM {
handleStream(response, stream, responseProps) {
const timeoutThresholdMs = this.timeout;
const { uuid = uuidv4(), sources = [] } = responseProps;
let hasUsageMetrics = false;
let usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
return new Promise(async (resolve) => {
let fullText = "";
@@ -336,14 +336,8 @@ class OpenRouterLLM {
let pplxCitations = []; // Array of inline citations for Perplexity models (if applicable)
let isPerplexity = this.isPerplexityModel;
// Establish listener to early-abort a streaming response
// in case things go sideways or the user does not like the response.
// We preserve the generated text but continue as if chat was completed
// to preserve previously generated content.
const handleAbort = () => {
stream?.endMeasurement({
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
});
stream?.endMeasurement(usage);
clientAbortedHandler(resolve, fullText);
};
response.on("close", handleAbort);
@@ -375,9 +369,7 @@ class OpenRouterLLM {
});
clearInterval(timeoutCheck);
response.removeListener("close", handleAbort);
stream?.endMeasurement({
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
});
stream?.endMeasurement(usage);
resolve(fullText);
}
}, 500);
@@ -389,6 +381,15 @@ class OpenRouterLLM {
const reasoningToken = message?.delta?.reasoning;
lastChunkTime = Number(new Date());
if (chunk.hasOwnProperty("usage") && !hasUsageMetrics) {
hasUsageMetrics = true;
usage = {
prompt_tokens: chunk.usage.prompt_tokens,
completion_tokens: chunk.usage.completion_tokens,
total_tokens: chunk.usage.total_tokens,
};
}
// Some models will return citations (e.g. Perplexity) - we should preserve them for inline citations if applicable.
if (
isPerplexity &&
@@ -464,7 +465,7 @@ class OpenRouterLLM {
});
}
if (message.finish_reason !== null) {
if (message?.finish_reason) {
writeResponseChunk(response, {
uuid,
sources,
@@ -473,14 +474,14 @@ class OpenRouterLLM {
close: true,
error: false,
});
response.removeListener("close", handleAbort);
clearInterval(timeoutCheck);
stream?.endMeasurement({
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
});
resolve(fullText);
}
}
// Stream completed naturally - resolve with final metrics
response.removeListener("close", handleAbort);
clearInterval(timeoutCheck);
stream?.endMeasurement(usage);
resolve(fullText);
} catch (e) {
writeResponseChunk(response, {
uuid,
@@ -492,9 +493,7 @@ class OpenRouterLLM {
});
response.removeListener("close", handleAbort);
clearInterval(timeoutCheck);
stream?.endMeasurement({
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
});
stream?.endMeasurement(usage);
resolve(fullText);
}
});