Fix #5112: OR stream metrics and finish reason (#5117)

* update metric tracking for OR + fix finish_reason missing from transitive chunks

* linting + comments
closes #5113
resolves #5112
This commit is contained in:
Timothy Carambat
2026-03-02 18:53:29 -08:00
committed by GitHub
parent e145c391e9
commit 86431c6833

View File

@@ -304,13 +304,9 @@ class OpenRouterLLM {
user: user?.id ? `user_${user.id}` : "",
}),
messages,
// We have to manually count the tokens
// OpenRouter has a ton of providers and they all can return slightly differently
// some return chunk.usage on STOP, some do it after stop, its inconsistent.
// So it is possible reported metrics are inaccurate since we cannot reliably
// catch the metrics before resolving the stream - so we just pretend this functionality
// is not available.
runPromptTokenCalculation: true,
// OpenRouter returns the usage in the stream as the very last chunk **after** the finish reason.
// so we don't need to run the prompt token calculation.
runPromptTokenCalculation: false,
modelTag: this.model,
provider: this.className,
});
@@ -320,6 +316,8 @@ class OpenRouterLLM {
/**
* Handles the default stream response for a chat.
* - Handle weird OR timeout behavior where the stream never self-closes.
* - Handle the usage metrics being returned in the stream as the very last chunk **after** the finish reason.
* @param {import("express").Response} response
* @param {import('../../helpers/chat/LLMPerformanceMonitor').MonitoredStream} stream
* @param {Object} responseProps
@@ -328,6 +326,8 @@ class OpenRouterLLM {
handleStream(response, stream, responseProps) {
const timeoutThresholdMs = this.timeout;
const { uuid = uuidv4(), sources = [] } = responseProps;
let hasUsageMetrics = false;
let usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
return new Promise(async (resolve) => {
let fullText = "";
@@ -336,14 +336,8 @@ class OpenRouterLLM {
let pplxCitations = []; // Array of inline citations for Perplexity models (if applicable)
let isPerplexity = this.isPerplexityModel;
// Establish listener to early-abort a streaming response
// in case things go sideways or the user does not like the response.
// We preserve the generated text but continue as if chat was completed
// to preserve previously generated content.
const handleAbort = () => {
stream?.endMeasurement({
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
});
stream?.endMeasurement(usage);
clientAbortedHandler(resolve, fullText);
};
response.on("close", handleAbort);
@@ -375,9 +369,7 @@ class OpenRouterLLM {
});
clearInterval(timeoutCheck);
response.removeListener("close", handleAbort);
stream?.endMeasurement({
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
});
stream?.endMeasurement(usage);
resolve(fullText);
}
}, 500);
@@ -389,6 +381,15 @@ class OpenRouterLLM {
const reasoningToken = message?.delta?.reasoning;
lastChunkTime = Number(new Date());
if (chunk.hasOwnProperty("usage") && !hasUsageMetrics) {
hasUsageMetrics = true;
usage = {
prompt_tokens: chunk.usage.prompt_tokens,
completion_tokens: chunk.usage.completion_tokens,
total_tokens: chunk.usage.total_tokens,
};
}
// Some models will return citations (e.g. Perplexity) - we should preserve them for inline citations if applicable.
if (
isPerplexity &&
@@ -464,7 +465,7 @@ class OpenRouterLLM {
});
}
if (message.finish_reason !== null) {
if (message?.finish_reason) {
writeResponseChunk(response, {
uuid,
sources,
@@ -473,14 +474,14 @@ class OpenRouterLLM {
close: true,
error: false,
});
response.removeListener("close", handleAbort);
clearInterval(timeoutCheck);
stream?.endMeasurement({
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
});
resolve(fullText);
}
}
// Stream completed naturally - resolve with final metrics
response.removeListener("close", handleAbort);
clearInterval(timeoutCheck);
stream?.endMeasurement(usage);
resolve(fullText);
} catch (e) {
writeResponseChunk(response, {
uuid,
@@ -492,9 +493,7 @@ class OpenRouterLLM {
});
response.removeListener("close", handleAbort);
clearInterval(timeoutCheck);
stream?.endMeasurement({
completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
});
stream?.endMeasurement(usage);
resolve(fullText);
}
});