fix: stop runaway crash-recovery loop on dead sessions

Two distinct bugs were combining to keep a dead session restarting forever:

Bug 1 (uncaught "The operation was aborted."):
  child_process.spawn emits 'error' asynchronously for ENOENT/EACCES/abort
  signal aborts. spawnSdkProcess() never attached an 'error' listener, so
  any async spawn failure became uncaughtException and escaped to the
  daemon-level handler. Attach an 'error' listener immediately after spawn,
  before the !child.pid early-return, so async spawn errors are logged
  (with errno code) and swallowed locally.

Bug 2 (sliding-window limiter never trips on slow restart cadence):
  RestartGuard tripped only when restartTimestamps.length exceeded
  MAX_WINDOWED_RESTARTS (10) within RESTART_WINDOW_MS (60s). With the 8s
  exponential-backoff cap, only ~7-8 restarts fit in the window, so a dead
  session that fails and restarts on 8s cycles would loop forever
  (consecutiveRestarts climbing past 30+ in observed logs). Add a
  consecutiveFailures counter that increments on every restart and resets
  only on recordSuccess(). Trip when consecutive failures exceed
  MAX_CONSECUTIVE_FAILURES (5) — meaning 5 restarts with zero successful
  processing in between proves the session is dead. Both guards now run in
  parallel: tight loops still trip the windowed cap; slow loops trip the
  consecutive-failure cap.

Also: when the SessionRoutes path trips the guard, drain pending messages
to 'abandoned' so the session does not reappear in
getSessionsWithPendingMessages and trigger another auto-start cycle. The
worker-service.ts path already does this via terminateSession.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-04-24 16:55:47 -07:00
parent 02201e64af
commit 43a7ab8964
4 changed files with 76 additions and 8 deletions

View File

@@ -819,12 +819,14 @@ export class WorkerService implements WorkerRef {
session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1; // Keep for logging
if (!restartAllowed) {
logger.error('SYSTEM', 'Restart guard tripped: too many restarts in window, stopping to prevent runaway costs', {
logger.error('SYSTEM', 'Restart guard tripped: session is dead, terminating', {
sessionId: session.sessionDbId,
pendingCount,
restartsInWindow: session.restartGuard.restartsInWindow,
windowMs: session.restartGuard.windowMs,
maxRestarts: session.restartGuard.maxRestarts
maxRestarts: session.restartGuard.maxRestarts,
consecutiveFailures: session.restartGuard.consecutiveFailuresSinceSuccess,
maxConsecutiveFailures: session.restartGuard.maxConsecutiveFailures
});
session.consecutiveRestarts = 0;
this.terminateSession(session.sessionDbId, 'max_restarts_exceeded');

View File

@@ -3,15 +3,26 @@
* Prevents tight-loop restarts (bug) while allowing legitimate occasional restarts
* over long sessions. Replaces the flat consecutiveRestarts counter that stranded
* pending messages after just 3 restarts over any timeframe (#2053).
*
* TWO INDEPENDENT TRIPS:
* 1. Sliding window: more than MAX_WINDOWED_RESTARTS within RESTART_WINDOW_MS.
* Catches genuinely tight loops (e.g. crash every <6s).
* 2. Consecutive failures: more than MAX_CONSECUTIVE_FAILURES restarts with
* NO successful processing in between. Catches dead sessions that
 *    repeatedly fail and restart on a slow exponential backoff cadence
* (e.g. 8s backoff cap + spawn failures = restartsInWindow stays under
* the windowed cap forever, but the session is clearly dead).
*/
const RESTART_WINDOW_MS = 60_000; // Only count restarts within last 60 seconds
const MAX_WINDOWED_RESTARTS = 10; // 10 restarts in 60s = runaway loop
const MAX_CONSECUTIVE_FAILURES = 5; // 5 restarts with no success in between = session is dead
const DECAY_AFTER_SUCCESS_MS = 5 * 60_000; // Clear history after 5min of uninterrupted success
export class RestartGuard {
private restartTimestamps: number[] = [];
private lastSuccessfulProcessing: number | null = null;
private consecutiveFailures: number = 0;
/**
* Record a restart and check if the guard should trip.
@@ -34,16 +45,23 @@ export class RestartGuard {
// Record this restart
this.restartTimestamps.push(now);
this.consecutiveFailures += 1;
// Check if we've exceeded the cap within the window
return this.restartTimestamps.length <= MAX_WINDOWED_RESTARTS;
// Trip if EITHER guard exceeds its limit:
// - Sliding window cap (tight loops)
// - Consecutive failures with no successful work (dead session, e.g. spawn always fails)
const withinWindowedCap = this.restartTimestamps.length <= MAX_WINDOWED_RESTARTS;
const withinConsecutiveCap = this.consecutiveFailures <= MAX_CONSECUTIVE_FAILURES;
return withinWindowedCap && withinConsecutiveCap;
}
/**
* Call when a message is successfully processed to update the success timestamp.
* Resets the consecutive-failure counter (real progress was made).
*/
recordSuccess(): void {
this.lastSuccessfulProcessing = Date.now();
this.consecutiveFailures = 0;
}
/**
@@ -67,4 +85,18 @@ export class RestartGuard {
get maxRestarts(): number {
return MAX_WINDOWED_RESTARTS;
}
/**
* Get consecutive failures since last successful processing (for logging).
*/
get consecutiveFailuresSinceSuccess(): number {
return this.consecutiveFailures;
}
/**
* Get the max allowed consecutive failures (for logging).
*/
get maxConsecutiveFailures(): number {
return MAX_CONSECUTIVE_FAILURES;
}
}

View File

@@ -315,16 +315,34 @@ export class SessionRoutes extends BaseRouteHandler {
session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1; // Keep for logging
if (!restartAllowed) {
logger.error('SESSION', `CRITICAL: Restart guard tripped — too many restarts in window, stopping to prevent runaway costs`, {
logger.error('SESSION', `CRITICAL: Restart guard tripped — session is dead, draining pending messages and terminating`, {
sessionId: sessionDbId,
pendingCount,
restartsInWindow: session.restartGuard.restartsInWindow,
windowMs: session.restartGuard.windowMs,
maxRestarts: session.restartGuard.maxRestarts,
action: 'Generator will NOT restart. Check logs for root cause. Messages remain in pending state.'
consecutiveFailures: session.restartGuard.consecutiveFailuresSinceSuccess,
maxConsecutiveFailures: session.restartGuard.maxConsecutiveFailures,
action: 'Generator will NOT restart. Pending messages drained to abandoned. Check logs for root cause.'
});
// Don't restart - abort to prevent further API calls
// Don't restart - abort to prevent further API calls AND drain pending
// messages so the session doesn't reappear in getSessionsWithPendingMessages
// and trigger another auto-start cycle.
session.abortController.abort();
try {
const drained = pendingStore.transitionMessagesTo('abandoned', { sessionDbId });
if (drained > 0) {
logger.error('SESSION', 'Drained pending messages to abandoned after restart guard trip', {
sessionId: sessionDbId,
drained,
});
}
} catch (drainErr) {
const normalized = drainErr instanceof Error ? drainErr : new Error(String(drainErr));
logger.error('SESSION', 'Failed to drain pending messages after restart guard trip', {
sessionId: sessionDbId,
}, normalized);
}
return;
}
@@ -333,7 +351,9 @@ export class SessionRoutes extends BaseRouteHandler {
pendingCount,
consecutiveRestarts: session.consecutiveRestarts,
restartsInWindow: session.restartGuard!.restartsInWindow,
maxRestarts: session.restartGuard!.maxRestarts
maxRestarts: session.restartGuard!.maxRestarts,
consecutiveFailures: session.restartGuard!.consecutiveFailuresSinceSuccess,
maxConsecutiveFailures: session.restartGuard!.maxConsecutiveFailures
});
// Abort OLD controller before replacing to prevent child process leaks

View File

@@ -683,6 +683,20 @@ export function spawnSdkProcess(
windowsHide: true,
});
// ALWAYS attach an 'error' listener BEFORE any other code runs, regardless of
// whether the child has a PID. child_process.spawn emits 'error' asynchronously
// for ENOENT, EACCES, AbortSignal-driven aborts, etc. Without a listener these
// become uncaughtException — the cause of "The operation was aborted." escaping
// to the daemon during crash-recovery loops.
child.on('error', (err: Error) => {
logger.warn('SDK_SPAWN', `[session-${sessionDbId}] child emitted error event`, {
sessionDbId,
pid: child.pid,
errorName: err.name,
errorCode: (err as NodeJS.ErrnoException).code,
}, err);
});
if (!child.pid) {
logger.error('PROCESS', 'Spawn succeeded but produced no PID', { sessionDbId });
return null;