fix: stop runaway crash-recovery loop on dead sessions

Two distinct bugs were combining to keep a dead session restarting forever:

Bug 1 (uncaught "The operation was aborted."):
  child_process.spawn emits 'error' asynchronously for ENOENT/EACCES/abort
  signal aborts. spawnSdkProcess() never attached an 'error' listener, so
  any async spawn failure became uncaughtException and escaped to the
  daemon-level handler. Attach an 'error' listener immediately after spawn,
  before the !child.pid early-return, so async spawn errors are logged
  (with errno code) and swallowed locally.

Bug 2 (sliding-window limiter never trips on slow restart cadence):
  RestartGuard tripped only when restartTimestamps.length exceeded
  MAX_WINDOWED_RESTARTS (10) within RESTART_WINDOW_MS (60s). With the 8s
  exponential-backoff cap, only ~7-8 restarts fit in the window, so a dead
  session that fails and restarts on 8s cycles would loop forever
  (consecutiveRestarts climbing past 30+ in observed logs). Add a
  consecutiveFailures counter that increments on every restart and resets
  only on recordSuccess(). Trip when consecutive failures exceed
  MAX_CONSECUTIVE_FAILURES (5) — meaning 5 restarts with zero successful
  processing in between proves the session is dead. Both guards now run in
  parallel: tight loops still trip the windowed cap; slow loops trip the
  consecutive-failure cap.

Also: when the SessionRoutes path trips the guard, drain pending messages
to 'abandoned' so the session does not reappear in
getSessionsWithPendingMessages and trigger another auto-start cycle. The
worker-service.ts path already does this via terminateSession.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-04-24 16:55:47 -07:00
parent 02201e64af
commit 43a7ab8964
4 changed files with 76 additions and 8 deletions

View File

@@ -819,12 +819,14 @@ export class WorkerService implements WorkerRef {
session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1; // Keep for logging
if (!restartAllowed) {
logger.error('SYSTEM', 'Restart guard tripped: too many restarts in window, stopping to prevent runaway costs', {
logger.error('SYSTEM', 'Restart guard tripped: session is dead, terminating', {
sessionId: session.sessionDbId,
pendingCount,
restartsInWindow: session.restartGuard.restartsInWindow,
windowMs: session.restartGuard.windowMs,
maxRestarts: session.restartGuard.maxRestarts
maxRestarts: session.restartGuard.maxRestarts,
consecutiveFailures: session.restartGuard.consecutiveFailuresSinceSuccess,
maxConsecutiveFailures: session.restartGuard.maxConsecutiveFailures
});
session.consecutiveRestarts = 0;
this.terminateSession(session.sessionDbId, 'max_restarts_exceeded');

View File

@@ -3,15 +3,26 @@
* Prevents tight-loop restarts (bug) while allowing legitimate occasional restarts
* over long sessions. Replaces the flat consecutiveRestarts counter that stranded
* pending messages after just 3 restarts over any timeframe (#2053).
*
* TWO INDEPENDENT TRIPS:
* 1. Sliding window: more than MAX_WINDOWED_RESTARTS within RESTART_WINDOW_MS.
* Catches genuinely tight loops (e.g. crash every <6s).
* 2. Consecutive failures: more than MAX_CONSECUTIVE_FAILURES restarts with
* NO successful processing in between. Catches dead sessions that
 *    repeatedly fail and restart on a slow exponential backoff cadence
* (e.g. 8s backoff cap + spawn failures = restartsInWindow stays under
* the windowed cap forever, but the session is clearly dead).
*/
const RESTART_WINDOW_MS = 60_000; // Only count restarts within last 60 seconds
const MAX_WINDOWED_RESTARTS = 10; // 10 restarts in 60s = runaway loop
const MAX_CONSECUTIVE_FAILURES = 5; // 5 restarts with no success in between = session is dead
const DECAY_AFTER_SUCCESS_MS = 5 * 60_000; // Clear history after 5min of uninterrupted success
export class RestartGuard {
private restartTimestamps: number[] = [];
private lastSuccessfulProcessing: number | null = null;
private consecutiveFailures: number = 0;
/**
* Record a restart and check if the guard should trip.
@@ -34,16 +45,23 @@ export class RestartGuard {
// Record this restart
this.restartTimestamps.push(now);
this.consecutiveFailures += 1;
// Check if we've exceeded the cap within the window
return this.restartTimestamps.length <= MAX_WINDOWED_RESTARTS;
// Trip if EITHER guard exceeds its limit:
// - Sliding window cap (tight loops)
// - Consecutive failures with no successful work (dead session, e.g. spawn always fails)
const withinWindowedCap = this.restartTimestamps.length <= MAX_WINDOWED_RESTARTS;
const withinConsecutiveCap = this.consecutiveFailures <= MAX_CONSECUTIVE_FAILURES;
return withinWindowedCap && withinConsecutiveCap;
}
/**
* Call when a message is successfully processed to update the success timestamp.
* Resets the consecutive-failure counter (real progress was made).
*/
recordSuccess(): void {
this.lastSuccessfulProcessing = Date.now();
this.consecutiveFailures = 0;
}
/**
@@ -67,4 +85,18 @@ export class RestartGuard {
get maxRestarts(): number {
return MAX_WINDOWED_RESTARTS;
}
/**
* Get consecutive failures since last successful processing (for logging).
*/
get consecutiveFailuresSinceSuccess(): number {
return this.consecutiveFailures;
}
/**
* Get the max allowed consecutive failures (for logging).
*/
get maxConsecutiveFailures(): number {
return MAX_CONSECUTIVE_FAILURES;
}
}

View File

@@ -315,16 +315,34 @@ export class SessionRoutes extends BaseRouteHandler {
session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1; // Keep for logging
if (!restartAllowed) {
logger.error('SESSION', `CRITICAL: Restart guard tripped — too many restarts in window, stopping to prevent runaway costs`, {
logger.error('SESSION', `CRITICAL: Restart guard tripped — session is dead, draining pending messages and terminating`, {
sessionId: sessionDbId,
pendingCount,
restartsInWindow: session.restartGuard.restartsInWindow,
windowMs: session.restartGuard.windowMs,
maxRestarts: session.restartGuard.maxRestarts,
action: 'Generator will NOT restart. Check logs for root cause. Messages remain in pending state.'
consecutiveFailures: session.restartGuard.consecutiveFailuresSinceSuccess,
maxConsecutiveFailures: session.restartGuard.maxConsecutiveFailures,
action: 'Generator will NOT restart. Pending messages drained to abandoned. Check logs for root cause.'
});
// Don't restart - abort to prevent further API calls
// Don't restart - abort to prevent further API calls AND drain pending
// messages so the session doesn't reappear in getSessionsWithPendingMessages
// and trigger another auto-start cycle.
session.abortController.abort();
try {
const drained = pendingStore.transitionMessagesTo('abandoned', { sessionDbId });
if (drained > 0) {
logger.error('SESSION', 'Drained pending messages to abandoned after restart guard trip', {
sessionId: sessionDbId,
drained,
});
}
} catch (drainErr) {
const normalized = drainErr instanceof Error ? drainErr : new Error(String(drainErr));
logger.error('SESSION', 'Failed to drain pending messages after restart guard trip', {
sessionId: sessionDbId,
}, normalized);
}
return;
}
@@ -333,7 +351,9 @@ export class SessionRoutes extends BaseRouteHandler {
pendingCount,
consecutiveRestarts: session.consecutiveRestarts,
restartsInWindow: session.restartGuard!.restartsInWindow,
maxRestarts: session.restartGuard!.maxRestarts
maxRestarts: session.restartGuard!.maxRestarts,
consecutiveFailures: session.restartGuard!.consecutiveFailuresSinceSuccess,
maxConsecutiveFailures: session.restartGuard!.maxConsecutiveFailures
});
// Abort OLD controller before replacing to prevent child process leaks

View File

@@ -683,6 +683,20 @@ export function spawnSdkProcess(
windowsHide: true,
});
// ALWAYS attach an 'error' listener BEFORE any other code runs, regardless of
// whether the child has a PID. child_process.spawn emits 'error' asynchronously
// for ENOENT, EACCES, AbortSignal-driven aborts, etc. Without a listener these
// become uncaughtException — the cause of "The operation was aborted." escaping
// to the daemon during crash-recovery loops.
child.on('error', (err: Error) => {
logger.warn('SDK_SPAWN', `[session-${sessionDbId}] child emitted error event`, {
sessionDbId,
pid: child.pid,
errorName: err.name,
errorCode: (err as NodeJS.ErrnoException).code,
}, err);
});
if (!child.pid) {
logger.error('PROCESS', 'Spawn succeeded but produced no PID', { sessionDbId });
return null;