mirror of
https://github.com/thedotmack/claude-mem
synced 2026-04-25 17:15:04 +02:00
fix: stop runaway crash-recovery loop on dead sessions
Two distinct bugs were combining to keep a dead session restarting forever: Bug 1 (uncaught "The operation was aborted."): child_process.spawn emits 'error' asynchronously for ENOENT/EACCES/abort signal aborts. spawnSdkProcess() never attached an 'error' listener, so any async spawn failure became uncaughtException and escaped to the daemon-level handler. Attach an 'error' listener immediately after spawn, before the !child.pid early-return, so async spawn errors are logged (with errno code) and swallowed locally. Bug 2 (sliding-window limiter never trips on slow restart cadence): RestartGuard tripped only when restartTimestamps.length exceeded MAX_WINDOWED_RESTARTS (10) within RESTART_WINDOW_MS (60s). With the 8s exponential-backoff cap, only ~7-8 restarts fit in the window, so a dead session that fails and restarts on 8s cycles would loop forever (consecutiveRestarts climbing past 30+ in observed logs). Add a consecutiveFailures counter that increments on every restart and resets only on recordSuccess(). Trip when consecutive failures exceed MAX_CONSECUTIVE_FAILURES (5) — meaning 5 restarts with zero successful processing in between prove the session is dead. Both guards now run in parallel: tight loops still trip the windowed cap; slow loops trip the consecutive-failure cap. Also: when the SessionRoutes path trips the guard, drain pending messages to 'abandoned' so the session does not reappear in getSessionsWithPendingMessages and trigger another auto-start cycle. The worker-service.ts path already does this via terminateSession. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -819,12 +819,14 @@ export class WorkerService implements WorkerRef {
|
||||
session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1; // Keep for logging
|
||||
|
||||
if (!restartAllowed) {
|
||||
logger.error('SYSTEM', 'Restart guard tripped: too many restarts in window, stopping to prevent runaway costs', {
|
||||
logger.error('SYSTEM', 'Restart guard tripped: session is dead, terminating', {
|
||||
sessionId: session.sessionDbId,
|
||||
pendingCount,
|
||||
restartsInWindow: session.restartGuard.restartsInWindow,
|
||||
windowMs: session.restartGuard.windowMs,
|
||||
maxRestarts: session.restartGuard.maxRestarts
|
||||
maxRestarts: session.restartGuard.maxRestarts,
|
||||
consecutiveFailures: session.restartGuard.consecutiveFailuresSinceSuccess,
|
||||
maxConsecutiveFailures: session.restartGuard.maxConsecutiveFailures
|
||||
});
|
||||
session.consecutiveRestarts = 0;
|
||||
this.terminateSession(session.sessionDbId, 'max_restarts_exceeded');
|
||||
|
||||
@@ -3,15 +3,26 @@
|
||||
* Prevents tight-loop restarts (bug) while allowing legitimate occasional restarts
|
||||
* over long sessions. Replaces the flat consecutiveRestarts counter that stranded
|
||||
* pending messages after just 3 restarts over any timeframe (#2053).
|
||||
*
|
||||
* TWO INDEPENDENT TRIPS:
|
||||
* 1. Sliding window: more than MAX_WINDOWED_RESTARTS within RESTART_WINDOW_MS.
|
||||
* Catches genuinely tight loops (e.g. crash every <6s).
|
||||
* 2. Consecutive failures: more than MAX_CONSECUTIVE_FAILURES restarts with
|
||||
* NO successful processing in between. Catches dead sessions that
|
||||
* fail-restart-fail-restart on a slow exponential backoff cadence
|
||||
* (e.g. 8s backoff cap + spawn failures = restartsInWindow stays under
|
||||
* the windowed cap forever, but the session is clearly dead).
|
||||
*/
|
||||
|
||||
const RESTART_WINDOW_MS = 60_000; // Only count restarts within last 60 seconds
|
||||
const MAX_WINDOWED_RESTARTS = 10; // 10 restarts in 60s = runaway loop
|
||||
const MAX_CONSECUTIVE_FAILURES = 5; // 5 restarts with no success in between = session is dead
|
||||
const DECAY_AFTER_SUCCESS_MS = 5 * 60_000; // Clear history after 5min of uninterrupted success
|
||||
|
||||
export class RestartGuard {
|
||||
private restartTimestamps: number[] = [];
|
||||
private lastSuccessfulProcessing: number | null = null;
|
||||
private consecutiveFailures: number = 0;
|
||||
|
||||
/**
|
||||
* Record a restart and check if the guard should trip.
|
||||
@@ -34,16 +45,23 @@ export class RestartGuard {
|
||||
|
||||
// Record this restart
|
||||
this.restartTimestamps.push(now);
|
||||
this.consecutiveFailures += 1;
|
||||
|
||||
// Check if we've exceeded the cap within the window
|
||||
return this.restartTimestamps.length <= MAX_WINDOWED_RESTARTS;
|
||||
// Trip if EITHER guard exceeds its limit:
|
||||
// - Sliding window cap (tight loops)
|
||||
// - Consecutive failures with no successful work (dead session, e.g. spawn always fails)
|
||||
const withinWindowedCap = this.restartTimestamps.length <= MAX_WINDOWED_RESTARTS;
|
||||
const withinConsecutiveCap = this.consecutiveFailures <= MAX_CONSECUTIVE_FAILURES;
|
||||
return withinWindowedCap && withinConsecutiveCap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Call when a message is successfully processed to update the success timestamp.
|
||||
* Resets the consecutive-failure counter (real progress was made).
|
||||
*/
|
||||
recordSuccess(): void {
|
||||
this.lastSuccessfulProcessing = Date.now();
|
||||
this.consecutiveFailures = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -67,4 +85,18 @@ export class RestartGuard {
|
||||
get maxRestarts(): number {
|
||||
return MAX_WINDOWED_RESTARTS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get consecutive failures since last successful processing (for logging).
|
||||
*/
|
||||
get consecutiveFailuresSinceSuccess(): number {
|
||||
return this.consecutiveFailures;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the max allowed consecutive failures (for logging).
|
||||
*/
|
||||
get maxConsecutiveFailures(): number {
|
||||
return MAX_CONSECUTIVE_FAILURES;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -315,16 +315,34 @@ export class SessionRoutes extends BaseRouteHandler {
|
||||
session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1; // Keep for logging
|
||||
|
||||
if (!restartAllowed) {
|
||||
logger.error('SESSION', `CRITICAL: Restart guard tripped — too many restarts in window, stopping to prevent runaway costs`, {
|
||||
logger.error('SESSION', `CRITICAL: Restart guard tripped — session is dead, draining pending messages and terminating`, {
|
||||
sessionId: sessionDbId,
|
||||
pendingCount,
|
||||
restartsInWindow: session.restartGuard.restartsInWindow,
|
||||
windowMs: session.restartGuard.windowMs,
|
||||
maxRestarts: session.restartGuard.maxRestarts,
|
||||
action: 'Generator will NOT restart. Check logs for root cause. Messages remain in pending state.'
|
||||
consecutiveFailures: session.restartGuard.consecutiveFailuresSinceSuccess,
|
||||
maxConsecutiveFailures: session.restartGuard.maxConsecutiveFailures,
|
||||
action: 'Generator will NOT restart. Pending messages drained to abandoned. Check logs for root cause.'
|
||||
});
|
||||
// Don't restart - abort to prevent further API calls
|
||||
// Don't restart - abort to prevent further API calls AND drain pending
|
||||
// messages so the session doesn't reappear in getSessionsWithPendingMessages
|
||||
// and trigger another auto-start cycle.
|
||||
session.abortController.abort();
|
||||
try {
|
||||
const drained = pendingStore.transitionMessagesTo('abandoned', { sessionDbId });
|
||||
if (drained > 0) {
|
||||
logger.error('SESSION', 'Drained pending messages to abandoned after restart guard trip', {
|
||||
sessionId: sessionDbId,
|
||||
drained,
|
||||
});
|
||||
}
|
||||
} catch (drainErr) {
|
||||
const normalized = drainErr instanceof Error ? drainErr : new Error(String(drainErr));
|
||||
logger.error('SESSION', 'Failed to drain pending messages after restart guard trip', {
|
||||
sessionId: sessionDbId,
|
||||
}, normalized);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -333,7 +351,9 @@ export class SessionRoutes extends BaseRouteHandler {
|
||||
pendingCount,
|
||||
consecutiveRestarts: session.consecutiveRestarts,
|
||||
restartsInWindow: session.restartGuard!.restartsInWindow,
|
||||
maxRestarts: session.restartGuard!.maxRestarts
|
||||
maxRestarts: session.restartGuard!.maxRestarts,
|
||||
consecutiveFailures: session.restartGuard!.consecutiveFailuresSinceSuccess,
|
||||
maxConsecutiveFailures: session.restartGuard!.maxConsecutiveFailures
|
||||
});
|
||||
|
||||
// Abort OLD controller before replacing to prevent child process leaks
|
||||
|
||||
@@ -683,6 +683,20 @@ export function spawnSdkProcess(
|
||||
windowsHide: true,
|
||||
});
|
||||
|
||||
// ALWAYS attach an 'error' listener BEFORE any other code runs, regardless of
|
||||
// whether the child has a PID. child_process.spawn emits 'error' asynchronously
|
||||
// for ENOENT, EACCES, AbortSignal-driven aborts, etc. Without a listener these
|
||||
// become uncaughtException — the cause of "The operation was aborted." escaping
|
||||
// to the daemon during crash-recovery loops.
|
||||
child.on('error', (err: Error) => {
|
||||
logger.warn('SDK_SPAWN', `[session-${sessionDbId}] child emitted error event`, {
|
||||
sessionDbId,
|
||||
pid: child.pid,
|
||||
errorName: err.name,
|
||||
errorCode: (err as NodeJS.ErrnoException).code,
|
||||
}, err);
|
||||
});
|
||||
|
||||
if (!child.pid) {
|
||||
logger.error('PROCESS', 'Spawn succeeded but produced no PID', { sessionDbId });
|
||||
return null;
|
||||
|
||||
Reference in New Issue
Block a user