Files
claude-mem/src/services/worker/ProcessRegistry.ts
Alex Newman a0dd516cd5 fix: resolve all 301 error handling anti-patterns across codebase
Systematic cleanup of every error handling anti-pattern detected by the
automated scanner. 289 issues fixed via code changes, 12 approved with
specific technical justifications.

Changes across 90 files:
- GENERIC_CATCH (141): Added instanceof Error type discrimination
- LARGE_TRY_BLOCK (82): Extracted helper methods to narrow try scope to ≤10 lines
- NO_LOGGING_IN_CATCH (65): Added logger/console calls for error visibility
- CATCH_AND_CONTINUE_CRITICAL_PATH (10): Added throw/return or approved overrides
- ERROR_STRING_MATCHING (2): Approved with rationale (no typed error classes)
- ERROR_MESSAGE_GUESSING (1): Replaced chained .includes() with documented pattern array
- PROMISE_CATCH_NO_LOGGING (1): Added logging to .catch() handler

Also fixes a detector bug where nested try/catch inside a catch block
corrupted brace-depth tracking, causing false positives.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-19 19:57:00 -07:00

512 lines
16 KiB
TypeScript

/**
* ProcessRegistry: Track spawned Claude subprocesses
*
* Fixes Issue #737: Claude haiku subprocesses don't terminate properly,
* causing zombie process accumulation (user reported 155 processes / 51GB RAM).
*
* Root causes:
* 1. SDK's SpawnedProcess interface hides subprocess PIDs
* 2. deleteSession() doesn't verify subprocess exit before cleanup
* 3. abort() is fire-and-forget with no confirmation
*
* Solution:
* - Use SDK's spawnClaudeCodeProcess option to capture PIDs
* - Track all spawned processes with session association
* - Verify exit on session deletion with timeout + SIGKILL escalation
* - Safety net orphan reaper runs every 5 minutes
*/
import { spawn, exec, ChildProcess } from 'child_process';
import { promisify } from 'util';
import { logger } from '../../utils/logger.js';
import { sanitizeEnv } from '../../supervisor/env-sanitizer.js';
import { getSupervisor } from '../../supervisor/index.js';
const execAsync = promisify(exec);
interface TrackedProcess {
pid: number;
sessionDbId: number;
spawnedAt: number;
process: ChildProcess;
}
function getTrackedProcesses(): TrackedProcess[] {
return getSupervisor().getRegistry()
.getAll()
.filter(record => record.type === 'sdk')
.map((record) => {
const processRef = getSupervisor().getRegistry().getRuntimeProcess(record.id);
if (!processRef) {
return null;
}
return {
pid: record.pid,
sessionDbId: Number(record.sessionId),
spawnedAt: Date.parse(record.startedAt),
process: processRef
};
})
.filter((value): value is TrackedProcess => value !== null);
}
/**
* Register a spawned process in the registry
*/
export function registerProcess(pid: number, sessionDbId: number, process: ChildProcess): void {
getSupervisor().registerProcess(`sdk:${sessionDbId}:${pid}`, {
pid,
type: 'sdk',
sessionId: sessionDbId,
startedAt: new Date().toISOString()
}, process);
logger.info('PROCESS', `Registered PID ${pid} for session ${sessionDbId}`, { pid, sessionDbId });
}
/**
* Unregister a process from the registry and notify pool waiters
*/
export function unregisterProcess(pid: number): void {
for (const record of getSupervisor().getRegistry().getByPid(pid)) {
if (record.type === 'sdk') {
getSupervisor().unregisterProcess(record.id);
}
}
logger.debug('PROCESS', `Unregistered PID ${pid}`, { pid });
// Notify waiters that a pool slot may be available
notifySlotAvailable();
}
/**
* Get process info by session ID
* Warns if multiple processes found (indicates race condition)
*/
export function getProcessBySession(sessionDbId: number): TrackedProcess | undefined {
const matches = getTrackedProcesses().filter(info => info.sessionDbId === sessionDbId);
if (matches.length > 1) {
logger.warn('PROCESS', `Multiple processes found for session ${sessionDbId}`, {
count: matches.length,
pids: matches.map(m => m.pid)
});
}
return matches[0];
}
/**
* Get count of active processes in the registry
*/
export function getActiveCount(): number {
return getSupervisor().getRegistry().getAll().filter(record => record.type === 'sdk').length;
}
// Waiters for pool slots - resolved when a process exits and frees a slot
const slotWaiters: Array<() => void> = [];
/**
* Notify waiters that a slot has freed up
*/
function notifySlotAvailable(): void {
const waiter = slotWaiters.shift();
if (waiter) waiter();
}
/**
* Wait for a pool slot to become available (promise-based, not polling)
* @param maxConcurrent Max number of concurrent agents
* @param timeoutMs Max time to wait before giving up
*/
const TOTAL_PROCESS_HARD_CAP = 10;
export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_000): Promise<void> {
// Hard cap: refuse to spawn if too many processes exist regardless of pool accounting
const activeCount = getActiveCount();
if (activeCount >= TOTAL_PROCESS_HARD_CAP) {
throw new Error(`Hard cap exceeded: ${activeCount} processes in registry (cap=${TOTAL_PROCESS_HARD_CAP}). Refusing to spawn more.`);
}
if (activeCount < maxConcurrent) return;
logger.info('PROCESS', `Pool limit reached (${activeCount}/${maxConcurrent}), waiting for slot...`);
return new Promise<void>((resolve, reject) => {
const timeout = setTimeout(() => {
const idx = slotWaiters.indexOf(onSlot);
if (idx >= 0) slotWaiters.splice(idx, 1);
reject(new Error(`Timed out waiting for agent pool slot after ${timeoutMs}ms`));
}, timeoutMs);
const onSlot = () => {
clearTimeout(timeout);
if (getActiveCount() < maxConcurrent) {
resolve();
} else {
// Still full, re-queue
slotWaiters.push(onSlot);
}
};
slotWaiters.push(onSlot);
});
}
/**
* Get all active PIDs (for debugging)
*/
export function getActiveProcesses(): Array<{ pid: number; sessionDbId: number; ageMs: number }> {
const now = Date.now();
return getTrackedProcesses().map(info => ({
pid: info.pid,
sessionDbId: info.sessionDbId,
ageMs: now - info.spawnedAt
}));
}
/**
* Wait for a process to exit with timeout, escalating to SIGKILL if needed
* Uses event-based waiting instead of polling to avoid CPU overhead
*/
export async function ensureProcessExit(tracked: TrackedProcess, timeoutMs: number = 5000): Promise<void> {
const { pid, process: proc } = tracked;
// Already exited? Only trust exitCode, NOT proc.killed
// proc.killed only means Node sent a signal — the process can still be alive
if (proc.exitCode !== null) {
unregisterProcess(pid);
return;
}
// Wait for graceful exit with timeout using event-based approach
const exitPromise = new Promise<void>((resolve) => {
proc.once('exit', () => resolve());
});
const timeoutPromise = new Promise<void>((resolve) => {
setTimeout(resolve, timeoutMs);
});
await Promise.race([exitPromise, timeoutPromise]);
// Check if exited gracefully — only trust exitCode
if (proc.exitCode !== null) {
unregisterProcess(pid);
return;
}
// Timeout: escalate to SIGKILL
logger.warn('PROCESS', `PID ${pid} did not exit after ${timeoutMs}ms, sending SIGKILL`, { pid, timeoutMs });
try {
proc.kill('SIGKILL');
} catch {
// Already dead
}
// Wait for SIGKILL to take effect — use exit event with 1s timeout instead of blind sleep
const sigkillExitPromise = new Promise<void>((resolve) => {
proc.once('exit', () => resolve());
});
const sigkillTimeout = new Promise<void>((resolve) => {
setTimeout(resolve, 1000);
});
await Promise.race([sigkillExitPromise, sigkillTimeout]);
unregisterProcess(pid);
}
/**
* Kill idle daemon children (claude processes spawned by worker-service)
*
* These are SDK-spawned claude processes that completed their work but
* didn't terminate properly. They remain as children of the worker-service
* daemon, consuming memory without doing useful work.
*
* Criteria for cleanup:
* - Process name is "claude"
* - Parent PID is the worker-service daemon (this process)
* - Process has 0% CPU (idle)
* - Process has been running for more than 2 minutes
*/
async function killIdleDaemonChildren(): Promise<number> {
if (process.platform === 'win32') {
// Windows: Different process model, skip for now
return 0;
}
const daemonPid = process.pid;
let killed = 0;
try {
const { stdout } = await execAsync(
'ps -eo pid,ppid,%cpu,etime,comm 2>/dev/null | grep "claude$" || true'
);
for (const line of stdout.trim().split('\n')) {
if (!line) continue;
const parts = line.trim().split(/\s+/);
if (parts.length < 5) continue;
const [pidStr, ppidStr, cpuStr, etime] = parts;
const pid = parseInt(pidStr, 10);
const ppid = parseInt(ppidStr, 10);
const cpu = parseFloat(cpuStr);
// Skip if not a child of this daemon
if (ppid !== daemonPid) continue;
// Skip if actively using CPU
if (cpu > 0) continue;
// Parse elapsed time to minutes
// Formats: MM:SS, HH:MM:SS, D-HH:MM:SS
let minutes = 0;
const dayMatch = etime.match(/^(\d+)-(\d+):(\d+):(\d+)$/);
const hourMatch = etime.match(/^(\d+):(\d+):(\d+)$/);
const minMatch = etime.match(/^(\d+):(\d+)$/);
if (dayMatch) {
minutes = parseInt(dayMatch[1], 10) * 24 * 60 +
parseInt(dayMatch[2], 10) * 60 +
parseInt(dayMatch[3], 10);
} else if (hourMatch) {
minutes = parseInt(hourMatch[1], 10) * 60 +
parseInt(hourMatch[2], 10);
} else if (minMatch) {
minutes = parseInt(minMatch[1], 10);
}
// Kill if idle for more than 1 minute
if (minutes >= 1) {
logger.info('PROCESS', `Killing idle daemon child PID ${pid} (idle ${minutes}m)`, { pid, minutes });
try {
process.kill(pid, 'SIGKILL');
killed++;
} catch {
// Already dead or permission denied
}
}
}
} catch {
// No matches or command error
}
return killed;
}
/**
* Kill system-level orphans (ppid=1 on Unix)
* These are Claude processes whose parent died unexpectedly
*/
async function killSystemOrphans(): Promise<number> {
if (process.platform === 'win32') {
return 0; // Windows doesn't have ppid=1 orphan concept
}
try {
const { stdout } = await execAsync(
'ps -eo pid,ppid,args 2>/dev/null | grep -E "claude.*haiku|claude.*output-format" | grep -v grep'
);
let killed = 0;
for (const line of stdout.trim().split('\n')) {
if (!line) continue;
const match = line.trim().match(/^(\d+)\s+(\d+)/);
if (match && parseInt(match[2]) === 1) { // ppid=1 = orphan
const orphanPid = parseInt(match[1]);
logger.warn('PROCESS', `Killing system orphan PID ${orphanPid}`, { pid: orphanPid });
try {
process.kill(orphanPid, 'SIGKILL');
killed++;
} catch {
// Already dead or permission denied
}
}
}
return killed;
} catch {
return 0; // No matches or error
}
}
/**
* Reap orphaned processes - both registry-tracked and system-level
*/
export async function reapOrphanedProcesses(activeSessionIds: Set<number>): Promise<number> {
let killed = 0;
// Registry-based: kill processes for dead sessions
for (const record of getSupervisor().getRegistry().getAll().filter(entry => entry.type === 'sdk')) {
const pid = record.pid;
const sessionDbId = Number(record.sessionId);
const processRef = getSupervisor().getRegistry().getRuntimeProcess(record.id);
if (activeSessionIds.has(sessionDbId)) continue; // Active = safe
logger.warn('PROCESS', `Killing orphan PID ${pid} (session ${sessionDbId} gone)`, { pid, sessionDbId });
try {
if (processRef) {
processRef.kill('SIGKILL');
} else {
process.kill(pid, 'SIGKILL');
}
killed++;
} catch {
// Already dead
}
getSupervisor().unregisterProcess(record.id);
notifySlotAvailable();
}
// System-level: find ppid=1 orphans
killed += await killSystemOrphans();
// Daemon children: find idle SDK processes that didn't terminate
killed += await killIdleDaemonChildren();
return killed;
}
/**
* Create a custom spawn function for SDK that captures PIDs
*
* The SDK's spawnClaudeCodeProcess option allows us to intercept subprocess
* creation and capture the PID before the SDK hides it.
*
* NOTE: Session isolation is handled via the `cwd` option in SDKAgent.ts,
* NOT via CLAUDE_CONFIG_DIR (which breaks authentication).
*/
export function createPidCapturingSpawn(sessionDbId: number) {
return (spawnOptions: {
command: string;
args: string[];
cwd?: string;
env?: NodeJS.ProcessEnv;
signal?: AbortSignal;
}) => {
// Kill any existing process for this session before spawning a new one.
// Multiple processes sharing the same --resume UUID waste API credits and
// can conflict with each other (Issue #1590).
const existing = getProcessBySession(sessionDbId);
if (existing && existing.process.exitCode === null) {
logger.warn('PROCESS', `Killing duplicate process PID ${existing.pid} before spawning new one for session ${sessionDbId}`, {
existingPid: existing.pid,
sessionDbId
});
let exited = false;
try {
existing.process.kill('SIGTERM');
exited = existing.process.exitCode !== null;
} catch (error: unknown) {
// Already dead — safe to unregister immediately
if (error instanceof Error) {
logger.warn('WORKER', `Failed to kill duplicate process PID ${existing.pid}, likely already dead`, { existingPid: existing.pid, sessionDbId }, error);
}
exited = true;
}
if (exited) {
unregisterProcess(existing.pid);
}
// If still alive, the 'exit' handler (line ~440) will unregister it.
}
getSupervisor().assertCanSpawn('claude sdk');
// On Windows, use cmd.exe wrapper for .cmd files to properly handle paths with spaces
const useCmdWrapper = process.platform === 'win32' && spawnOptions.command.endsWith('.cmd');
const env = sanitizeEnv(spawnOptions.env ?? process.env);
// Filter empty string args AND their preceding flag (Issue #2049).
// The Agent SDK emits ["--setting-sources", ""] when settingSources defaults to [].
// Simply dropping "" leaves an orphan --setting-sources that consumes the next
// flag (e.g. --permission-mode) as its value, crashing Claude Code 2.1.109+ with
// "Invalid setting source: --permission-mode". Drop the flag too so the SDK
// default (no setting sources) is preserved by omission.
const args: string[] = [];
for (const arg of spawnOptions.args) {
if (arg === '') {
if (args.length > 0 && args[args.length - 1].startsWith('--')) {
args.pop();
}
continue;
}
args.push(arg);
}
const child = useCmdWrapper
? spawn('cmd.exe', ['/d', '/c', spawnOptions.command, ...args], {
cwd: spawnOptions.cwd,
env,
stdio: ['pipe', 'pipe', 'pipe'],
signal: spawnOptions.signal,
windowsHide: true
})
: spawn(spawnOptions.command, args, {
cwd: spawnOptions.cwd,
env,
stdio: ['pipe', 'pipe', 'pipe'],
signal: spawnOptions.signal, // CRITICAL: Pass signal for AbortController integration
windowsHide: true
});
// Capture stderr for debugging spawn failures
if (child.stderr) {
child.stderr.on('data', (data: Buffer) => {
logger.debug('SDK_SPAWN', `[session-${sessionDbId}] stderr: ${data.toString().trim()}`);
});
}
// Register PID
if (child.pid) {
registerProcess(child.pid, sessionDbId, child);
// Auto-unregister on exit
child.on('exit', (code: number | null, signal: string | null) => {
if (code !== 0) {
logger.warn('SDK_SPAWN', `[session-${sessionDbId}] Claude process exited`, { code, signal, pid: child.pid });
}
if (child.pid) {
unregisterProcess(child.pid);
}
});
}
// Return SDK-compatible interface
return {
stdin: child.stdin,
stdout: child.stdout,
stderr: child.stderr,
get killed() { return child.killed; },
get exitCode() { return child.exitCode; },
kill: child.kill.bind(child),
on: child.on.bind(child),
once: child.once.bind(child),
off: child.off.bind(child)
};
};
}
/**
* Start the orphan reaper interval
* Returns cleanup function to stop the interval
*/
export function startOrphanReaper(getActiveSessionIds: () => Set<number>, intervalMs: number = 30 * 1000): () => void {
const interval = setInterval(async () => {
try {
const activeIds = getActiveSessionIds();
const killed = await reapOrphanedProcesses(activeIds);
if (killed > 0) {
logger.info('PROCESS', `Reaper cleaned up ${killed} orphaned processes`, { killed });
}
} catch (error) {
if (error instanceof Error) {
logger.error('WORKER', 'Reaper error', {}, error);
} else {
logger.error('WORKER', 'Reaper error', { rawError: String(error) });
}
}
}, intervalMs);
// Return cleanup function
return () => clearInterval(interval);
}