mirror of
https://github.com/thedotmack/claude-mem
synced 2026-04-25 17:15:04 +02:00
Systematic cleanup of every error handling anti-pattern detected by the automated scanner. 289 issues fixed via code changes, 12 approved with specific technical justifications. Changes across 90 files: - GENERIC_CATCH (141): Added instanceof Error type discrimination - LARGE_TRY_BLOCK (82): Extracted helper methods to narrow try scope to ≤10 lines - NO_LOGGING_IN_CATCH (65): Added logger/console calls for error visibility - CATCH_AND_CONTINUE_CRITICAL_PATH (10): Added throw/return or approved overrides - ERROR_STRING_MATCHING (2): Approved with rationale (no typed error classes) - ERROR_MESSAGE_GUESSING (1): Replaced chained .includes() with documented pattern array - PROMISE_CATCH_NO_LOGGING (1): Added logging to .catch() handler Also fixes a detector bug where nested try/catch inside a catch block corrupted brace-depth tracking, causing false positives. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
512 lines
16 KiB
TypeScript
512 lines
16 KiB
TypeScript
/**
|
|
* ProcessRegistry: Track spawned Claude subprocesses
|
|
*
|
|
* Fixes Issue #737: Claude haiku subprocesses don't terminate properly,
|
|
* causing zombie process accumulation (user reported 155 processes / 51GB RAM).
|
|
*
|
|
* Root causes:
|
|
* 1. SDK's SpawnedProcess interface hides subprocess PIDs
|
|
* 2. deleteSession() doesn't verify subprocess exit before cleanup
|
|
* 3. abort() is fire-and-forget with no confirmation
|
|
*
|
|
* Solution:
|
|
* - Use SDK's spawnClaudeCodeProcess option to capture PIDs
|
|
* - Track all spawned processes with session association
|
|
* - Verify exit on session deletion with timeout + SIGKILL escalation
|
|
 * - Safety net orphan reaper runs periodically (startOrphanReaper defaults to every 30 seconds)
|
|
*/
|
|
|
|
import { spawn, exec, ChildProcess } from 'child_process';
|
|
import { promisify } from 'util';
|
|
import { logger } from '../../utils/logger.js';
|
|
import { sanitizeEnv } from '../../supervisor/env-sanitizer.js';
|
|
import { getSupervisor } from '../../supervisor/index.js';
|
|
|
|
// Promise-returning wrapper around child_process.exec; used below to run `ps` pipelines.
const execAsync = promisify(exec);
|
|
|
|
/**
 * A spawned SDK subprocess paired with the live handle used to signal it.
 */
interface TrackedProcess {
  /** OS process id, taken from the registry record. */
  pid: number;
  /** Numeric id of the session this process was registered for. */
  sessionDbId: number;
  /** Start time in epoch milliseconds (parsed from the record's `startedAt` string). */
  spawnedAt: number;
  /** Runtime child-process handle used for exit checks and kill signals. */
  process: ChildProcess;
}
|
|
|
|
/**
 * Collect every registry record of type 'sdk' that still has a runtime
 * process handle, converted into the TrackedProcess shape.
 *
 * Records without a runtime handle are left out of the result.
 */
function getTrackedProcesses(): TrackedProcess[] {
  const tracked: TrackedProcess[] = [];

  for (const record of getSupervisor().getRegistry().getAll()) {
    if (record.type !== 'sdk') continue;

    const runtimeHandle = getSupervisor().getRegistry().getRuntimeProcess(record.id);
    if (!runtimeHandle) continue;

    tracked.push({
      pid: record.pid,
      sessionDbId: Number(record.sessionId),
      spawnedAt: Date.parse(record.startedAt),
      process: runtimeHandle
    });
  }

  return tracked;
}
|
|
|
|
/**
 * Record a freshly spawned SDK subprocess with the supervisor.
 *
 * The registry id has the form `sdk:<sessionDbId>:<pid>` and the start time
 * is stored as an ISO-8601 string.
 *
 * @param pid OS process id of the child
 * @param sessionDbId Session the child belongs to
 * @param process Live child-process handle kept alongside the record
 */
export function registerProcess(pid: number, sessionDbId: number, process: ChildProcess): void {
  const registryId = `sdk:${sessionDbId}:${pid}`;
  const startedAt = new Date().toISOString();

  getSupervisor().registerProcess(
    registryId,
    { pid, type: 'sdk', sessionId: sessionDbId, startedAt },
    process
  );

  logger.info('PROCESS', `Registered PID ${pid} for session ${sessionDbId}`, { pid, sessionDbId });
}
|
|
|
|
/**
 * Remove every 'sdk' registry record that matches `pid`, then wake one
 * pool waiter because a slot may have been freed.
 *
 * @param pid OS process id to drop from the registry
 */
export function unregisterProcess(pid: number): void {
  for (const record of getSupervisor().getRegistry().getByPid(pid)) {
    // Records of other types that share this PID are left alone.
    if (record.type !== 'sdk') continue;
    getSupervisor().unregisterProcess(record.id);
  }

  logger.debug('PROCESS', `Unregistered PID ${pid}`, { pid });

  // Notify waiters that a pool slot may be available
  notifySlotAvailable();
}
|
|
|
|
/**
 * Look up the tracked process for a session.
 *
 * When more than one process is tracked for the same session a warning is
 * logged (this indicates a race condition) and the first match is returned.
 *
 * @param sessionDbId Session to look up
 * @returns The first matching tracked process, or undefined when none exists
 */
export function getProcessBySession(sessionDbId: number): TrackedProcess | undefined {
  const candidates = getTrackedProcesses().filter(
    (candidate) => candidate.sessionDbId === sessionDbId
  );

  if (candidates.length <= 1) {
    return candidates[0];
  }

  logger.warn('PROCESS', `Multiple processes found for session ${sessionDbId}`, {
    count: candidates.length,
    pids: candidates.map((candidate) => candidate.pid)
  });
  return candidates[0];
}
|
|
|
|
/**
 * Count the registry records of type 'sdk'.
 *
 * Unlike getTrackedProcesses, records without a runtime handle are counted too.
 */
export function getActiveCount(): number {
  let sdkCount = 0;
  for (const record of getSupervisor().getRegistry().getAll()) {
    if (record.type === 'sdk') {
      sdkCount += 1;
    }
  }
  return sdkCount;
}
|
|
|
|
// FIFO queue of callbacks for callers parked in waitForSlot; one is invoked
// each time a process leaves the registry and may have freed a slot.
const slotWaiters: Array<() => void> = [];

/**
 * Wake the longest-waiting queued caller, if there is one.
 * Does nothing when the queue is empty.
 */
function notifySlotAvailable(): void {
  const nextWaiter = slotWaiters.shift();
  nextWaiter?.();
}
|
|
|
|
/**
 * Absolute ceiling on registered SDK processes. waitForSlot refuses to
 * proceed once the registry holds this many, whatever `maxConcurrent` is.
 */
const TOTAL_PROCESS_HARD_CAP = 10;

/**
 * Wait for a pool slot to become available (promise-based, not polling).
 *
 * Returns immediately when fewer than `maxConcurrent` processes are
 * registered. Otherwise the caller is queued and woken when a process is
 * unregistered; if the pool is still full at that point the caller goes back
 * into the queue and keeps waiting under the same deadline.
 *
 * @param maxConcurrent Max number of concurrent agents
 * @param timeoutMs Max time to wait before giving up
 * @throws Error immediately when the registry is at or above TOTAL_PROCESS_HARD_CAP
 * @throws Error (as a rejection) when no slot frees up within `timeoutMs`
 */
export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_000): Promise<void> {
  // Hard cap: refuse to spawn if too many processes exist regardless of pool accounting
  const activeCount = getActiveCount();
  if (activeCount >= TOTAL_PROCESS_HARD_CAP) {
    throw new Error(`Hard cap exceeded: ${activeCount} processes in registry (cap=${TOTAL_PROCESS_HARD_CAP}). Refusing to spawn more.`);
  }

  if (activeCount < maxConcurrent) return;

  logger.info('PROCESS', `Pool limit reached (${activeCount}/${maxConcurrent}), waiting for slot...`);

  return new Promise<void>((resolve, reject) => {
    const timeout = setTimeout(() => {
      // Leave the queue so a later notification is not spent on a dead waiter.
      const idx = slotWaiters.indexOf(onSlot);
      if (idx >= 0) slotWaiters.splice(idx, 1);
      reject(new Error(`Timed out waiting for agent pool slot after ${timeoutMs}ms`));
    }, timeoutMs);

    const onSlot = (): void => {
      if (getActiveCount() < maxConcurrent) {
        // Only cancel the deadline once a slot has actually been obtained.
        clearTimeout(timeout);
        resolve();
      } else {
        // Still full: re-queue and keep the original timeout armed, otherwise
        // this promise could stay pending forever.
        slotWaiters.push(onSlot);
      }
    };

    slotWaiters.push(onSlot);
  });
}
|
|
|
|
/**
 * Snapshot of tracked SDK processes for debugging.
 *
 * @returns One entry per tracked process with its pid, session id and age in
 *          milliseconds (current time minus `spawnedAt`)
 */
export function getActiveProcesses(): Array<{ pid: number; sessionDbId: number; ageMs: number }> {
  const snapshotTime = Date.now();
  const summaries: Array<{ pid: number; sessionDbId: number; ageMs: number }> = [];

  for (const { pid, sessionDbId, spawnedAt } of getTrackedProcesses()) {
    summaries.push({ pid, sessionDbId, ageMs: snapshotTime - spawnedAt });
  }

  return summaries;
}
|
|
|
|
/**
 * Resolve when `proc` emits 'exit' or after `timeoutMs`, whichever comes first.
 * The timer is cleared and the listener detached on the way out so neither
 * outlives the wait.
 */
function waitForExitEvent(proc: ChildProcess, timeoutMs: number): Promise<void> {
  return new Promise<void>((resolve) => {
    const onExit = (): void => {
      clearTimeout(timer);
      resolve();
    };
    const timer = setTimeout(() => {
      proc.off('exit', onExit);
      resolve();
    }, timeoutMs);
    proc.once('exit', onExit);
  });
}

/**
 * Wait for a process to exit with timeout, escalating to SIGKILL if needed.
 * Uses event-based waiting instead of polling to avoid CPU overhead.
 *
 * The process is unregistered in every case: after a confirmed exit, or after
 * the SIGKILL grace period has elapsed.
 *
 * @param tracked Process to wait on
 * @param timeoutMs How long to wait for a graceful exit before sending SIGKILL
 */
export async function ensureProcessExit(tracked: TrackedProcess, timeoutMs: number = 5000): Promise<void> {
  const { pid, process: proc } = tracked;

  // A terminated child has either an exit code (normal exit) or a signal code
  // (killed by a signal; exitCode stays null in that case). proc.killed is NOT
  // trusted: it only means Node sent a signal, the process can still be alive.
  // `!= null` keeps handles that lack a signalCode property counted as running.
  const hasExited = (): boolean => proc.exitCode !== null || proc.signalCode != null;

  if (hasExited()) {
    unregisterProcess(pid);
    return;
  }

  // Wait for graceful exit with timeout using event-based approach
  await waitForExitEvent(proc, timeoutMs);

  if (hasExited()) {
    unregisterProcess(pid);
    return;
  }

  // Timeout: escalate to SIGKILL
  logger.warn('PROCESS', `PID ${pid} did not exit after ${timeoutMs}ms, sending SIGKILL`, { pid, timeoutMs });
  try {
    proc.kill('SIGKILL');
  } catch {
    // Already dead
  }

  // Wait for SIGKILL to take effect — use exit event with 1s timeout instead of blind sleep
  await waitForExitEvent(proc, 1000);
  unregisterProcess(pid);
}
|
|
|
|
/** Minimum elapsed run time, in whole minutes, before an idle daemon child is killed. */
const IDLE_CHILD_MIN_AGE_MINUTES = 1;

/**
 * Convert a `ps` etime value to whole minutes (the seconds field is dropped).
 * Accepted formats: MM:SS, HH:MM:SS, D-HH:MM:SS. Anything else yields 0.
 */
function parseEtimeMinutes(etime: string): number {
  const match = etime.match(/^(?:(?:(\d+)-)?(\d+):)?(\d+):(\d+)$/);
  if (!match) return 0;

  const days = parseInt(match[1] ?? '0', 10);
  const hours = parseInt(match[2] ?? '0', 10);
  const minutes = parseInt(match[3] ?? '0', 10);
  return days * 24 * 60 + hours * 60 + minutes;
}

/**
 * Kill idle daemon children (claude processes spawned by worker-service)
 *
 * These are SDK-spawned claude processes that completed their work but
 * didn't terminate properly. They remain as children of the worker-service
 * daemon, consuming memory without doing useful work.
 *
 * Criteria for cleanup:
 * - Process name is "claude"
 * - Parent PID is the worker-service daemon (this process)
 * - `ps` reports 0% CPU for the process
 * - Process has been running for at least IDLE_CHILD_MIN_AGE_MINUTES minute(s)
 *
 * Always returns 0 on Windows, and also when the `ps` command fails.
 *
 * @returns Number of processes a SIGKILL was successfully sent to
 */
async function killIdleDaemonChildren(): Promise<number> {
  if (process.platform === 'win32') {
    // Windows: Different process model, skip for now
    return 0;
  }

  const daemonPid = process.pid;

  let stdout: string;
  try {
    // `|| true` keeps the pipeline's exit status 0 when grep finds nothing,
    // so a rejection here means the command itself failed.
    ({ stdout } = await execAsync(
      'ps -eo pid,ppid,%cpu,etime,comm 2>/dev/null | grep "claude$" || true'
    ));
  } catch (error: unknown) {
    if (error instanceof Error) {
      logger.warn('PROCESS', 'Failed to list daemon children via ps', {}, error);
    } else {
      logger.warn('PROCESS', 'Failed to list daemon children via ps', { rawError: String(error) });
    }
    return 0;
  }

  let killed = 0;

  for (const line of stdout.trim().split('\n')) {
    if (!line) continue;

    const parts = line.trim().split(/\s+/);
    if (parts.length < 5) continue;

    const [pidStr, ppidStr, cpuStr, etime] = parts;
    const pid = parseInt(pidStr, 10);
    const ppid = parseInt(ppidStr, 10);
    const cpu = parseFloat(cpuStr);

    // Never hand process.kill a malformed or non-positive PID: 0 and negative
    // values address process groups rather than a single process.
    if (!Number.isInteger(pid) || pid <= 0) continue;

    // Skip if not a child of this daemon
    if (ppid !== daemonPid) continue;

    // Skip if actively using CPU, or if the CPU column could not be parsed
    if (Number.isNaN(cpu) || cpu > 0) continue;

    const minutes = parseEtimeMinutes(etime);
    if (minutes < IDLE_CHILD_MIN_AGE_MINUTES) continue;

    logger.info('PROCESS', `Killing idle daemon child PID ${pid} (idle ${minutes}m)`, { pid, minutes });
    try {
      process.kill(pid, 'SIGKILL');
      killed++;
    } catch {
      // Already dead or permission denied
      logger.debug('PROCESS', `Could not kill idle daemon child PID ${pid}`, { pid });
    }
  }

  return killed;
}
|
|
|
|
/**
 * Kill system-level orphans: Claude processes whose parent PID is 1 on Unix,
 * i.e. whose original parent died unexpectedly.
 *
 * Candidates are `ps` rows whose command line matches "claude.*haiku" or
 * "claude.*output-format". Returns 0 on Windows, and also when the command
 * fails or finds no matching rows.
 *
 * @returns Number of processes a SIGKILL was successfully sent to
 */
async function killSystemOrphans(): Promise<number> {
  if (process.platform === 'win32') {
    return 0; // Windows doesn't have ppid=1 orphan concept
  }

  try {
    const { stdout } = await execAsync(
      'ps -eo pid,ppid,args 2>/dev/null | grep -E "claude.*haiku|claude.*output-format" | grep -v grep'
    );

    let reaped = 0;
    const rows = stdout.trim().split('\n');

    for (const row of rows) {
      if (!row) continue;

      // The first two whitespace-separated numeric columns are pid and ppid.
      const columns = row.trim().match(/^(\d+)\s+(\d+)/);
      if (!columns) continue;

      const parentPid = parseInt(columns[2], 10);
      if (parentPid !== 1) continue; // only ppid=1 counts as an orphan

      const orphanPid = parseInt(columns[1], 10);
      logger.warn('PROCESS', `Killing system orphan PID ${orphanPid}`, { pid: orphanPid });
      try {
        process.kill(orphanPid, 'SIGKILL');
        reaped++;
      } catch {
        // Already dead or permission denied
      }
    }

    return reaped;
  } catch {
    return 0; // No matches or error
  }
}
|
|
|
|
/**
 * Reap orphaned processes in three passes:
 *  1. registry records of type 'sdk' whose session is not in `activeSessionIds`
 *  2. system-level orphans (killSystemOrphans)
 *  3. idle children of this daemon (killIdleDaemonChildren)
 *
 * Every registry record handled in pass 1 is unregistered and one pool waiter
 * is notified, whether or not the kill call succeeded.
 *
 * @param activeSessionIds Sessions that are still alive; their processes are left untouched
 * @returns Total number of kills counted across the three passes
 */
export async function reapOrphanedProcesses(activeSessionIds: Set<number>): Promise<number> {
  let registryKills = 0;

  // Registry-based: kill processes for dead sessions
  const sdkRecords = getSupervisor().getRegistry().getAll().filter(entry => entry.type === 'sdk');
  for (const record of sdkRecords) {
    const { pid } = record;
    const sessionDbId = Number(record.sessionId);
    const runtimeHandle = getSupervisor().getRegistry().getRuntimeProcess(record.id);

    if (!activeSessionIds.has(sessionDbId)) {
      logger.warn('PROCESS', `Killing orphan PID ${pid} (session ${sessionDbId} gone)`, { pid, sessionDbId });
      try {
        // Prefer the live handle; fall back to a raw signal by PID.
        if (runtimeHandle) {
          runtimeHandle.kill('SIGKILL');
        } else {
          process.kill(pid, 'SIGKILL');
        }
        registryKills++;
      } catch {
        // Already dead
      }
      getSupervisor().unregisterProcess(record.id);
      notifySlotAvailable();
    }
  }

  // System-level: find ppid=1 orphans
  const systemKills = await killSystemOrphans();

  // Daemon children: find idle SDK processes that didn't terminate
  const idleKills = await killIdleDaemonChildren();

  return registryKills + systemKills + idleKills;
}
|
|
|
|
/**
 * Build a spawn function for the SDK's spawnClaudeCodeProcess option that
 * records the PID of every subprocess it creates.
 *
 * The returned function:
 *  - sends SIGTERM to any process still tracked for this session
 *  - asks the supervisor whether spawning is allowed
 *  - drops empty-string args together with the flag preceding them
 *  - spawns the command (through cmd.exe for `.cmd` files on Windows)
 *  - registers the child's PID and unregisters it on 'exit'
 *
 * NOTE: Session isolation is handled via the `cwd` option in SDKAgent.ts,
 * NOT via CLAUDE_CONFIG_DIR (which breaks authentication).
 *
 * @param sessionDbId Session that every spawned process is associated with
 * @returns Spawn function returning the SDK-compatible process interface
 */
export function createPidCapturingSpawn(sessionDbId: number) {
  return (spawnOptions: {
    command: string;
    args: string[];
    cwd?: string;
    env?: NodeJS.ProcessEnv;
    signal?: AbortSignal;
  }) => {
    // Kill any existing process for this session before spawning a new one.
    // Multiple processes sharing the same --resume UUID waste API credits and
    // can conflict with each other (Issue #1590).
    const duplicate = getProcessBySession(sessionDbId);
    if (duplicate && duplicate.process.exitCode === null) {
      logger.warn('PROCESS', `Killing duplicate process PID ${duplicate.pid} before spawning new one for session ${sessionDbId}`, {
        existingPid: duplicate.pid,
        sessionDbId
      });

      let alreadyGone = false;
      try {
        duplicate.process.kill('SIGTERM');
        alreadyGone = duplicate.process.exitCode !== null;
      } catch (error: unknown) {
        // Already dead — safe to unregister immediately
        if (error instanceof Error) {
          logger.warn('WORKER', `Failed to kill duplicate process PID ${duplicate.pid}, likely already dead`, { existingPid: duplicate.pid, sessionDbId }, error);
        }
        alreadyGone = true;
      }

      if (alreadyGone) {
        unregisterProcess(duplicate.pid);
      }
      // If still alive, the 'exit' handler attached when that process was
      // spawned will unregister it.
    }

    getSupervisor().assertCanSpawn('claude sdk');

    // On Windows, use cmd.exe wrapper for .cmd files to properly handle paths with spaces
    const useCmdWrapper = process.platform === 'win32' && spawnOptions.command.endsWith('.cmd');
    const env = sanitizeEnv(spawnOptions.env ?? process.env);

    // Filter empty string args AND their preceding flag (Issue #2049).
    // The Agent SDK emits ["--setting-sources", ""] when settingSources defaults to [].
    // Simply dropping "" leaves an orphan --setting-sources that consumes the next
    // flag (e.g. --permission-mode) as its value, crashing Claude Code 2.1.109+ with
    // "Invalid setting source: --permission-mode". Drop the flag too so the SDK
    // default (no setting sources) is preserved by omission.
    const cleanedArgs: string[] = [];
    for (const current of spawnOptions.args) {
      if (current !== '') {
        cleanedArgs.push(current);
        continue;
      }
      const previous = cleanedArgs[cleanedArgs.length - 1];
      if (previous !== undefined && previous.startsWith('--')) {
        cleanedArgs.pop();
      }
    }

    const executable = useCmdWrapper ? 'cmd.exe' : spawnOptions.command;
    const argv = useCmdWrapper
      ? ['/d', '/c', spawnOptions.command, ...cleanedArgs]
      : cleanedArgs;

    const child = spawn(executable, argv, {
      cwd: spawnOptions.cwd,
      env,
      stdio: ['pipe', 'pipe', 'pipe'],
      signal: spawnOptions.signal, // CRITICAL: Pass signal for AbortController integration
      windowsHide: true
    });

    // Capture stderr for debugging spawn failures
    if (child.stderr) {
      child.stderr.on('data', (data: Buffer) => {
        logger.debug('SDK_SPAWN', `[session-${sessionDbId}] stderr: ${data.toString().trim()}`);
      });
    }

    // Register PID (skipped when the child has no pid)
    const childPid = child.pid;
    if (childPid) {
      registerProcess(childPid, sessionDbId, child);

      // Auto-unregister on exit
      child.on('exit', (code: number | null, signal: string | null) => {
        if (code !== 0) {
          logger.warn('SDK_SPAWN', `[session-${sessionDbId}] Claude process exited`, { code, signal, pid: childPid });
        }
        unregisterProcess(childPid);
      });
    }

    // Return SDK-compatible interface; getters expose the child's live state
    return {
      stdin: child.stdin,
      stdout: child.stdout,
      stderr: child.stderr,
      get killed() { return child.killed; },
      get exitCode() { return child.exitCode; },
      kill: child.kill.bind(child),
      on: child.on.bind(child),
      once: child.once.bind(child),
      off: child.off.bind(child)
    };
  };
}
|
|
|
|
/**
 * Start the periodic orphan reaper.
 *
 * On every tick the current set of active session ids is fetched and passed to
 * reapOrphanedProcesses. Errors from a pass are logged and never propagate.
 *
 * @param getActiveSessionIds Supplies the sessions whose processes must be kept
 * @param intervalMs Delay between passes (defaults to 30 seconds)
 * @returns Cleanup function that stops the interval
 */
export function startOrphanReaper(getActiveSessionIds: () => Set<number>, intervalMs: number = 30 * 1000): () => void {
  const runReaperPass = async (): Promise<void> => {
    try {
      const reapedCount = await reapOrphanedProcesses(getActiveSessionIds());
      if (reapedCount > 0) {
        logger.info('PROCESS', `Reaper cleaned up ${reapedCount} orphaned processes`, { killed: reapedCount });
      }
    } catch (error) {
      if (!(error instanceof Error)) {
        logger.error('WORKER', 'Reaper error', { rawError: String(error) });
        return;
      }
      logger.error('WORKER', 'Reaper error', {}, error);
    }
  };

  const timer = setInterval(() => {
    void runReaperPass();
  }, intervalMs);

  // Return cleanup function
  return () => {
    clearInterval(timer);
  };
}
|