fix: hook resilience and worker lifecycle improvements (#957, #923, #984, #987, #1042)

Reduce timeouts to eliminate 10-30s startup delay when worker is dead
(common on WSL2 after hibernate). Add stale PID detection, graceful
error handling across all handlers, and error classification that
distinguishes worker unavailability from handler bugs.

- HEALTH_CHECK 30s→3s, new POST_SPAWN_WAIT (5s), PORT_IN_USE_WAIT (3s)
- isProcessAlive() with EPERM handling, cleanStalePidFile()
- getPluginVersion() try-catch for shutdown race (#1042)
- isWorkerUnavailableError: transport+5xx+429→exit 0, 4xx→exit 2
- No-op handler for unknown event types (#984)
- Wrap all handler fetch calls in try-catch for graceful degradation
- CLAUDE_MEM_HEALTH_TIMEOUT_MS env var override with validation
This commit is contained in:
Rod Boev
2026-02-10 15:34:35 -05:00
parent 6ac5507e4e
commit 418e38ee46
16 changed files with 791 additions and 348 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -9,6 +9,7 @@ import type { EventHandler, NormalizedHookInput, HookResult } from '../types.js'
import { ensureWorkerRunning, getWorkerPort } from '../../shared/worker-utils.js';
import { getProjectContext } from '../../utils/project-name.js';
import { HOOK_EXIT_CODES } from '../../shared/hook-constants.js';
import { logger } from '../../utils/logger.js';
export const contextHandler: EventHandler = {
async execute(input: NormalizedHookInput): Promise<HookResult> {
@@ -35,20 +36,34 @@ export const contextHandler: EventHandler = {
// Note: Removed AbortSignal.timeout due to Windows Bun cleanup issue (libuv assertion)
// Worker service has its own timeouts, so client-side timeout is redundant
const response = await fetch(url);
try {
const response = await fetch(url);
if (!response.ok) {
throw new Error(`Context generation failed: ${response.status}`);
}
const result = await response.text();
const additionalContext = result.trim();
return {
hookSpecificOutput: {
hookEventName: 'SessionStart',
additionalContext
if (!response.ok) {
// Log but don't throw — context fetch failure should not block session start
logger.warn('HOOK', 'Context generation failed, returning empty', { status: response.status });
return {
hookSpecificOutput: { hookEventName: 'SessionStart', additionalContext: '' },
exitCode: HOOK_EXIT_CODES.SUCCESS
};
}
};
const result = await response.text();
const additionalContext = result.trim();
return {
hookSpecificOutput: {
hookEventName: 'SessionStart',
additionalContext
}
};
} catch (error) {
// Worker unreachable — return empty context gracefully
logger.warn('HOOK', 'Context fetch error, returning empty', { error: error instanceof Error ? error.message : String(error) });
return {
hookSpecificOutput: { hookEventName: 'SessionStart', additionalContext: '' },
exitCode: HOOK_EXIT_CODES.SUCCESS
};
}
}
};

View File

@@ -39,25 +39,33 @@ export const fileEditHandler: EventHandler = {
// Send to worker as an observation with file edit metadata
// The observation handler on the worker will process this appropriately
const response = await fetch(`http://127.0.0.1:${port}/api/sessions/observations`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
contentSessionId: sessionId,
tool_name: 'write_file',
tool_input: { filePath, edits },
tool_response: { success: true },
cwd
})
// Note: Removed signal to avoid Windows Bun cleanup issue (libuv assertion)
});
try {
const response = await fetch(`http://127.0.0.1:${port}/api/sessions/observations`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
contentSessionId: sessionId,
tool_name: 'write_file',
tool_input: { filePath, edits },
tool_response: { success: true },
cwd
})
// Note: Removed signal to avoid Windows Bun cleanup issue (libuv assertion)
});
if (!response.ok) {
throw new Error(`File edit observation storage failed: ${response.status}`);
if (!response.ok) {
// Log but don't throw — file edit observation failure should not block editing
logger.warn('HOOK', 'File edit observation storage failed, skipping', { status: response.status, filePath });
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
logger.debug('HOOK', 'File edit observation sent successfully', { filePath });
} catch (error) {
// Worker unreachable — skip file edit observation gracefully
logger.warn('HOOK', 'File edit observation fetch error, skipping', { error: error instanceof Error ? error.message : String(error) });
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
logger.debug('HOOK', 'File edit observation sent successfully', { filePath });
return { continue: true, suppressOutput: true };
}
};

View File

@@ -5,6 +5,7 @@
*/
import type { EventHandler } from '../types.js';
import { HOOK_EXIT_CODES } from '../../shared/hook-constants.js';
import { contextHandler } from './context.js';
import { sessionInitHandler } from './session-init.js';
import { observationHandler } from './observation.js';
@@ -35,14 +36,22 @@ const handlers: Record<EventType, EventHandler> = {
/**
* Get the event handler for a given event type.
*
* Returns a no-op handler for unknown event types instead of throwing (fix #984).
* Claude Code may send new event types that the plugin doesn't handle yet —
* throwing would surface as a BLOCKING_ERROR to the user.
*
* @param eventType The type of event to handle
* @returns The appropriate EventHandler
* @throws Error if event type is not recognized
* @returns The appropriate EventHandler, or a no-op handler for unknown types
*/
export function getEventHandler(eventType: EventType): EventHandler {
const handler = handlers[eventType];
export function getEventHandler(eventType: string): EventHandler {
const handler = handlers[eventType as EventType];
if (!handler) {
throw new Error(`Unknown event type: ${eventType}`);
console.error(`[claude-mem] Unknown event type: ${eventType}, returning no-op`);
return {
async execute() {
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
};
}
return handler;
}

View File

@@ -48,25 +48,33 @@ export const observationHandler: EventHandler = {
}
// Send to worker - worker handles privacy check and database operations
const response = await fetch(`http://127.0.0.1:${port}/api/sessions/observations`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
contentSessionId: sessionId,
tool_name: toolName,
tool_input: toolInput,
tool_response: toolResponse,
cwd
})
// Note: Removed signal to avoid Windows Bun cleanup issue (libuv assertion)
});
try {
const response = await fetch(`http://127.0.0.1:${port}/api/sessions/observations`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
contentSessionId: sessionId,
tool_name: toolName,
tool_input: toolInput,
tool_response: toolResponse,
cwd
})
// Note: Removed signal to avoid Windows Bun cleanup issue (libuv assertion)
});
if (!response.ok) {
throw new Error(`Observation storage failed: ${response.status}`);
if (!response.ok) {
// Log but don't throw — observation storage failure should not block tool use
logger.warn('HOOK', 'Observation storage failed, skipping', { status: response.status, toolName });
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
logger.debug('HOOK', 'Observation sent successfully', { toolName });
} catch (error) {
// Worker unreachable — skip observation gracefully
logger.warn('HOOK', 'Observation fetch error, skipping', { error: error instanceof Error ? error.message : String(error) });
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
logger.debug('HOOK', 'Observation sent successfully', { toolName });
return { continue: true, suppressOutput: true };
}
};

View File

@@ -16,7 +16,11 @@ import { logger } from '../../utils/logger.js';
export const sessionCompleteHandler: EventHandler = {
async execute(input: NormalizedHookInput): Promise<HookResult> {
// Ensure worker is running
await ensureWorkerRunning();
const workerReady = await ensureWorkerRunning();
if (!workerReady) {
// Worker not available — skip session completion gracefully
return { continue: true, suppressOutput: true };
}
const { sessionId } = input;
const port = getWorkerPort();

View File

@@ -13,37 +13,46 @@ import { HOOK_EXIT_CODES } from '../../shared/hook-constants.js';
export const userMessageHandler: EventHandler = {
async execute(input: NormalizedHookInput): Promise<HookResult> {
// Ensure worker is running
await ensureWorkerRunning();
const workerReady = await ensureWorkerRunning();
if (!workerReady) {
// Worker not available — skip user message gracefully
return { exitCode: HOOK_EXIT_CODES.SUCCESS };
}
const port = getWorkerPort();
const project = basename(input.cwd ?? process.cwd());
// Fetch formatted context directly from worker API
// Note: Removed AbortSignal.timeout to avoid Windows Bun cleanup issue (libuv assertion)
const response = await fetch(
`http://127.0.0.1:${port}/api/context/inject?project=${encodeURIComponent(project)}&colors=true`,
{ method: 'GET' }
);
try {
const response = await fetch(
`http://127.0.0.1:${port}/api/context/inject?project=${encodeURIComponent(project)}&colors=true`,
{ method: 'GET' }
);
if (!response.ok) {
// Don't throw - context fetch failure should not block the user's prompt
return { exitCode: HOOK_EXIT_CODES.SUCCESS };
if (!response.ok) {
// Don't throw - context fetch failure should not block the user's prompt
return { exitCode: HOOK_EXIT_CODES.SUCCESS };
}
const output = await response.text();
// Write to stderr for user visibility
// Note: Using process.stderr.write instead of console.error to avoid
// Claude Code treating this as a hook error. The actual hook output
// goes to stdout via hook-command.ts JSON serialization.
process.stderr.write(
"\n\n" + String.fromCodePoint(0x1F4DD) + " Claude-Mem Context Loaded\n\n" +
output +
"\n\n" + String.fromCodePoint(0x1F4A1) + " Wrap any message with <private> ... </private> to prevent storing sensitive information.\n" +
"\n" + String.fromCodePoint(0x1F4AC) + " Community https://discord.gg/J4wttp9vDu" +
`\n` + String.fromCodePoint(0x1F4FA) + ` Watch live in browser http://localhost:${port}/\n`
);
} catch (error) {
// Worker unreachable — skip user message gracefully
// User message context error is non-critical — skip gracefully
}
const output = await response.text();
// Write to stderr for user visibility
// Note: Using process.stderr.write instead of console.error to avoid
// Claude Code treating this as a hook error. The actual hook output
// goes to stdout via hook-command.ts JSON serialization.
process.stderr.write(
"\n\n" + String.fromCodePoint(0x1F4DD) + " Claude-Mem Context Loaded\n\n" +
output +
"\n\n" + String.fromCodePoint(0x1F4A1) + " Wrap any message with <private> ... </private> to prevent storing sensitive information.\n" +
"\n" + String.fromCodePoint(0x1F4AC) + " Community https://discord.gg/J4wttp9vDu" +
`\n` + String.fromCodePoint(0x1F4FA) + ` Watch live in browser http://localhost:${port}/\n`
);
return { exitCode: HOOK_EXIT_CODES.SUCCESS };
}
};

View File

@@ -8,6 +8,61 @@ export interface HookCommandOptions {
skipExit?: boolean;
}
/**
 * Classify whether an error indicates the worker is unavailable (graceful degradation)
 * vs a handler/client bug (blocking error that developers need to see).
 *
 * Classification is based on the error message, matched case-insensitively,
 * with checks applied in the order listed below.
 *
 * Returns true (hookCommand exits 0 — graceful degradation):
 * - Transport failures: ECONNREFUSED, ECONNRESET, EPIPE, ETIMEDOUT, ENOTFOUND,
 *   ECONNABORTED, ENETUNREACH, EHOSTUNREACH, "fetch failed", "unable to connect",
 *   "socket hang up"
 * - Timeout errors: "timed out", "timeout"
 * - Server errors: HTTP 5xx status codes
 * - Rate limiting: HTTP 429 (transient, not a bug)
 *
 * Returns false (hookCommand exits 2 — blocking error, handler/client bug):
 * - Other HTTP 4xx status codes (bad request, not found, validation error)
 * - Programming errors (TypeError, ReferenceError, SyntaxError) whose message
 *   does not match a transport/timeout pattern
 * - All other unexpected errors
 *
 * HTTP statuses are only recognised in the forms "failed: NNN" and
 * "status: NNN" / "status NNN", and only as exactly three digits, so a message
 * such as "failed: 5000 records" is not mistaken for a 5xx response.
 *
 * @param error The thrown value; non-Error values are stringified before matching
 * @returns true when the hook should degrade gracefully, false when the error should block
 */
export function isWorkerUnavailableError(error: unknown): boolean {
  const message = (error instanceof Error ? error.message : String(error)).toLowerCase();

  // Transport failures — worker unreachable
  const transportPatterns = [
    'econnrefused',
    'econnreset',
    'epipe',
    'etimedout',
    'enotfound',
    'econnaborted',
    'enetunreach',
    'ehostunreach',
    'fetch failed',
    'unable to connect',
    'socket hang up',
  ];
  if (transportPatterns.some(pattern => message.includes(pattern))) return true;

  // Timeout errors — worker didn't respond in time
  if (message.includes('timed out') || message.includes('timeout')) return true;

  // HTTP 5xx (worker has internal problems) or 429 (rate limit, transient).
  // The (?!\d) lookahead rejects longer numbers such as "failed: 5000".
  const transientStatus = /(?:failed:\s*|status[:\s]+)(?:5\d{2}|429)(?!\d)/;
  if (transientStatus.test(message)) return true;

  // Everything else — remaining HTTP 4xx, programming errors, unknown errors —
  // is treated as blocking (conservative: surface bugs to developers).
  return false;
}
export async function hookCommand(platform: string, event: string, options: HookCommandOptions = {}): Promise<number> {
try {
const adapter = getPlatformAdapter(platform);
@@ -26,9 +81,17 @@ export async function hookCommand(platform: string, event: string, options: Hook
}
return exitCode;
} catch (error) {
if (isWorkerUnavailableError(error)) {
// Worker unavailable — degrade gracefully, don't block the user
console.error(`[claude-mem] Worker unavailable, skipping hook: ${error instanceof Error ? error.message : error}`);
if (!options.skipExit) {
process.exit(HOOK_EXIT_CODES.SUCCESS); // = 0 (graceful)
}
return HOOK_EXIT_CODES.SUCCESS;
}
// Handler/client bug — show as blocking error so developers see it
console.error(`Hook error: ${error}`);
// Use exit code 2 (blocking error) so users see the error message
// Exit code 1 only shows in verbose mode per Claude Code docs
if (!options.skipExit) {
process.exit(HOOK_EXIT_CODES.BLOCKING_ERROR); // = 2
}

View File

@@ -77,7 +77,11 @@ export function removePidFile(): void {
}
/**
* Get platform-adjusted timeout (Windows socket cleanup is slower)
* Get platform-adjusted timeout for worker-side socket operations (2.0x on Windows).
*
* Note: Two platform multiplier functions exist intentionally:
* - getTimeout() in hook-constants.ts uses 1.5x for hook-side operations (fast path)
* - getPlatformTimeout() here uses 2.0x for worker-side socket operations (slower path)
*/
export function getPlatformTimeout(baseMs: number): number {
const WINDOWS_MULTIPLIER = 2.0;
@@ -398,6 +402,56 @@ export function spawnDaemon(
return child.pid;
}
/**
 * Report whether a process with the given PID currently exists.
 *
 * Probes with process.kill(pid, 0): signal 0 delivers nothing and only
 * performs the existence/permission check.
 *
 * - PID 0 is the Windows WMIC sentinel (spawned, PID unknown) and counts as alive.
 * - Negative or non-integer PIDs are never alive.
 * - EPERM from the probe means the process exists but belongs to another
 *   user/session, so it counts as alive.
 * - Any other probe failure (e.g. ESRCH, no such process) counts as dead.
 *
 * @param pid Process ID to probe
 * @returns true if the process is considered alive, false otherwise
 */
export function isProcessAlive(pid: number): boolean {
  // Sentinel: process was spawned but its PID could not be determined
  if (pid === 0) return true;

  const isUsablePid = Number.isInteger(pid) && pid > 0;
  if (!isUsablePid) return false;

  try {
    process.kill(pid, 0);
  } catch (error: unknown) {
    // Only EPERM proves the process exists; everything else means it is gone
    return (error as NodeJS.ErrnoException).code === 'EPERM';
  }
  return true;
}
/**
 * Delete the PID file when the process it records is no longer alive.
 *
 * Costs one PID-file read plus one signal-0 liveness probe. Does nothing when
 * there is no PID file or when the recorded process is still alive. Intended
 * to clear leftovers from ungraceful worker deaths (WSL2 hibernate, OOM kill).
 */
export function cleanStalePidFile(): void {
  const recorded = readPidFile();
  if (!recorded || isProcessAlive(recorded.pid)) return;

  const { pid, port, startedAt } = recorded;
  logger.info('SYSTEM', 'Removing stale PID file (worker process is dead)', {
    pid,
    port,
    startedAt
  });
  removePidFile();
}
/**
* Create signal handler factory for graceful shutdown
* Returns a handler function that can be passed to process.on('SIGTERM') etc.

View File

@@ -14,6 +14,7 @@ import { existsSync, writeFileSync, unlinkSync, statSync } from 'fs';
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
import { getWorkerPort, getWorkerHost } from '../shared/worker-utils.js';
import { HOOK_TIMEOUTS } from '../shared/hook-constants.js';
import { SettingsDefaultsManager } from '../shared/SettingsDefaultsManager.js';
import { logger } from '../utils/logger.js';
@@ -66,6 +67,7 @@ import {
removePidFile,
getPlatformTimeout,
cleanupOrphanedProcesses,
cleanStalePidFile,
spawnDaemon,
createSignalHandler
} from './infrastructure/ProcessManager.js';
@@ -746,6 +748,9 @@ export class WorkerService {
* @returns true if worker is healthy (existing or newly started), false on failure
*/
async function ensureWorkerStarted(port: number): Promise<boolean> {
// Clean stale PID file first (cheap: 1 fs read + 1 signal-0 check)
cleanStalePidFile();
// Check if worker is already running and healthy
if (await waitForHealth(port, 1000)) {
const versionCheck = await checkVersionMatch(port);
@@ -756,7 +761,7 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
});
await httpShutdown(port);
const freed = await waitForPortFree(port, getPlatformTimeout(15000));
const freed = await waitForPortFree(port, getPlatformTimeout(HOOK_TIMEOUTS.PORT_IN_USE_WAIT));
if (!freed) {
logger.error('SYSTEM', 'Port did not free up after shutdown for version mismatch restart', { port });
return false;
@@ -772,7 +777,7 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
const portInUse = await isPortInUse(port);
if (portInUse) {
logger.info('SYSTEM', 'Port in use, waiting for worker to become healthy');
const healthy = await waitForHealth(port, getPlatformTimeout(15000));
const healthy = await waitForHealth(port, getPlatformTimeout(HOOK_TIMEOUTS.PORT_IN_USE_WAIT));
if (healthy) {
logger.info('SYSTEM', 'Worker is now healthy');
return true;
@@ -799,7 +804,7 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
// PID file is written by the worker itself after listen() succeeds
// This is race-free and works correctly on Windows where cmd.exe PID is useless
const healthy = await waitForHealth(port, getPlatformTimeout(30000));
const healthy = await waitForHealth(port, getPlatformTimeout(HOOK_TIMEOUTS.POST_SPAWN_WAIT));
if (!healthy) {
removePidFile();
logger.error('SYSTEM', 'Worker failed to start (health check timeout)');
@@ -871,7 +876,7 @@ async function main() {
// PID file is written by the worker itself after listen() succeeds
// This is race-free and works correctly on Windows where cmd.exe PID is useless
const healthy = await waitForHealth(port, getPlatformTimeout(30000));
const healthy = await waitForHealth(port, getPlatformTimeout(HOOK_TIMEOUTS.POST_SPAWN_WAIT));
if (!healthy) {
removePidFile();
logger.error('SYSTEM', 'Worker failed to restart');

View File

@@ -1,11 +1,12 @@
export const HOOK_TIMEOUTS = {
DEFAULT: 300000, // Standard HTTP timeout (5 min for slow systems)
HEALTH_CHECK: 30000, // Worker health check (30s for slow systems)
HEALTH_CHECK: 3000, // Worker health check (3s — healthy worker responds in <100ms)
POST_SPAWN_WAIT: 5000, // Wait for daemon to start after spawn (starts in <1s on Linux)
PORT_IN_USE_WAIT: 3000, // Wait when port occupied but health failing
WORKER_STARTUP_WAIT: 1000,
WORKER_STARTUP_RETRIES: 300,
PRE_RESTART_SETTLE_DELAY: 2000, // Give files time to sync before restart
POWERSHELL_COMMAND: 10000, // PowerShell process enumeration (10s - typically completes in <1s)
WINDOWS_MULTIPLIER: 1.5 // Platform-specific adjustment
WINDOWS_MULTIPLIER: 1.5 // Platform-specific adjustment for hook-side operations
} as const;
/**

View File

@@ -6,7 +6,21 @@ import { SettingsDefaultsManager } from "./SettingsDefaultsManager.js";
import { MARKETPLACE_ROOT } from "./paths.js";
// Named constants for health checks
const HEALTH_CHECK_TIMEOUT_MS = getTimeout(HOOK_TIMEOUTS.HEALTH_CHECK);
// Health check timeout in milliseconds.
// Users on slow systems can override it via the environment
// (e.g., CLAUDE_MEM_HEALTH_TIMEOUT_MS=10000); values outside 500–300000 ms or
// non-numeric values are rejected with a single warning and the default is used.
const HEALTH_CHECK_TIMEOUT_MS = (() => {
  const MIN_MS = 500;
  const MAX_MS = 300000;

  const raw = process.env.CLAUDE_MEM_HEALTH_TIMEOUT_MS;
  // Unset or empty — silently use the platform-adjusted default
  if (!raw) return getTimeout(HOOK_TIMEOUTS.HEALTH_CHECK);

  const requestedMs = parseInt(raw, 10);
  const withinBounds = Number.isFinite(requestedMs) && requestedMs >= MIN_MS && requestedMs <= MAX_MS;
  if (withinBounds) return requestedMs;

  // Invalid env var — log once (module load) and fall back to the default
  logger.warn('SYSTEM', 'Invalid CLAUDE_MEM_HEALTH_TIMEOUT_MS, using default', {
    value: raw, min: MIN_MS, max: MAX_MS
  });
  return getTimeout(HOOK_TIMEOUTS.HEALTH_CHECK);
})();
/**
* Fetch with a timeout using Promise.race instead of AbortSignal.
@@ -89,12 +103,22 @@ async function isWorkerHealthy(): Promise<boolean> {
}
/**
* Get the current plugin version from package.json
* Get the current plugin version from package.json.
* Returns 'unknown' on ENOENT/EBUSY (shutdown race condition, fix #1042).
*/
function getPluginVersion(): string {
const packageJsonPath = path.join(MARKETPLACE_ROOT, 'package.json');
const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));
return packageJson.version;
try {
const packageJsonPath = path.join(MARKETPLACE_ROOT, 'package.json');
const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));
return packageJson.version;
} catch (error: unknown) {
const code = (error as NodeJS.ErrnoException).code;
if (code === 'ENOENT' || code === 'EBUSY') {
logger.debug('SYSTEM', 'Could not read plugin version (shutdown race)', { code });
return 'unknown';
}
throw error;
}
}
/**
@@ -115,18 +139,33 @@ async function getWorkerVersion(): Promise<string> {
/**
* Check if worker version matches plugin version
* Note: Auto-restart on version mismatch is now handled in worker-service.ts start command (issue #484)
* This function logs for informational purposes only
* This function logs for informational purposes only.
* Skips comparison when either version is 'unknown' (fix #1042 — avoids restart loops).
*/
async function checkWorkerVersion(): Promise<void> {
const pluginVersion = getPluginVersion();
const workerVersion = await getWorkerVersion();
try {
const pluginVersion = getPluginVersion();
if (pluginVersion !== workerVersion) {
// Just log debug info - auto-restart handles the mismatch in worker-service.ts
logger.debug('SYSTEM', 'Version check', {
pluginVersion,
workerVersion,
note: 'Mismatch will be auto-restarted by worker-service start command'
// Skip version check if plugin version couldn't be read (shutdown race)
if (pluginVersion === 'unknown') return;
const workerVersion = await getWorkerVersion();
// Skip version check if worker version is 'unknown' (avoids restart loops)
if (workerVersion === 'unknown') return;
if (pluginVersion !== workerVersion) {
// Just log debug info - auto-restart handles the mismatch in worker-service.ts
logger.debug('SYSTEM', 'Version check', {
pluginVersion,
workerVersion,
note: 'Mismatch will be auto-restarted by worker-service start command'
});
}
} catch (error) {
// Version check is informational — don't fail the hook
logger.debug('SYSTEM', 'Version check failed', {
error: error instanceof Error ? error.message : String(error)
});
}
}

164
tests/hook-command.test.ts Normal file
View File

@@ -0,0 +1,164 @@
/**
* Tests for hook-command error classifier
*
* Validates that isWorkerUnavailableError correctly distinguishes between:
* - Transport failures (ECONNREFUSED, etc.) → true (graceful degradation)
* - Server errors (5xx) → true (graceful degradation)
* - Client errors (4xx) → false (handler bug, blocking)
* - Programming errors (TypeError, etc.) → false (code bug, blocking)
*/
import { describe, it, expect } from 'bun:test';
import { isWorkerUnavailableError } from '../src/cli/hook-command.js';
// Suite for the hook error classifier. Each group pins one class of error
// message to the expected result: true = degrade gracefully, false = blocking.
describe('isWorkerUnavailableError', () => {
  describe('transport failures → true (graceful)', () => {
    it('should classify ECONNREFUSED as worker unavailable', () => {
      const error = new Error('connect ECONNREFUSED 127.0.0.1:37777');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify ECONNRESET as worker unavailable', () => {
      const error = new Error('socket hang up ECONNRESET');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify EPIPE as worker unavailable', () => {
      const error = new Error('write EPIPE');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify ETIMEDOUT as worker unavailable', () => {
      const error = new Error('connect ETIMEDOUT 127.0.0.1:37777');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify "fetch failed" as worker unavailable', () => {
      // A TypeError, but the transport pattern takes precedence over the error class
      const error = new TypeError('fetch failed');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify "Unable to connect" as worker unavailable', () => {
      const error = new Error('Unable to connect to server');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify ENOTFOUND as worker unavailable', () => {
      const error = new Error('getaddrinfo ENOTFOUND localhost');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify "socket hang up" as worker unavailable', () => {
      const error = new Error('socket hang up');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify ECONNABORTED as worker unavailable', () => {
      const error = new Error('ECONNABORTED');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });
  });

  describe('timeout errors → true (graceful)', () => {
    it('should classify "timed out" as worker unavailable', () => {
      const error = new Error('Request timed out after 3000ms');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify "timeout" as worker unavailable', () => {
      const error = new Error('Connection timeout');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });
  });

  describe('HTTP 5xx server errors → true (graceful)', () => {
    it('should classify 500 status as worker unavailable', () => {
      const error = new Error('Context generation failed: 500');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify 502 status as worker unavailable', () => {
      const error = new Error('Observation storage failed: 502');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify 503 status as worker unavailable', () => {
      const error = new Error('Request failed: 503');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify "status: 500" format as worker unavailable', () => {
      const error = new Error('HTTP error status: 500');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });
  });

  describe('HTTP 429 rate limit → true (graceful)', () => {
    it('should classify 429 as worker unavailable (rate limit is transient)', () => {
      const error = new Error('Request failed: 429');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });

    it('should classify "status: 429" format as worker unavailable', () => {
      const error = new Error('HTTP error status: 429');
      expect(isWorkerUnavailableError(error)).toBe(true);
    });
  });

  describe('HTTP 4xx client errors → false (blocking)', () => {
    it('should NOT classify 400 Bad Request as worker unavailable', () => {
      const error = new Error('Request failed: 400');
      expect(isWorkerUnavailableError(error)).toBe(false);
    });

    it('should NOT classify 404 Not Found as worker unavailable', () => {
      const error = new Error('Observation storage failed: 404');
      expect(isWorkerUnavailableError(error)).toBe(false);
    });

    it('should NOT classify 422 Validation Error as worker unavailable', () => {
      const error = new Error('Request failed: 422');
      expect(isWorkerUnavailableError(error)).toBe(false);
    });

    it('should NOT classify "status: 400" format as worker unavailable', () => {
      const error = new Error('HTTP error status: 400');
      expect(isWorkerUnavailableError(error)).toBe(false);
    });
  });

  describe('programming errors → false (blocking)', () => {
    it('should NOT classify TypeError as worker unavailable', () => {
      // Note: TypeError with "fetch failed" IS classified as unavailable (transport layer)
      // But generic TypeErrors are NOT
      const error = new TypeError('Cannot read properties of undefined');
      expect(isWorkerUnavailableError(error)).toBe(false);
    });

    it('should NOT classify ReferenceError as worker unavailable', () => {
      const error = new ReferenceError('foo is not defined');
      expect(isWorkerUnavailableError(error)).toBe(false);
    });

    it('should NOT classify SyntaxError as worker unavailable', () => {
      const error = new SyntaxError('Unexpected token');
      expect(isWorkerUnavailableError(error)).toBe(false);
    });
  });

  describe('unknown errors → false (blocking, conservative)', () => {
    it('should NOT classify generic Error as worker unavailable', () => {
      const error = new Error('Something unexpected happened');
      expect(isWorkerUnavailableError(error)).toBe(false);
    });

    it('should handle string errors', () => {
      // Non-Error values are stringified before classification
      expect(isWorkerUnavailableError('ECONNREFUSED')).toBe(true);
      expect(isWorkerUnavailableError('random error')).toBe(false);
    });

    it('should handle null/undefined errors', () => {
      expect(isWorkerUnavailableError(null)).toBe(false);
      expect(isWorkerUnavailableError(undefined)).toBe(false);
    });
  });
});

View File

@@ -28,18 +28,22 @@ describe('hook-constants', () => {
expect(HOOK_TIMEOUTS.DEFAULT).toBe(300000);
});
it('should define HEALTH_CHECK timeout', () => {
expect(HOOK_TIMEOUTS.HEALTH_CHECK).toBe(30000);
it('should define HEALTH_CHECK timeout as 3s (reduced from 30s)', () => {
expect(HOOK_TIMEOUTS.HEALTH_CHECK).toBe(3000);
});
it('should define POST_SPAWN_WAIT as 5s', () => {
expect(HOOK_TIMEOUTS.POST_SPAWN_WAIT).toBe(5000);
});
it('should define PORT_IN_USE_WAIT as 3s', () => {
expect(HOOK_TIMEOUTS.PORT_IN_USE_WAIT).toBe(3000);
});
it('should define WORKER_STARTUP_WAIT', () => {
expect(HOOK_TIMEOUTS.WORKER_STARTUP_WAIT).toBe(1000);
});
it('should define WORKER_STARTUP_RETRIES', () => {
expect(HOOK_TIMEOUTS.WORKER_STARTUP_RETRIES).toBe(300);
});
it('should define PRE_RESTART_SETTLE_DELAY', () => {
expect(HOOK_TIMEOUTS.PRE_RESTART_SETTLE_DELAY).toBe(2000);
});

View File

@@ -8,6 +8,8 @@ import {
removePidFile,
getPlatformTimeout,
parseElapsedTime,
isProcessAlive,
cleanStalePidFile,
type PidInfo
} from '../../src/services/infrastructure/index.js';
@@ -221,4 +223,69 @@ describe('ProcessManager', () => {
expect(result).toBe(666);
});
});
// Liveness-probe cases for isProcessAlive: the current process, a PID assumed
// not to exist, the PID 0 sentinel, negative PIDs, and non-integer PIDs.
describe('isProcessAlive', () => {
it('should return true for the current process', () => {
expect(isProcessAlive(process.pid)).toBe(true);
});
it('should return false for a non-existent PID', () => {
// Use a very high PID that's extremely unlikely to exist
expect(isProcessAlive(2147483647)).toBe(false);
});
it('should return true for PID 0 (Windows WMIC sentinel)', () => {
expect(isProcessAlive(0)).toBe(true);
});
it('should return false for negative PIDs', () => {
expect(isProcessAlive(-1)).toBe(false);
expect(isProcessAlive(-999)).toBe(false);
});
it('should return false for non-integer PIDs', () => {
// Both a fractional value and NaN must be rejected
expect(isProcessAlive(1.5)).toBe(false);
expect(isProcessAlive(NaN)).toBe(false);
});
});
// Stale-PID cleanup cases: a dead PID is removed, a live PID is kept, and a
// missing PID file is a no-op.
// NOTE(review): these tests write and delete PID_FILE on disk; presumably
// setup/teardown elsewhere in this file isolates or restores it — confirm.
describe('cleanStalePidFile', () => {
it('should remove PID file when process is dead', () => {
// Write a PID file with a non-existent PID
const staleInfo: PidInfo = {
pid: 2147483647,
port: 37777,
startedAt: '2024-01-01T00:00:00.000Z'
};
writePidFile(staleInfo);
expect(existsSync(PID_FILE)).toBe(true);
cleanStalePidFile();
expect(existsSync(PID_FILE)).toBe(false);
});
it('should keep PID file when process is alive', () => {
// Write a PID file with the current process PID (definitely alive)
const liveInfo: PidInfo = {
pid: process.pid,
port: 37777,
startedAt: new Date().toISOString()
};
writePidFile(liveInfo);
cleanStalePidFile();
// PID file should still exist since process.pid is alive
expect(existsSync(PID_FILE)).toBe(true);
});
it('should do nothing when PID file does not exist', () => {
// Start from a known state with no PID file present
removePidFile();
expect(existsSync(PID_FILE)).toBe(false);
// Should not throw
expect(() => cleanStalePidFile()).not.toThrow();
});
});
});