Merge branch 'pr-1491' into integration/validation-batch

# Conflicts:
#	plugin/scripts/mcp-server.cjs
#	plugin/scripts/worker-service.cjs
#	src/shared/hook-constants.ts
This commit is contained in:
Alex Newman
2026-04-06 14:20:05 -07:00
6 changed files with 317 additions and 328 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -453,6 +453,19 @@ export async function aggressiveStartupCleanup(): Promise<void> {
const pidsToKill: number[] = [];
const allPatterns = [...AGGRESSIVE_CLEANUP_PATTERNS, ...AGE_GATED_CLEANUP_PATTERNS];
// Protect parent process (the hook that spawned us) from being killed.
// Without this, a new daemon kills its own parent hook process (#1426).
//
// Note: readPidFile() is not used here because start() writes the new PID
// before initializeBackground() calls this function, so readPidFile() would
// just return process.pid (already protected). If a pre-existing worker needs
// protection, ensureWorkerStarted() handles that by returning early when a
// healthy worker is detected — we never reach this code in that case.
const protectedPids = new Set<number>([currentPid]);
if (process.ppid && process.ppid > 0) {
protectedPids.add(process.ppid);
}
try {
if (isWindows) {
// Use WQL -Filter for server-side filtering (no $_ pipeline syntax).
@@ -475,7 +488,7 @@ export async function aggressiveStartupCleanup(): Promise<void> {
for (const proc of processList) {
const pid = proc.ProcessId;
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue;
if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;
const commandLine = proc.CommandLine || '';
const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => commandLine.includes(p));
@@ -518,7 +531,7 @@ export async function aggressiveStartupCleanup(): Promise<void> {
const etime = match[2];
const command = match[3];
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue;
if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;
const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => command.includes(p));

View File

@@ -80,7 +80,6 @@ import {
cleanStalePidFile,
isProcessAlive,
spawnDaemon,
isPidFileRecent,
touchPidFile
} from './infrastructure/ProcessManager.js';
import {
@@ -88,8 +87,7 @@ import {
waitForHealth,
waitForReadiness,
waitForPortFree,
httpShutdown,
checkVersionMatch
httpShutdown
} from './infrastructure/HealthMonitor.js';
import { performGracefulShutdown } from './infrastructure/GracefulShutdown.js';
@@ -1011,43 +1009,25 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
return false;
}
// Check if worker is already running and healthy
// Check if worker is already running and healthy.
// NOTE: Version mismatch auto-restart intentionally removed (#1435).
// The marketplace bundle ships with __DEFAULT_PACKAGE_VERSION__ unbaked, causing
// BUILT_IN_VERSION to fall back to "development". This creates a 100% reproducible
// mismatch on every hook call, killing a healthy worker and often failing to restart
// (cold start exceeds POST_SPAWN_WAIT). A working-but-old worker is strictly better
// than a dead worker. Users must manually restart after genuine plugin updates.
// See also: #566, #665, #667, #669, #689, #1124, #1145 (same pattern across 8+ releases).
if (await waitForHealth(port, 1000)) {
const versionCheck = await checkVersionMatch(port);
if (!versionCheck.matches) {
// Guard: If PID file was written recently, another session is likely already
// restarting the worker. Poll health instead of starting a concurrent restart.
// This prevents the "100 sessions all restart simultaneously" storm (#1145).
const RESTART_COORDINATION_THRESHOLD_MS = 15000;
if (isPidFileRecent(RESTART_COORDINATION_THRESHOLD_MS)) {
logger.info('SYSTEM', 'Version mismatch detected but PID file is recent — another restart likely in progress, polling health', {
pluginVersion: versionCheck.pluginVersion,
workerVersion: versionCheck.workerVersion
});
const healthy = await waitForHealth(port, RESTART_COORDINATION_THRESHOLD_MS);
if (healthy) {
logger.info('SYSTEM', 'Worker became healthy after waiting for concurrent restart');
return true;
}
logger.warn('SYSTEM', 'Worker did not become healthy after waiting — proceeding with own restart');
}
logger.info('SYSTEM', 'Worker version mismatch detected - auto-restarting', {
pluginVersion: versionCheck.pluginVersion,
workerVersion: versionCheck.workerVersion
});
await httpShutdown(port);
const freed = await waitForPortFree(port, getPlatformTimeout(HOOK_TIMEOUTS.PORT_IN_USE_WAIT));
if (!freed) {
logger.error('SYSTEM', 'Port did not free up after shutdown for version mismatch restart', { port });
return false;
}
removePidFile();
} else {
logger.info('SYSTEM', 'Worker already running and healthy');
return true;
// Health passed — worker is listening. Also wait for readiness in case
// another hook just spawned it and background init is still running.
// This mirrors the fresh-spawn path (line ~1025) so concurrent hooks
// don't race past a cold-starting worker's initialization guard.
const ready = await waitForReadiness(port, getPlatformTimeout(HOOK_TIMEOUTS.READINESS_WAIT));
if (!ready) {
logger.warn('SYSTEM', 'Worker is alive but readiness timed out — proceeding anyway');
}
logger.info('SYSTEM', 'Worker already running and healthy');
return true;
}
// Check if port is in use by something else
@@ -1096,8 +1076,7 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
}
clearWorkerSpawnAttempted();
// Touch PID file to signal other sessions that a restart just completed.
// Other sessions checking isPidFileRecent() will see this and skip their own restart.
// Touch PID file to signal other sessions that a spawn just completed.
touchPidFile();
logger.info('SYSTEM', 'Worker started successfully');
return true;

View File

@@ -1,13 +1,13 @@
export const HOOK_TIMEOUTS = {
DEFAULT: 300000, // Standard HTTP timeout (5 min for slow systems)
HEALTH_CHECK: 3000, // Worker health check (3s — healthy worker responds in <100ms)
POST_SPAWN_WAIT: 5000, // Wait for daemon to start after spawn (starts in <1s on Linux)
READINESS_WAIT: 30000, // Wait for DB + search init after spawn (typically <5s)
PORT_IN_USE_WAIT: 3000, // Wait when port occupied but health failing
DEFAULT: 300000, // Standard HTTP timeout (5 min for slow systems)
HEALTH_CHECK: 3000, // Worker health check (3s — healthy worker responds in <100ms)
POST_SPAWN_WAIT: 15000, // Wait for daemon to start after spawn (starts in <1s on Linux, 6-8s on macOS with Chroma)
READINESS_WAIT: 30000, // Wait for DB + search init after spawn (typically <5s)
PORT_IN_USE_WAIT: 3000, // Wait when port occupied but health failing
WORKER_STARTUP_WAIT: 1000,
PRE_RESTART_SETTLE_DELAY: 2000, // Give files time to sync before restart
POWERSHELL_COMMAND: 10000, // PowerShell process enumeration (10s - typically completes in <1s)
WINDOWS_MULTIPLIER: 1.5, // Platform-specific adjustment for hook-side operations
PRE_RESTART_SETTLE_DELAY: 2000, // Give files time to sync before restart
POWERSHELL_COMMAND: 10000, // PowerShell process enumeration (10s - typically completes in <1s)
WINDOWS_MULTIPLIER: 1.5 // Platform-specific adjustment for hook-side operations
} as const;
/**
@@ -16,21 +16,19 @@ export const HOOK_TIMEOUTS = {
* Exit code behavior per Claude Code docs:
* - 0: Success. For SessionStart/UserPromptSubmit, stdout added to context.
* - 2: Blocking error. For SessionStart, stderr shown to user only.
* - Other non-zero: treated as hook error (stderr shown in verbose mode only).
*
* IMPORTANT: Only exit codes 0 and 2 are recognized by Claude Code.
* Any other non-zero exit code (including 1, 3, etc.) is treated as a
* hook failure and triggers the "hook error" UI message on every session start.
* - Other non-zero: stderr shown in verbose mode only.
*/
export const HOOK_EXIT_CODES = {
SUCCESS: 0,
FAILURE: 1,
/** Blocking error - for SessionStart, shows stderr to user only */
BLOCKING_ERROR: 2,
/** Show stderr to user only, don't inject into context. Used by user-message handler (Cursor). */
USER_MESSAGE_ONLY: 3,
} as const;
export function getTimeout(baseTimeout: number): number {
return process.platform === "win32"
return process.platform === 'win32'
? Math.round(baseTimeout * HOOK_TIMEOUTS.WINDOWS_MULTIPLIER)
: baseTimeout;
}

View File

@@ -32,8 +32,8 @@ describe('hook-constants', () => {
expect(HOOK_TIMEOUTS.HEALTH_CHECK).toBe(3000);
});
it('should define POST_SPAWN_WAIT as 5s', () => {
expect(HOOK_TIMEOUTS.POST_SPAWN_WAIT).toBe(5000);
it('should define POST_SPAWN_WAIT as 15s', () => {
expect(HOOK_TIMEOUTS.POST_SPAWN_WAIT).toBe(15000);
});
it('should define PORT_IN_USE_WAIT as 3s', () => {