fix: GC failed pending_messages rows at startup (Greptile iter 4)

Plan 07 deleted clearFailed/clearFailedOlderThan as "dead code", but
with the periodic sweep also removed, nothing reaps status='failed'
rows now — they accumulate indefinitely. Since claimNextMessage's
self-healing subquery scans this table, unbounded growth degrades
claim latency over time.

Re-introduces clearFailedOlderThan and calls it once at worker startup
(not a reaper — one-shot, idempotent). 7-day retention keeps enough
history for operator inspection while bounding the table.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-04-24 14:50:29 -07:00
parent 1d6be0801e
commit a3d9c39cc0
3 changed files with 135 additions and 101 deletions

File diff suppressed because one or more lines are too long

View File

@@ -198,6 +198,22 @@ export class PendingMessageStore {
}
}
/**
 * Purge rows in `pending_messages` whose `status` is `'failed'` and whose
 * failure timestamp lies more than `thresholdMs` before the current time.
 *
 * The age of a row is taken from `failed_at_epoch`, falling back to
 * `completed_at_epoch`; a row with neither timestamp is treated as epoch 0.
 * Intended to be invoked once at worker startup so the table stays bounded
 * on long-running or high-failure-rate installations (`claimNextMessage`'s
 * self-healing subquery scans this table). This is a one-shot cleanup, not
 * a periodic reaper.
 *
 * @param thresholdMs Retention window, subtracted from `Date.now()` to get
 *   the cutoff (so the epoch columns are presumably milliseconds — confirm
 *   against the schema).
 * @returns The `changes` count reported by the DELETE statement.
 */
clearFailedOlderThan(thresholdMs: number): number {
  // The SQL text is kept verbatim; only the cutoff is bound as a parameter.
  const purgeStaleFailures = this.db.prepare(`
DELETE FROM pending_messages
WHERE status = 'failed' AND COALESCE(failed_at_epoch, completed_at_epoch, 0) < ?
`);
  // Rows stamped strictly before this instant are removed.
  const oldestRetainedEpoch = Date.now() - thresholdMs;
  const { changes } = purgeStaleFailures.run(oldestRetainedEpoch);
  return changes;
}
/**
* Get all pending messages for session (ordered by creation time)
*/

View File

@@ -445,6 +445,21 @@ export class WorkerService implements WorkerRef {
// left by a previous worker incarnation on the next claim. See
// PATHFINDER-2026-04-22 Plan 01 Phase 3.
// One-shot GC for terminally-failed rows so pending_messages does not
// grow unbounded on long-running or high-failure-rate installations.
// Not a reaper — runs once per worker start. 7 days retains enough
// history for operator inspection without degrading claim latency.
try {
const { PendingMessageStore } = await import('./sqlite/PendingMessageStore.js');
const pendingStore = new PendingMessageStore(this.dbManager.getSessionStore().db, 3);
const cleared = pendingStore.clearFailedOlderThan(7 * 24 * 60 * 60 * 1000);
if (cleared > 0) {
logger.info('QUEUE', 'Startup GC cleared old failed pending_messages rows', { cleared });
}
} catch (err) {
logger.warn('QUEUE', 'Startup GC for failed pending_messages rows failed', {}, err instanceof Error ? err : undefined);
}
// Initialize search services
const formattingService = new FormattingService();
const timelineService = new TimelineService();