Compare commits

...

8 Commits

Author SHA1 Message Date
Devin Foley
a82f9ec7eb Address Greptile 4/5 feedback: auto-clear idle badge, restore status color, fix display gap
- Clear idle_warning errorCode in onLog when output resumes, so the
  orange Idle badge disappears without requiring an API call
- Restore idle_timeout entry in status-colors.ts for errorCode-based
  badge styling
- Simplify LiveRunWidget time display to a single if/else chain,
  eliminating the gap for non-active runs with lastOutputAt but no
  finishedAt

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-27 08:37:50 -07:00
Devin Foley
e58a2330a6 Address Greptile feedback: clear debounce timer, deduplicate util, fix completed-run display
- Clear the lastOutputAt debounce timer on run completion to avoid
  stale writes after the run finishes
- Replace local formatRelativeTime with existing relativeTime util
- Fix completed-run display in LiveRunWidget to show finish time
  instead of unconditionally showing lastOutputAt

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-27 00:51:06 -07:00
Devin Foley
b3bccf3648 Fix LiveRunWidget activeRun merge bug and add idle_warning clear test
- Fix: activeRun merge in LiveRunWidget was dropping errorCode and
  lastOutputAt, preventing idle badge and last-output display from
  rendering for active runs
- Add test: verify reportRunActivity clears idle_warning error code
  and updates lastOutputAt

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-27 00:02:57 -07:00
Devin Foley
d747d847e4 Fix idle timeout tests to register in runningProcesses
The idle tests were failing because runs without runningProcesses
entries were being reaped by the orphan reaper before the idle
timeout pass could check them. Fix by spawning real child processes
and registering them in runningProcesses so the orphan reaper skips
them, allowing the idle timeout logic to evaluate them correctly.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-26 23:55:41 -07:00
Devin Foley
8e384947aa Address Greptile review: consolidate query, add idle tests, remove dead code
- Consolidate duplicate running-runs query in reapOrphanedRuns by
  reusing activeRuns for idle timeout pass (skip already-reaped runs)
- Add three integration tests: idle warning at 11 min, idle kill at
  16 min, and no-warning with recent output
- Remove unreachable idle_timeout entry from statusBadge (idle-killed
  runs have status "failed", not "idle_timeout")

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-26 23:48:58 -07:00
Devin Foley
aa88db7238 Fix test failure and address Greptile review comments
- Fix "keeps alive" test: set lastOutputAt to current time so idle
  reaper doesn't kill the test run (seed used a stale date)
- Wire up lastOutputAt and startedAt params in test seed fixture
- Remove dead wasIdle variable in clearDetachedRunWarning
- Fix deferred lastOutputAt flush to use actual last output timestamp
  instead of wall-clock time at flush

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-26 23:33:41 -07:00
Devin Foley
942d023148 Fix missing lastOutputAt in HeartbeatRun test mock
Add lastOutputAt: null to the makeRun helper in inbox.test.ts to match
the updated HeartbeatRun type.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-26 23:27:47 -07:00
Devin Foley
1b70091dcd Implement idle timeout reaper and liveness visibility for hung runs
Add lastOutputAt tracking on heartbeat runs, updated from adapter stdout/stderr
with batched DB writes (30s debounce). The orphan reaper now checks running
processes for idle timeout: warns at 10 min, kills (SIGTERM/SIGKILL) at 15 min.

- DB: add last_output_at column to heartbeat_runs (migration 0046)
- Server: idle warning/kill logic in reapOrphanedRuns, lastOutputAt flush in onLog
- Server: clear idle_warning on activity report (clearDetachedRunWarning)
- API: expose errorCode and lastOutputAt on active/recent run endpoints
- UI: orange idle warning badge on ActiveAgentsPanel and LiveRunWidget
- UI: show last output relative time in LiveRunWidget
- UI: add idle_timeout status color

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-26 23:05:47 -07:00
13 changed files with 12139 additions and 10 deletions

View File

@@ -0,0 +1 @@
ALTER TABLE "heartbeat_runs" ADD COLUMN "last_output_at" timestamp with time zone;

File diff suppressed because it is too large Load Diff

View File

@@ -323,6 +323,13 @@
"when": 1774530504348,
"tag": "0045_workable_shockwave",
"breakpoints": true
},
{
"idx": 46,
"version": "7",
"when": 1774589600897,
"tag": "0046_lethal_starfox",
"breakpoints": true
}
]
}

View File

@@ -33,6 +33,7 @@ export const heartbeatRuns = pgTable(
externalRunId: text("external_run_id"),
processPid: integer("process_pid"),
processStartedAt: timestamp("process_started_at", { withTimezone: true }),
lastOutputAt: timestamp("last_output_at", { withTimezone: true }),
retryOfRunId: uuid("retry_of_run_id").references((): AnyPgColumn => heartbeatRuns.id, {
onDelete: "set null",
}),

View File

@@ -35,6 +35,7 @@ export interface HeartbeatRun {
externalRunId: string | null;
processPid: number | null;
processStartedAt: Date | null;
lastOutputAt: Date | null;
retryOfRunId: string | null;
processLossRetryCount: number;
contextSnapshot: Record<string, unknown> | null;

View File

@@ -73,6 +73,8 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
includeIssue?: boolean;
runErrorCode?: string | null;
runError?: string | null;
lastOutputAt?: Date | null;
startedAt?: Date | null;
}) {
const companyId = randomUUID();
const agentId = randomUUID();
@@ -127,7 +129,8 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
processLossRetryCount: input?.processLossRetryCount ?? 0,
errorCode: input?.runErrorCode ?? null,
error: input?.runError ?? null,
startedAt: now,
startedAt: input?.startedAt ?? now,
lastOutputAt: input?.lastOutputAt ?? null,
updatedAt: new Date("2026-03-19T00:00:00.000Z"),
});
@@ -157,6 +160,7 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
const { runId, wakeupRequestId } = await seedRunFixture({
processPid: child.pid ?? null,
includeIssue: false,
lastOutputAt: new Date(),
});
const heartbeat = heartbeatService(db);
@@ -236,6 +240,82 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
expect(issue?.checkoutRunId).toBe(runId);
});
it("sets idle_warning when a run has no output for over 10 minutes", async () => {
const elevenMinutesAgo = new Date(Date.now() - 11 * 60 * 1000);
const child = spawnAliveProcess();
childProcesses.add(child);
const { runId } = await seedRunFixture({
includeIssue: false,
processPid: child.pid ?? null,
startedAt: elevenMinutesAgo,
lastOutputAt: elevenMinutesAgo,
});
// Register in runningProcesses so orphan reaper skips it; idle pass still checks it
runningProcesses.set(runId, { child, graceSec: 10 } as any);
const heartbeat = heartbeatService(db);
try {
const result = await heartbeat.reapOrphanedRuns();
expect(result.idleWarned).toBe(1);
expect(result.idleKilled).toBe(0);
const run = await heartbeat.getRun(runId);
expect(run?.status).toBe("running");
expect(run?.errorCode).toBe("idle_warning");
} finally {
runningProcesses.delete(runId);
}
});
it("kills a run that has been idle for over 15 minutes", async () => {
const sixteenMinutesAgo = new Date(Date.now() - 16 * 60 * 1000);
const child = spawnAliveProcess();
childProcesses.add(child);
const { runId } = await seedRunFixture({
processPid: child.pid ?? null,
startedAt: sixteenMinutesAgo,
lastOutputAt: sixteenMinutesAgo,
});
runningProcesses.set(runId, { child, graceSec: 10 } as any);
const heartbeat = heartbeatService(db);
try {
const result = await heartbeat.reapOrphanedRuns();
expect(result.idleKilled).toBe(1);
const run = await heartbeat.getRun(runId);
expect(run?.status).toBe("failed");
expect(run?.errorCode).toBe("idle_timeout");
} finally {
runningProcesses.delete(runId);
}
});
it("does not idle-warn a run with recent output", async () => {
const child = spawnAliveProcess();
childProcesses.add(child);
const { runId } = await seedRunFixture({
includeIssue: false,
processPid: child.pid ?? null,
startedAt: new Date(Date.now() - 20 * 60 * 1000),
lastOutputAt: new Date(),
});
runningProcesses.set(runId, { child, graceSec: 10 } as any);
const heartbeat = heartbeatService(db);
try {
const result = await heartbeat.reapOrphanedRuns();
expect(result.idleWarned).toBe(0);
expect(result.idleKilled).toBe(0);
const run = await heartbeat.getRun(runId);
expect(run?.status).toBe("running");
expect(run?.errorCode).toBeNull();
} finally {
runningProcesses.delete(runId);
}
});
it("clears the detached warning when the run reports activity again", async () => {
const { runId } = await seedRunFixture({
includeIssue: false,
@@ -252,4 +332,22 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
expect(run?.errorCode).toBeNull();
expect(run?.error).toBeNull();
});
it("clears the idle warning when the run reports activity again", async () => {
const { runId } = await seedRunFixture({
includeIssue: false,
runErrorCode: "idle_warning",
runError: "Idle warning: no output for 10 minutes",
});
const heartbeat = heartbeatService(db);
const updated = await heartbeat.reportRunActivity(runId);
expect(updated?.errorCode).toBeNull();
expect(updated?.error).toBeNull();
expect(updated?.lastOutputAt).toBeTruthy();
const run = await heartbeat.getRun(runId);
expect(run?.errorCode).toBeNull();
expect(run?.error).toBeNull();
});
});

View File

@@ -2083,6 +2083,8 @@ export function agentRoutes(db: Db) {
agentId: heartbeatRuns.agentId,
agentName: agentsTable.name,
adapterType: agentsTable.adapterType,
errorCode: heartbeatRuns.errorCode,
lastOutputAt: heartbeatRuns.lastOutputAt,
issueId: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'issueId'`.as("issueId"),
};
@@ -2250,6 +2252,8 @@ export function agentRoutes(db: Db) {
agentId: heartbeatRuns.agentId,
agentName: agentsTable.name,
adapterType: agentsTable.adapterType,
errorCode: heartbeatRuns.errorCode,
lastOutputAt: heartbeatRuns.lastOutputAt,
})
.from(heartbeatRuns)
.innerJoin(agentsTable, eq(heartbeatRuns.agentId, agentsTable.id))

View File

@@ -63,6 +63,10 @@ const HEARTBEAT_MAX_CONCURRENT_RUNS_DEFAULT = 1;
const HEARTBEAT_MAX_CONCURRENT_RUNS_MAX = 10;
const DEFERRED_WAKE_CONTEXT_KEY = "_paperclipWakeContext";
const DETACHED_PROCESS_ERROR_CODE = "process_detached";
const IDLE_WARNING_ERROR_CODE = "idle_warning";
const IDLE_WARNING_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes
const IDLE_KILL_THRESHOLD_MS = 15 * 60 * 1000; // 15 minutes
const IDLE_KILL_GRACE_SEC = 10; // seconds between SIGTERM and SIGKILL
const startLocksByAgent = new Map<string, Promise<void>>();
const REPO_ONLY_CWD_SENTINEL = "/__paperclip_repo_only__";
const MANAGED_WORKSPACE_GIT_CLONE_TIMEOUT_MS = 10 * 60 * 1000;
@@ -1484,14 +1488,20 @@ export function heartbeatService(db: Db) {
}
async function clearDetachedRunWarning(runId: string) {
// Clear both detached process warnings and idle warnings when activity is reported
const updated = await db
.update(heartbeatRuns)
.set({
error: null,
errorCode: null,
lastOutputAt: new Date(),
updatedAt: new Date(),
})
.where(and(eq(heartbeatRuns.id, runId), eq(heartbeatRuns.status, "running"), eq(heartbeatRuns.errorCode, DETACHED_PROCESS_ERROR_CODE)))
.where(and(
eq(heartbeatRuns.id, runId),
eq(heartbeatRuns.status, "running"),
inArray(heartbeatRuns.errorCode, [DETACHED_PROCESS_ERROR_CODE, IDLE_WARNING_ERROR_CODE]),
))
.returning()
.then((rows) => rows[0] ?? null);
if (!updated) return null;
@@ -1500,7 +1510,7 @@ export function heartbeatService(db: Db) {
eventType: "lifecycle",
stream: "system",
level: "info",
message: "Detached child process reported activity; cleared detached warning",
message: "Activity reported; cleared run warning",
});
return updated;
}
@@ -1826,7 +1836,100 @@ export function heartbeatService(db: Db) {
if (reaped.length > 0) {
logger.warn({ reapedCount: reaped.length, runIds: reaped }, "reaped orphaned heartbeat runs");
}
return { reaped: reaped.length, runIds: reaped };
// ── Idle-timeout pass: reuse activeRuns query result (no duplicate DB call) ──
const idleWarned: string[] = [];
const idleKilled: string[] = [];
for (const { run, adapterType } of activeRuns) {
// Idle check applies to all running runs, including tracked ones
if (reaped.includes(run.id)) continue;
const tracksLocalChild = isTrackedLocalChildProcessAdapter(adapterType);
if (!tracksLocalChild) continue;
// Determine the most recent activity timestamp
const lastActivity = run.lastOutputAt
? new Date(run.lastOutputAt).getTime()
: run.startedAt
? new Date(run.startedAt).getTime()
: new Date(run.createdAt).getTime();
const idleMs = now.getTime() - lastActivity;
if (idleMs >= IDLE_KILL_THRESHOLD_MS) {
// Kill: SIGTERM then SIGKILL
const tracked = runningProcesses.get(run.id);
const pid = tracked?.child?.pid ?? run.processPid;
const killMessage = `Idle timeout: no output for ${Math.round(idleMs / 60_000)} minutes — killing process (pid ${pid ?? "unknown"})`;
logger.warn({ runId: run.id, pid, idleMs }, killMessage);
if (tracked?.child) {
tracked.child.kill("SIGTERM");
setTimeout(() => {
try { if (!tracked.child.killed) tracked.child.kill("SIGKILL"); } catch {}
}, IDLE_KILL_GRACE_SEC * 1000);
} else if (pid && isProcessAlive(pid)) {
try { process.kill(pid, "SIGTERM"); } catch {}
setTimeout(() => {
try { if (isProcessAlive(pid)) process.kill(pid, "SIGKILL"); } catch {}
}, IDLE_KILL_GRACE_SEC * 1000);
}
// Mark the run as failed
let killedRun = await setRunStatus(run.id, "failed", {
error: killMessage,
errorCode: "idle_timeout",
finishedAt: now,
});
await setWakeupStatus(run.wakeupRequestId, "failed", {
finishedAt: now,
error: killMessage,
});
if (!killedRun) killedRun = await getRun(run.id);
if (killedRun) {
await appendRunEvent(killedRun, await nextRunEventSeq(killedRun.id), {
eventType: "lifecycle",
stream: "system",
level: "error",
message: killMessage,
payload: { pid, idleMs },
});
await releaseIssueExecutionAndPromote(killedRun);
}
await finalizeAgentStatus(run.agentId, "failed");
await startNextQueuedRunForAgent(run.agentId);
runningProcesses.delete(run.id);
idleKilled.push(run.id);
} else if (idleMs >= IDLE_WARNING_THRESHOLD_MS && run.errorCode !== IDLE_WARNING_ERROR_CODE) {
// Warning: approaching idle timeout
const warnMessage = `Idle warning: no output for ${Math.round(idleMs / 60_000)} minutes — will be killed at ${IDLE_KILL_THRESHOLD_MS / 60_000} min`;
logger.warn({ runId: run.id, idleMs }, warnMessage);
const warnedRun = await setRunStatus(run.id, "running", {
error: warnMessage,
errorCode: IDLE_WARNING_ERROR_CODE,
});
if (warnedRun) {
await appendRunEvent(warnedRun, await nextRunEventSeq(warnedRun.id), {
eventType: "lifecycle",
stream: "system",
level: "warn",
message: warnMessage,
payload: { idleMs },
});
}
idleWarned.push(run.id);
}
}
if (idleWarned.length > 0) {
logger.info({ count: idleWarned.length, runIds: idleWarned }, "idle-warned runs");
}
if (idleKilled.length > 0) {
logger.warn({ count: idleKilled.length, runIds: idleKilled }, "idle-killed runs");
}
return { reaped: reaped.length, runIds: reaped, idleWarned: idleWarned.length, idleKilled: idleKilled.length };
}
async function resumeQueuedRuns() {
@@ -2397,11 +2500,38 @@ export function heartbeatService(db: Db) {
.where(eq(heartbeatRuns.id, runId));
const currentUserRedactionOptions = await getCurrentUserRedactionOptions();
let lastOutputAtFlushPending = false;
let lastOutputAtLatest: Date | null = null;
let lastOutputAtTimer: ReturnType<typeof setTimeout> | null = null;
const onLog = async (stream: "stdout" | "stderr", chunk: string) => {
const sanitizedChunk = redactCurrentUserText(chunk, currentUserRedactionOptions);
if (stream === "stdout") stdoutExcerpt = appendExcerpt(stdoutExcerpt, sanitizedChunk);
if (stream === "stderr") stderrExcerpt = appendExcerpt(stderrExcerpt, sanitizedChunk);
const ts = new Date().toISOString();
lastOutputAtLatest = new Date(ts);
// Batch lastOutputAt writes — flush at most once per 30 seconds to avoid DB churn
if (!lastOutputAtFlushPending) {
lastOutputAtFlushPending = true;
// Flush immediately on first output
await db.update(heartbeatRuns)
.set({ lastOutputAt: lastOutputAtLatest, updatedAt: new Date() })
.where(eq(heartbeatRuns.id, runId));
// Clear idle_warning if set — output resumed
await db.update(heartbeatRuns)
.set({ errorCode: null, error: null })
.where(and(eq(heartbeatRuns.id, runId), eq(heartbeatRuns.errorCode, IDLE_WARNING_ERROR_CODE)));
lastOutputAtTimer = setTimeout(() => {
lastOutputAtFlushPending = false;
lastOutputAtTimer = null;
if (!lastOutputAtLatest) return;
db.update(heartbeatRuns)
.set({ lastOutputAt: lastOutputAtLatest, updatedAt: new Date() })
.where(eq(heartbeatRuns.id, runId))
.then(() => {})
.catch((err) => logger.warn({ err, runId }, "failed to flush lastOutputAt"));
}, 30_000);
}
if (handle) {
await runLogStore.append(handle, {
@@ -2525,6 +2655,8 @@ export function heartbeatService(db: Db) {
},
authToken: authToken ?? undefined,
});
// Clear the debounce timer now that the run is complete
if (lastOutputAtTimer) { clearTimeout(lastOutputAtTimer); lastOutputAtTimer = null; }
const adapterManagedRuntimeServices = adapterResult.runtimeServices
? await persistAdapterManagedRuntimeServices({
db,

View File

@@ -23,6 +23,8 @@ export interface LiveRunForIssue {
agentId: string;
agentName: string;
adapterType: string;
errorCode?: string | null;
lastOutputAt?: string | null;
issueId?: string | null;
}

View File

@@ -92,9 +92,11 @@ function AgentRunCard({
return (
<div className={cn(
"flex h-[320px] flex-col overflow-hidden rounded-xl border shadow-sm",
isActive
? "border-cyan-500/25 bg-cyan-500/[0.04] shadow-[0_16px_40px_rgba(6,182,212,0.08)]"
: "border-border bg-background/70",
run.errorCode === "idle_warning"
? "border-orange-500/25 bg-orange-500/[0.04] shadow-[0_16px_40px_rgba(249,115,22,0.08)]"
: isActive
? "border-cyan-500/25 bg-cyan-500/[0.04] shadow-[0_16px_40px_rgba(6,182,212,0.08)]"
: "border-border bg-background/70",
)}>
<div className="border-b border-border/60 px-3 py-3">
<div className="flex items-start justify-between gap-2">
@@ -112,6 +114,11 @@ function AgentRunCard({
</div>
<div className="mt-2 flex items-center gap-2 text-[11px] text-muted-foreground">
<span>{isActive ? "Live now" : run.finishedAt ? `Finished ${relativeTime(run.finishedAt)}` : `Started ${relativeTime(run.createdAt)}`}</span>
{run.errorCode === "idle_warning" && (
<span className="rounded-full border border-orange-500/20 bg-orange-500/[0.06] px-1.5 py-0.5 text-[10px] font-medium text-orange-700 dark:text-orange-300">
Idle
</span>
)}
</div>
</div>

View File

@@ -3,8 +3,8 @@ import { Link } from "@/lib/router";
import { useQuery, useQueryClient } from "@tanstack/react-query";
import { heartbeatsApi, type LiveRunForIssue } from "../api/heartbeats";
import { queryKeys } from "../lib/queryKeys";
import { formatDateTime } from "../lib/utils";
import { ExternalLink, Square } from "lucide-react";
import { formatDateTime, relativeTime } from "../lib/utils";
import { AlertTriangle, ExternalLink, Square } from "lucide-react";
import { Identity } from "./Identity";
import { StatusBadge } from "./StatusBadge";
import { RunTranscriptView } from "./transcript/RunTranscriptView";
@@ -24,6 +24,7 @@ function isRunActive(status: string): boolean {
return status === "queued" || status === "running";
}
export function LiveRunWidget({ issueId, companyId }: LiveRunWidgetProps) {
const queryClient = useQueryClient();
const [cancellingRunIds, setCancellingRunIds] = useState(new Set<string>());
@@ -59,6 +60,8 @@ export function LiveRunWidget({ issueId, companyId }: LiveRunWidgetProps) {
agentId: activeRun.agentId,
agentName: activeRun.agentName,
adapterType: activeRun.adapterType,
errorCode: activeRun.errorCode,
lastOutputAt: toIsoString(activeRun.lastOutputAt),
issueId,
});
}
@@ -116,7 +119,21 @@ export function LiveRunWidget({ issueId, companyId }: LiveRunWidgetProps) {
{run.id.slice(0, 8)}
</Link>
<StatusBadge status={run.status} />
<span>{formatDateTime(run.startedAt ?? run.createdAt)}</span>
{run.errorCode === "idle_warning" && (
<span className="inline-flex items-center gap-1 rounded-full border border-orange-500/20 bg-orange-500/[0.06] px-2 py-0.5 text-[11px] font-medium text-orange-700 dark:text-orange-300">
<AlertTriangle className="h-3 w-3" />
Idle
</span>
)}
{isRunActive(run.status) && run.lastOutputAt ? (
<span title={`Last output: ${formatDateTime(run.lastOutputAt)}`}>
Last output {relativeTime(run.lastOutputAt)}
</span>
) : run.finishedAt ? (
<span>{formatDateTime(run.finishedAt)}</span>
) : (
<span>{formatDateTime(run.startedAt ?? run.createdAt)}</span>
)}
</div>
</div>

View File

@@ -113,6 +113,7 @@ function makeRun(id: string, status: HeartbeatRun["status"], createdAt: string,
externalRunId: null,
processPid: null,
processStartedAt: null,
lastOutputAt: null,
retryOfRunId: null,
processLossRetryCount: 0,
stdoutExcerpt: null,

View File

@@ -55,6 +55,7 @@ export const statusBadge: Record<string, string> = {
// Run statuses
failed: "bg-red-100 text-red-700 dark:bg-red-900/50 dark:text-red-300",
timed_out: "bg-orange-100 text-orange-700 dark:bg-orange-900/50 dark:text-orange-300",
idle_timeout: "bg-orange-100 text-orange-700 dark:bg-orange-900/50 dark:text-orange-300",
succeeded: "bg-green-100 text-green-700 dark:bg-green-900/50 dark:text-green-300",
error: "bg-red-100 text-red-700 dark:bg-red-900/50 dark:text-red-300",
terminated: "bg-red-100 text-red-700 dark:bg-red-900/50 dark:text-red-300",