From df0ab0c0c912a3d70bee054a9c4d88dcf4131380 Mon Sep 17 00:00:00 2001 From: Tom Boucher Date: Thu, 23 Apr 2026 11:54:11 -0400 Subject: [PATCH] fix(#2410): emit wave + plan checkpoint heartbeats to prevent stream idle timeout (#2626) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /gsd:manager's background execute-phase Task fails with "Stream idle timeout - partial response received" on multi-plan phases (Claude Code + Opus 4.7 at ~200K+ cache_read) because the long subagent never emits tokens fast enough between large tool_results — the SSE layer times out mid-assistant-turn and the harness retries hit the same TTFT wall after prompt cache TTL expires. Root cause: no orchestrator-level activity at wave/plan boundaries. Fix (maintainer-approved A+B): - A (wave boundary): execute-phase.md now emits a `[checkpoint]` heartbeat before each wave spawns and after each wave completes. - B (plan boundary): also emit `[checkpoint]` before each Task() dispatch and after each executor returns (complete/failed/checkpoint). Heartbeats are literal assistant-text lines (no tool call) with a monotonic `{P}/{Q} plans done` counter so partial-transcript recovery tools can grep progress even when a run dies mid-phase. Docs: COMMANDS.md /gsd-manager section documents the marker format. Tests: tests/bug-2410-stream-checkpoint-heartbeats.test.cjs (12 cases) asserts the heartbeats exist at every boundary and in the right workflow step. Full suite: 5422 node:test cases pass. Pre-existing vitest failures on main are unrelated to this change. Co-authored-by: Claude Opus 4.7 (1M context) --- docs/COMMANDS.md | 18 ++ get-shit-done/workflows/execute-phase.md | 56 +++++- ...2410-stream-checkpoint-heartbeats.test.cjs | 173 ++++++++++++++++++ 3 files changed, 246 insertions(+), 1 deletion(-) create mode 100644 tests/bug-2410-stream-checkpoint-heartbeats.test.cjs diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md index ae8de556..f9d5dcc7 100644 --- a/docs/COMMANDS.md +++ b/docs/COMMANDS.md @@ -562,6 +562,24 @@ Interactive command center for managing multiple phases from one terminal. /gsd-manager # Open command center dashboard ``` +**Checkpoint Heartbeats (#2410):** + +Background `execute-phase` runs emit `[checkpoint]` markers at every wave and plan +boundary so the Claude API SSE stream never idles long enough to trigger +`Stream idle timeout - partial response received` on multi-plan phases. The +format is: + +``` +[checkpoint] phase {N} wave {W}/{M} starting, {count} plan(s), {P}/{Q} plans done +[checkpoint] phase {N} wave {W}/{M} plan {plan_id} starting ({P}/{Q} plans done) +[checkpoint] phase {N} wave {W}/{M} plan {plan_id} complete ({P}/{Q} plans done) +[checkpoint] phase {N} wave {W}/{M} complete, {P}/{Q} plans done ({ok}/{count} ok) +``` + +If a background phase fails partway through, grep the transcript for `[checkpoint]` +to see the last confirmed boundary. The manager's background-completion handler +uses these markers to report partial progress when an agent errors out. + **Manager Passthrough Flags:** Configure per-step flags in `.planning/config.json` under `manager.flags`. These flags are appended to each dispatched command: diff --git a/get-shit-done/workflows/execute-phase.md b/get-shit-done/workflows/execute-phase.md index 025f7488..19b26525 100644 --- a/get-shit-done/workflows/execute-phase.md +++ b/get-shit-done/workflows/execute-phase.md @@ -336,6 +336,26 @@ CROSS_AI_TIMEOUT=$(gsd-sdk query config-get workflow.cross_ai_timeout 2>/dev/nul Execute each selected wave in sequence. Within a wave: parallel if `PARALLELIZATION=true`, sequential if `false`. +**Stream-idle-timeout prevention — checkpoint heartbeats (#2410):** + +Multi-plan phases can accumulate enough subagent context that the Claude API +SSE layer terminates with `Stream idle timeout - partial response received` +between a large tool_result and the next assistant turn (seen on Claude Code ++ Opus 4.7 at ~200K+ cache_read). To keep the stream warm, emit short +assistant-text heartbeats — **no tool call, just a literal line** — at every +wave and plan boundary. Each heartbeat MUST start with `[checkpoint]` so +tooling and `/gsd:manager`'s background-completion handler can grep partial +transcripts. `{P}/{Q}` is the phase-wide completed/total plans counter and +increases monotonically across waves. `{status}` is `complete` (success), +`failed` (executor error), or `checkpoint` (human-gate returned). + +``` +[checkpoint] phase {PHASE_NUMBER} wave {N}/{M} starting, {wave_plan_count} plan(s), {P}/{Q} plans done +[checkpoint] phase {PHASE_NUMBER} wave {N}/{M} plan {plan_id} starting ({P}/{Q} plans done) +[checkpoint] phase {PHASE_NUMBER} wave {N}/{M} plan {plan_id} {status} ({P}/{Q} plans done) +[checkpoint] phase {PHASE_NUMBER} wave {N}/{M} complete, {P}/{Q} plans done ({wave_success}/{wave_plan_count} ok) +``` + **For each wave:** 1. **Intra-wave files_modified overlap check (BEFORE spawning):** @@ -374,7 +394,15 @@ Execute each selected wave in sequence. Within a wave: parallel if `PARALLELIZAT 2. **Describe what's being built (BEFORE spawning):** - Read each plan's ``. Extract what's being built and why. + **First, emit the wave-start checkpoint heartbeat as a literal assistant-text + line — no tool call (#2410). Do NOT skip this even for single-plan waves; it + is required before any further reasoning or spawning:** + + ``` + [checkpoint] phase {PHASE_NUMBER} wave {N}/{M} starting, {wave_plan_count} plan(s), {P}/{Q} plans done + ``` + + Then read each plan's ``. Extract what's being built and why. ``` --- @@ -392,6 +420,13 @@ Execute each selected wave in sequence. Within a wave: parallel if `PARALLELIZAT 3. **Spawn executor agents:** + **Emit a plan-start heartbeat (literal line, no tool call) immediately before + each `Task()` dispatch (#2410):** + + ``` + [checkpoint] phase {PHASE_NUMBER} wave {N}/{M} plan {plan_id} starting ({P}/{Q} plans done) + ``` + Pass paths only — executors read files themselves with their fresh context window. For 200k models, this keeps orchestrator context lean (~10-15%). For 1M+ models (Opus 4.6, Sonnet 4.6), richer context can be passed directly. @@ -552,6 +587,16 @@ Execute each selected wave in sequence. Within a wave: parallel if `PARALLELIZAT 4. **Wait for all agents in wave to complete.** + **Plan-complete heartbeat (#2410):** as each executor returns (or is verified + via spot-check below), emit one line — `complete` advances `{P}`, `failed` + and `checkpoint` do not but still warm the stream: + + ``` + [checkpoint] phase {PHASE_NUMBER} wave {N}/{M} plan {plan_id} complete ({P}/{Q} plans done) + [checkpoint] phase {PHASE_NUMBER} wave {N}/{M} plan {plan_id} failed ({P}/{Q} plans done) + [checkpoint] phase {PHASE_NUMBER} wave {N}/{M} plan {plan_id} checkpoint ({P}/{Q} plans done) + ``` + **Completion signal fallback (Copilot and runtimes where Task() may not return):** If a spawned agent does not return a completion signal but appears to have finished @@ -830,6 +875,15 @@ Execute each selected wave in sequence. Within a wave: parallel if `PARALLELIZAT 6. **Report completion — spot-check claims first:** + **Wave-close heartbeat (#2410):** after spot-checks finish (pass or fail), + before the `## Wave {N} Complete` summary, emit as a literal line: + + ``` + [checkpoint] phase {PHASE_NUMBER} wave {N}/{M} complete, {P}/{Q} plans done ({wave_success}/{wave_plan_count} ok) + ``` + + + For each SUMMARY.md: - Verify first 2 files from `key-files.created` exist on disk - Check `git log --oneline --all --grep="{phase}-{plan}"` returns ≥1 commit diff --git a/tests/bug-2410-stream-checkpoint-heartbeats.test.cjs b/tests/bug-2410-stream-checkpoint-heartbeats.test.cjs new file mode 100644 index 00000000..1107cdf7 --- /dev/null +++ b/tests/bug-2410-stream-checkpoint-heartbeats.test.cjs @@ -0,0 +1,173 @@ +/** + * Bug #2410 — /gsd:manager background execute-phase Task fails with + * "Stream idle timeout" on multi-plan phases. + * + * Fix: execute-phase.md instructs the orchestrator to emit `[checkpoint]` + * heartbeat lines at every wave boundary AND every plan boundary so the + * Claude API SSE stream never idles long enough to trigger the platform + * timeout. This test validates the workflow contract that backs that fix. + */ + +const { test, describe } = require('node:test'); +const assert = require('node:assert/strict'); +const fs = require('fs'); +const path = require('path'); + +const WORKFLOW_PATH = path.join( + __dirname, + '..', + 'get-shit-done', + 'workflows', + 'execute-phase.md' +); +const COMMANDS_DOC_PATH = path.join(__dirname, '..', 'docs', 'COMMANDS.md'); + +describe('bug #2410: execute-phase emits checkpoint heartbeats', () => { + const workflow = fs.readFileSync(WORKFLOW_PATH, 'utf-8'); + + test('workflow references the stream idle timeout symptom by name', () => { + assert.ok( + /Stream idle timeout/.test(workflow), + 'workflow should name the API error it is preventing' + ); + assert.ok( + workflow.includes('#2410'), + 'workflow should cite the tracking issue for future maintainers' + ); + }); + + test('workflow defines a [checkpoint] heartbeat line format', () => { + assert.ok( + workflow.includes('[checkpoint]'), + 'workflow should document the [checkpoint] marker prefix' + ); + }); + + test('workflow emits a wave-start heartbeat (A: wave-boundary checkpoint)', () => { + assert.ok( + /\[checkpoint\][^\n]*wave \{N\}\/\{M\} starting/.test(workflow), + 'workflow should emit a wave-start [checkpoint] marker before spawning agents' + ); + }); + + test('workflow emits a wave-complete heartbeat (A: wave-boundary checkpoint)', () => { + assert.ok( + /\[checkpoint\][^\n]*wave \{N\}\/\{M\} complete/.test(workflow), + 'workflow should emit a wave-complete [checkpoint] marker after spot-checks' + ); + }); + + test('workflow emits a plan-start heartbeat (B: plan-boundary checkpoint)', () => { + assert.ok( + /\[checkpoint\][^\n]*plan \{plan_id\} starting/.test(workflow), + 'workflow should emit a plan-start [checkpoint] marker before each Task() dispatch' + ); + }); + + test('workflow emits a plan-complete heartbeat (B: plan-boundary checkpoint)', () => { + assert.ok( + /\[checkpoint\][^\n]*plan \{plan_id\} complete/.test(workflow), + 'workflow should emit a plan-complete [checkpoint] marker after executor returns' + ); + }); + + test('workflow handles plan failure and checkpoint-gate heartbeats too', () => { + assert.ok( + /\[checkpoint\][^\n]*plan \{plan_id\} failed/.test(workflow), + 'workflow should emit a plan-failed [checkpoint] marker on executor error' + ); + assert.ok( + /\[checkpoint\][^\n]*plan \{plan_id\} checkpoint/.test(workflow), + 'workflow should emit a heartbeat when a plan returns a human-gate checkpoint' + ); + }); + + test('heartbeats include a monotonic plans-done counter', () => { + // The {P}/{Q} counter lets grep-based recovery tools reconstruct progress + // from a truncated transcript if the agent dies mid-phase. + assert.ok( + /\{P\}\/\{Q\} plans done/.test(workflow), + 'heartbeats should include a {P}/{Q} phase-wide completed-plan counter' + ); + }); + + test('wave-start heartbeat precedes the "Describe what\'s being built" text', () => { + const describeIdx = workflow.indexOf("Describe what's being built"); + const heartbeatIdx = workflow.indexOf( + '[checkpoint] phase {PHASE_NUMBER} wave {N}/{M} starting' + ); + assert.ok(describeIdx !== -1, 'workflow should still have the describe step'); + assert.ok(heartbeatIdx !== -1, 'wave-start heartbeat template should be present'); + // The instruction to emit the heartbeat appears in step 2, which is the + // step titled "Describe what's being built". The actual sentinel text we + // look for is the inline literal template — it must be emitted BEFORE any + // tool calls in that step. + const step2 = workflow.slice( + describeIdx, + workflow.indexOf('3. **Spawn executor agents', describeIdx) + ); + assert.ok( + step2.includes('[checkpoint]'), + 'step 2 should instruct the orchestrator to emit a [checkpoint] heartbeat' + ); + assert.ok( + /before any further reasoning or spawning/i.test(step2) || + /before any tool call/i.test(step2) || + /no tool call/i.test(step2), + 'step 2 should make clear the heartbeat is an assistant-text line, not a tool call' + ); + }); + + test('plan-start heartbeat is inside the spawn step', () => { + const spawnIdx = workflow.indexOf('3. **Spawn executor agents'); + const waitIdx = workflow.indexOf('4. **Wait for all agents', spawnIdx); + assert.ok(spawnIdx !== -1 && waitIdx !== -1, 'spawn and wait steps must exist'); + const step3 = workflow.slice(spawnIdx, waitIdx); + assert.ok( + /\[checkpoint\][^\n]*plan \{plan_id\} starting/.test(step3), + 'plan-start heartbeat should be emitted inside step 3 (spawn executor agents)' + ); + }); + + test('plan-complete and wave-complete heartbeats are inside the wait/report steps', () => { + const waitIdx = workflow.indexOf('4. **Wait for all agents'); + const hookIdx = workflow.indexOf('5. **Post-wave hook validation', waitIdx); + assert.ok(waitIdx !== -1 && hookIdx !== -1, 'wait + hook steps must exist'); + const step4 = workflow.slice(waitIdx, hookIdx); + assert.ok( + /\[checkpoint\][^\n]*plan \{plan_id\} complete/.test(step4), + 'plan-complete heartbeat should be emitted in step 4 (wait for agents)' + ); + + const reportIdx = workflow.indexOf('6. **Report completion'); + const failureIdx = workflow.indexOf('7. **Handle failures', reportIdx); + assert.ok(reportIdx !== -1 && failureIdx !== -1, 'report + failure steps must exist'); + const step6 = workflow.slice(reportIdx, failureIdx); + assert.ok( + /\[checkpoint\][^\n]*wave \{N\}\/\{M\} complete/.test(step6), + 'wave-complete heartbeat should be emitted in step 6 (report completion)' + ); + }); +}); + +describe('bug #2410: checkpoint heartbeat format is user-documented', () => { + const commandsDoc = fs.readFileSync(COMMANDS_DOC_PATH, 'utf-8'); + + test('COMMANDS.md documents the [checkpoint] format under /gsd-manager', () => { + const managerIdx = commandsDoc.indexOf('### `/gsd-manager`'); + assert.ok(managerIdx !== -1, '/gsd-manager section should exist'); + const section = commandsDoc.slice(managerIdx, managerIdx + 4000); + assert.ok( + /\[checkpoint\]/.test(section), + 'COMMANDS.md /gsd-manager section should document [checkpoint] heartbeat markers' + ); + assert.ok( + /Stream idle timeout/i.test(section), + 'COMMANDS.md should explain what the heartbeats prevent' + ); + assert.ok( + /#2410/.test(section), + 'COMMANDS.md should reference the tracking issue' + ); + }); +});